In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import of libraries

In [None]:
import os
import numpy as np
import pandas as pd 

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt 
from pandas.plotting import scatter_matrix

# Scalers
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Data processing, evaluation and model selection
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# Metrics
from sklearn.metrics import mean_squared_error, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report 

# Classification libraries
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# XGBoost
import xgboost as xgb

## Plotting settings

In [None]:
# Notebook style plotting
%matplotlib notebook

%matplotlib inline

# Default colourpalette
color_paletten = sns.color_palette()

# Set theme for plotting with seaborn
theme = sns.set_theme(style='whitegrid', palette=color_paletten)

## Import of data

In [None]:
# Change the working directory to the dataset folder so that files in it
# can be referenced by their bare file name.
os.chdir('/kaggle/input/nasa-asteroids-classification/')

In [None]:
# Read the .csv file into a DataFrame; the path is relative to the current working directory
datan = pd.read_csv('nasa.csv')

## Data exploration

In [None]:
# Overview of the columns: dtypes and non-null counts
datan.info()

In [None]:
# Number of missing values per column
print(datan.isnull().sum())

In [None]:
# First rows of the raw data
datan.head()

In [None]:
# Summary statistics of the columns
datan.describe()

In [None]:
# Summary of the label column
datan['Hazardous'].describe()

In [None]:
# Number of distinct values per column (helps to spot constant and redundant columns)
datan.nunique()

In [None]:
# Distinct values of 'Orbiting Body'
datan['Orbiting Body'].unique()

In [None]:
# Distinct values of 'Equinox'
datan['Equinox'].unique()

In [None]:
# Distinct values of 'Close Approach Date'
datan['Close Approach Date'].unique()

In [None]:
# Distinct values of 'Orbit Determination Date'
datan['Orbit Determination Date'].unique()

## Selection of features

### Dropping redundant features

In [None]:
# 'Neo Reference ID' and 'Name' are redundant as identifiers, so 'Name' is dropped.
# 'Neo Reference ID' stays a regular column here; it is turned into the index
# further down, once the remaining redundant columns have been removed.
# (Calling set_index() without inplace=True or re-assignment would have no effect.)
datan.drop(['Name'], axis=1, inplace=True)

In [None]:
# 'Orbiting Body' and 'Equinox' hold the same value in every row,
# so they carry no information and are removed one after the other.
for constant_col in ('Orbiting Body', 'Equinox'):
    datan.drop(columns=[constant_col], inplace=True)

In [None]:
# The two date columns are awkward to use as numerical features, so they are removed.
for date_col in ('Close Approach Date', 'Orbit Determination Date'):
    datan.drop(columns=[date_col], inplace=True)

In [None]:
# The estimated min/max diameter is given in kilometres, metres, miles and feet.
# One unit is enough: the metre columns are kept, the other units are removed.
for unit in ('KM', 'Miles', 'Feet'):
    for bound in ('min', 'max'):
        datan.drop(columns=['Est Dia in ' + unit + '(' + bound + ')'], inplace=True)

In [None]:
# The relative velocity is given in km/s, km/h and miles/h; only km/s is kept.
for velocity_col in ('Relative Velocity km per hr', 'Miles per hour'):
    datan.drop(columns=[velocity_col], inplace=True)

In [None]:
# The miss distance is given in four units: (Astronomical), (lunar), (kilometers), (miles).
# Their numbers of unique values are similar but not identical, so the columns are
# not exact duplicates. The unit with the most unique values, (Astronomical),
# is kept and the other three are removed.
for distance_unit in ('lunar', 'kilometers', 'miles'):
    datan.drop(columns=['Miss Dist.(' + distance_unit + ')'], inplace=True)

In [None]:
# Use 'Neo Reference ID' as the row index. This moves it out of the columns,
# so the identifier is not picked up as a feature later on.
# (A reset_index() whose result is not assigned does nothing, so none is needed here.)
datan.set_index('Neo Reference ID', inplace=True)

### Selection of X

In [None]:
# Feature matrix: every remaining column except the label column
X = datan.drop(columns=['Hazardous'])
print(X.columns)
X.head()

### Selection of y

In [None]:
# Label vector: the 'Hazardous' column encoded as integers by a LabelEncoder
le = LabelEncoder()
y = le.fit_transform(datan['Hazardous'])
y

## Visualisation - data exploration

### Histograms of features in X

In [None]:
# One histogram per feature, stacked vertically in a single figure
column_count = X.shape[1]

fig, axes = plt.subplots(nrows=column_count, ncols=1, figsize=(12, 4*column_count))

for axis, colname in zip(axes, X.columns):
    sns.histplot(data=X, x=colname, ax=axis)

In [None]:
#grid = sns.PairGrid(data=datan, height = 0.4)

# Map different plots to different sections
#grid = grid.map_upper(sns.pointplot, color = 'darkblue')
#grid = grid.map_lower(sns.kdeplot, cmap = 'Blues', fill=True)
#grid = grid.map_diag(plt.hist, bins = 10, color = 'darkblue', edgecolor = 'b')

### Correlation plot of LINEAR correlations of all used data

In [None]:
# Pairwise linear correlations of all remaining columns (label included)
corr = datan.corr()

# Boolean mask that hides the upper triangle (and the diagonal) of the symmetric matrix
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, axes = plt.subplots(figsize=(16, 14))
fig.tight_layout(pad=12)

colormap = sns.diverging_palette(150, 250, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=colormap, center=0, vmax=1.0,
            square=True, cbar_kws={"shrink": .6}, ax=axes)

# Classification

## Pre-processing

### Parameters

In [None]:
# Splitter shared by all cross-validation cells: 10 shuffled folds.
# The shuffle is seeded (same seed as the train/test split) so that the
# cross-validation scores are reproducible between runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=555)

#from sklearn.utils.class_weight import compute_class_weight
#classes = np.unique(y)
#weight_unbal = compute_class_weight(class_weight = 'balanced', classes = classes, y=y)
#weight_unbal

### Splitting of data into train and test sets

In [None]:
# Hold out 20 % of the rows as test set. The split is seeded for reproducibility
# and stratified on the label, so that the training and test sets keep the same
# hazardous / non-hazardous ratio (the classes are imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=555, stratify=y)

### Creating Objects of scalers

In [None]:
# Feature scalers: min-max normalisation and z-score standardisation.
# Only the features are scaled; the label scalers stay disabled.
scaler_mm_X, scaler_ss_X = MinMaxScaler(), StandardScaler()
#scaler_mm_y = MinMaxScaler()
#scaler_ss_y = StandardScaler()

### Min-max normalization

In [None]:
# Learn the min/max of every feature from the training set only,
# then apply the same transformation to the training and the test set.
scaler_mm_X.fit(X_train)
X_train_n = scaler_mm_X.transform(X_train)
#y_train_n = scaler_mm_y.fit_transform(y_train.reshape(-1, 1))
X_test_n = scaler_mm_X.transform(X_test)
#y_test_n = scaler_mm_y.transform(y_test.reshape(-1, 1))

#y_train_n = y_train_n.ravel()
#y_test_n = y_test_n.ravel()

### Z-score Normalization

In [None]:
# Learn mean and standard deviation of every feature from the training set only,
# then apply the same transformation to the training and the test set.
scaler_ss_X.fit(X_train)
X_train_s = scaler_ss_X.transform(X_train)
#y_train_s = scaler_ss_y.fit_transform(y_train.reshape(-1, 1))
X_test_s = scaler_ss_X.transform(X_test)
#y_test_s = scaler_ss_y.transform(y_test.reshape(-1, 1))

#y_train_s = y_train_s.ravel()
#y_test_s = y_test_s.ravel()

## Train and Evaluate models

### Naive Bayes classification

#### Fit on training data and predict on test data

In [None]:
#nb_classifier = CategoricalNB()

#nb_classifier.fit(X_train_s, y_train)

#y_hat_nb = nb_classifier.predict(X_test)

# !Failed due to not enough memory available

#### Classification report

In [None]:
#class_report = classification_report(y_test, y_hat_knn, output_dict=True)
#pd.DataFrame(class_report)

#### Confusion matrix

In [None]:
#plt.rcParams['figure.figsize'] = [6, 6]

#fig, ax = plt.subplots()

#cm = confusion_matrix(y_test, y_hat_nb)

#sns.heatmap(cm, annot=True, cmap='Blues', fmt='d'); 

#ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
#ax.set_title('Confusion Matrix'); 
#ax.xaxis.set_ticklabels(['Non-Hazardous', 'Hazardous']); ax.yaxis.set_ticklabels(['Non-Hazardous', 'Hazardous']);

#plt.show()

#### Cross-validation

In [None]:
#nb_score = cross_val_score(nb_classifier, X_test_s, y_test, cv=kfold, scoring = 'f1_weighted')
#nb_scores = nb_score.mean()
#nb_scores

### K-nearest neighbors Classification

#### Fit on training data and predict on test data

In [None]:
# k-nearest-neighbours classifier trained on the standardised training data;
# predictions are made for the standardised test data.
n_neigh = 3

knn_classifier = KNeighborsClassifier(n_neighbors=n_neigh).fit(X_train_s, y_train)
y_hat_knn = knn_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the KNN test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_knn, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the KNN test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_knn)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the KNN model per cross-validation fold on the standardised
# training data; the mean over the folds is the reported score.
knn_score = cross_val_score(estimator=knn_classifier, X=X_train_s, y=y_train,
                            scoring='f1_weighted', cv=kfold)
knn_scores = np.mean(knn_score)
knn_scores

### Logistic regression Classification

#### Fit on training data and predict on test data

In [None]:
# Logistic regression with balanced class weights, trained on the standardised
# training data; predictions are made for the standardised test data.
lr_classifier = LogisticRegression(max_iter=200, class_weight='balanced').fit(X_train_s, y_train)
y_hat_lr = lr_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the logistic-regression test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_lr, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the logistic-regression test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_lr)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the logistic regression per cross-validation fold on the
# standardised training data; the mean over the folds is the reported score.
lr_score = cross_val_score(estimator=lr_classifier, X=X_train_s, y=y_train,
                           scoring='f1_weighted', cv=kfold)
lr_scores = np.mean(lr_score)
lr_scores

### C-support vector Classification

#### Fit on training data and predict on test data

In [None]:
# Support vector classifier with balanced class weights, trained on the standardised
# training data; predictions are made for the standardised test data.
svm_classifier = SVC(class_weight='balanced').fit(X_train_s, y_train)
y_hat_svm = svm_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the SVM test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_svm, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the SVM test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_svm)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the SVM per cross-validation fold on the standardised
# training data; the mean over the folds is the reported score.
svm_score = cross_val_score(estimator=svm_classifier, X=X_train_s, y=y_train,
                            scoring='f1_weighted', cv=kfold)
svm_scores = np.mean(svm_score)
svm_scores

### Decision tree Classification

#### Fit on training data and predict on test data

In [None]:
# Decision tree without depth limit and with balanced class weights, trained on the
# standardised training data; predictions are made for the standardised test data.
# Note: class_weight = weight_unbal gave a worse cross-validation score.
dt_classifier = DecisionTreeClassifier(class_weight='balanced', max_depth=None)
dt_classifier.fit(X_train_s, y_train)
y_hat_dt = dt_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the decision-tree test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_dt, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the decision-tree test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_dt)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the decision tree per cross-validation fold on the standardised
# training data; the mean over the folds is the reported score.
dt_score = cross_val_score(estimator=dt_classifier, X=X_train_s, y=y_train,
                           scoring='f1_weighted', cv=kfold)
dt_scores = np.mean(dt_score)
dt_scores

### Random forest Classification

#### Fit on training data and predict on test data

In [None]:
# Random forest (100 unlimited-depth trees, balanced class weights), trained on the
# standardised training data; predictions are made for the standardised test data.
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, class_weight='balanced')
rf_classifier.fit(X_train_s, y_train)
y_hat_rf = rf_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the random-forest test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_rf, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the random-forest test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_rf)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the random forest per cross-validation fold on the standardised
# training data; the mean over the folds is the reported score.
rf_score = cross_val_score(estimator=rf_classifier, X=X_train_s, y=y_train,
                           scoring='f1_weighted', cv=kfold)
rf_scores = np.mean(rf_score)
rf_scores

### Extra trees Classifier 

#### Fit on training data and predict on test data

In [None]:
# Extra-trees ensemble (100 unlimited-depth trees, balanced class weights), trained on the
# standardised training data; predictions are made for the standardised test data.
et_classifier = ExtraTreesClassifier(n_estimators=100, max_depth=None, class_weight='balanced')
et_classifier.fit(X_train_s, y_train)
y_hat_et = et_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the extra-trees test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_et, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the extra-trees test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_et)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the extra-trees model per cross-validation fold on the standardised
# training data; the mean over the folds is the reported score.
et_score = cross_val_score(estimator=et_classifier, X=X_train_s, y=y_train,
                           scoring='f1_weighted', cv=kfold)
et_scores = np.mean(et_score)
et_scores

### Gradient boosting Classifier

#### Fit on training data and predict on test data

In [None]:
# Gradient boosting with 100 stages, trained on the standardised training data;
# predictions are made for the standardised test data.
# NOTE(review): max_depth=None removes the depth limit of the individual trees -
# confirm that this is intended for a boosting model.
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=None)
gb_classifier.fit(X_train_s, y_train)
y_hat_gb = gb_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the gradient-boosting test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_gb, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the gradient-boosting test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_gb)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the gradient-boosting model per cross-validation fold on the
# standardised training data; the mean over the folds is the reported score.
gb_score = cross_val_score(estimator=gb_classifier, X=X_train_s, y=y_train,
                           scoring="f1_weighted", cv=kfold)
gb_scores = np.mean(gb_score)
gb_scores

### AdaBoost Classifier

#### Fit on training data and predict on test data

In [None]:
# AdaBoost with 100 estimators, trained on the standardised training data;
# predictions are made for the standardised test data.
ab_classifier = AdaBoostClassifier(n_estimators=100).fit(X_train_s, y_train)
y_hat_ab = ab_classifier.predict(X_test_s)

#### Classification report

In [None]:
# Classification report of the AdaBoost test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_ab, output_dict=True)
pd.DataFrame.from_dict(class_report)

#### Confusion matrix

In [None]:
# Confusion matrix of the AdaBoost test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_ab)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

#### Cross-validation

In [None]:
# Weighted-F1 of the AdaBoost model per cross-validation fold on the standardised
# training data; the mean over the folds is the reported score.
ab_score = cross_val_score(estimator=ab_classifier, X=X_train_s, y=y_train,
                           scoring='f1_weighted', cv=kfold)
ab_scores = np.mean(ab_score)
ab_scores

## XGBOOST

### Initialize XGBoost classifier object

In [None]:
# Log the installed XGBoost version (accepted parameters differ between versions)
print(xgb.__version__)

# Dictionary with default parameters, not used
param = {'base_score': 0.5,
         'booster': 'gbtree',
         'colsample_bylevel': 1,
         'colsample_bynode': 1,
         'colsample_bytree': 1,
         'gamma': 0,
         'grow_policy': 'depthwise',
         'learning_rate': 0.1,
         'max_delta_step': 0,
         'max_depth': None,
         'min_child_weight': 1,
         'missing': 'nan',
         'n_estimators': 100,
         'num_parallel_tree': 1,
         'objective': 'binary:logistic',
         'process_type': 'default',
         'predictor': 'auto',
         'reg_alpha': 0,
         'reg_lambda': 1,
         'sampling_method': 'uniform',
         'sketch_eps' : 0.03,
         'scale_pos_weight': 1,
         'seed': 0,
         'silent': 1,
         'subsample': 1,
         'tree_method': 'auto',
         'updater': 'grow_colmaker,prune'
        }


# Weighting for the imbalanced labels:
#   scale_pos_weight = total_negative_examples / total_positive_examples
# The counts are taken from the training labels (0 = negative, 1 = positive after
# label encoding) instead of being hard-coded, so the weight always matches the
# data the model is actually fitted on.
n_negative = np.count_nonzero(y_train == 0)
n_positive = np.count_nonzero(y_train == 1)
weight_unbal_xgb = n_negative / n_positive

# Creation of classification object; the commented-out arguments are left at their defaults
xgb_classifier = xgb.XGBClassifier(#eta=0.3, 
                                   objective='binary:logistic',
                                   #num_class = 2,
                                   use_label_encoder=False, 
                                   #max_depth=4, 
                                   #colsample_bytree=0.8,
                                   #subsample=0.8,
                                   scale_pos_weight=weight_unbal_xgb,
                                   #gamma=0,
                                   #reg_lambda=1,
                                   #reg_alpha=0,
                                   #n_estimators=100,
                                   verbosity = 0
                                   )
xgb_classifier

### Cross validation

In [None]:
# Baseline for the untuned XGBoost model: weighted-F1 per cross-validation fold
# on the (unscaled) training data, averaged over the folds.
xgb_score = cross_val_score(estimator=xgb_classifier, X=X_train, y=y_train,
                            scoring='f1_weighted', cv=kfold)
xgb_scores = np.mean(xgb_score)
xgb_scores

### GRID SEARCH

In [None]:
# Splitter for the grid search: 5 shuffled folds, seeded so that the search
# result is reproducible between runs.
kfold2 = KFold(n_splits=5, shuffle=True, random_state=555)

# Parameters and values to grid search over
param_grid = [
    {'eta': [0.01, 0.1, 0.3], 'max_depth': [2, 3, 4, 5, 6], 'n_estimators': [100, 200, 400]},
]

# Grid-search object with cross-validated evaluation, scored by negative log loss
grid_search = GridSearchCV(xgb_classifier, param_grid, cv=kfold2,
                           scoring='neg_log_loss',
                           return_train_score=True,
                           )

# Run the search on the training data
grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the grid search above
grid_search.best_params_

In [None]:
# Print the mean cross-validation test score next to every tested parameter combination
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Keep the best estimator of this search; it is the base estimator of the next grid search
best1 = grid_search.best_estimator_

In [None]:
# Splitter for the grid search: 5 shuffled folds, seeded so that the search
# result is reproducible between runs.
kfold2 = KFold(n_splits=5, shuffle=True, random_state=555)

# Second stage: eta and max_depth are pinned to a single value each (0.1 and 3),
# so only the number of estimators is varied here.
param_grid = [
    {'eta': [0.1], 'max_depth': [3], 'n_estimators': [80, 100, 140]},
]

# Grid search on top of the best estimator of the previous stage
grid_search = GridSearchCV(best1, param_grid, cv=kfold2,
                           scoring='neg_log_loss',
                           return_train_score=True,
                           )

grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the grid search above
grid_search.best_params_

In [None]:
# Print the mean cross-validation test score next to every tested parameter combination
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Keep the best estimator of this search; it is the base estimator of the next grid search
best2 = grid_search.best_estimator_

In [None]:
# Splitter for the grid search: 5 shuffled folds, seeded so that the search
# result is reproducible between runs.
kfold2 = KFold(n_splits=5, shuffle=True, random_state=555)

# Third stage: minimum child weight and row subsampling.
# The min_child_weight candidates form a doubling sequence (0.02 ... 0.16).
# NOTE(review): the last value used to be 0.016, which looks like a typo for 0.16 - confirm.
param_grid = [
    {'min_child_weight' :[0.02, 0.04, 0.08, 0.16], 'subsample': [0.95, 0.975, 1.0]},
]

# Grid search on top of the best estimator of the previous stage
grid_search = GridSearchCV(best2, param_grid, cv=kfold2,
                           scoring='neg_log_loss',
                           return_train_score=True,
                           )

grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the grid search above
grid_search.best_params_

In [None]:
# Print the mean cross-validation test score next to every tested parameter combination
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Keep the best estimator of this search; it is the base estimator of the next grid search
best3 = grid_search.best_estimator_

In [None]:
# Splitter for the grid search: 5 shuffled folds, seeded so that the search
# result is reproducible between runs.
kfold2 = KFold(n_splits=5, shuffle=True, random_state=555)

# Fourth stage: column subsampling per level, per node and per tree.
# The per-node parameter is called 'colsample_bynode'; a misspelled name would not
# be recognised by XGBoost, so that grid dimension would only multiply the run time.
param_grid = [
    {'colsample_bylevel': [0.5, 0.75, 1.0], 'colsample_bynode': [0.5, 0.75, 1.0],
    'colsample_bytree': [0.5, 0.75, 1.0]},
]

# Grid search on top of the best estimator of the previous stage
grid_search = GridSearchCV(best3, param_grid, cv=kfold2,
                           scoring='neg_log_loss',
                           return_train_score=True,
                           )

grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the grid search above
grid_search.best_params_

In [None]:
# Print the mean cross-validation test score next to every tested parameter combination
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Keep the best estimator of this search; it is the base estimator of the next grid search
best4 = grid_search.best_estimator_

In [None]:
# Splitter for the grid search: 5 shuffled folds, seeded so that the search
# result is reproducible between runs.
kfold2 = KFold(n_splits=5, shuffle=True, random_state=555)

# Fifth stage: gamma together with the reg_lambda and reg_alpha regularisation terms
param_grid = [
    {'gamma': [0.05, 0.1, 0.15], 'reg_lambda': [0.025, 0.05, 0.1], 'reg_alpha': [0.05, 0.1, 0.15]}
]

# Grid search on top of the best estimator of the previous stage
grid_search = GridSearchCV(best4, param_grid, cv=kfold2,
                           scoring='neg_log_loss',
                           return_train_score=True,
                           )

grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the grid search above
grid_search.best_params_

In [None]:
# Print the mean cross-validation test score next to every tested parameter combination
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Best estimator of the last grid search; used below as the final XGBoost model
xcb_final = grid_search.best_estimator_

In [None]:
# Show the final XGBoost model
xcb_final

### Fit on training data and predict on test data

In [None]:
# Fit the final XGBoost model on the (unscaled) training data and predict the test set
y_hat_xgb = xcb_final.fit(X_train, y_train).predict(X_test)

### Classification report

In [None]:
# Classification report of the XGBoost test-set predictions, shown as a table
class_report = classification_report(y_true=y_test, y_pred=y_hat_xgb, output_dict=True)
pd.DataFrame.from_dict(class_report)

### Plot boosted tree

In [None]:
# Draw a tree of the final XGBoost model, with node ids, on a large canvas
plt.rcParams.update({'figure.figsize': [14, 14]})
xgb.plot_tree(xcb_final, show_node_id=True)
plt.show()

### Plot importance of features

In [None]:
# Plot the feature importances of the final XGBoost model on a large canvas
plt.rcParams.update({'figure.figsize': [14, 14]})
xgb.plot_importance(xcb_final)
plt.show()

### Plot confusion matrix

In [None]:
# Confusion matrix of the XGBoost test-set predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels).
plt.rcParams['figure.figsize'] = [6, 6]
class_names = ['Non-Hazardous', 'Hazardous']

fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_hat_xgb)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

In [None]:
# Weighted-F1 of the tuned XGBoost model per cross-validation fold on the
# (unscaled) training data; the mean over the folds is the reported score.
xgb_score = cross_val_score(estimator=xcb_final, X=X_train, y=y_train,
                            scoring='f1_weighted', cv=kfold)
xgb_scores = np.mean(xgb_score)
xgb_scores