# Predictive Models

In [1]:
import numpy as np
import pandas as pd

In [2]:
# full dataset
df = pd.read_csv('data/bone_marrow_processed.csv')

# reduced dataset
red_df = pd.read_csv('data/reduced_data.csv')

In [3]:
df.head()

Unnamed: 0,donor_age,recipient_age,recipient_body_mass,CMV_status,antigen,allel,CD34_x1e6_per_kg,CD3_x1e8_per_kg,CD3_to_CD34_ratio,ANC_recovery,...,tx_post_relapse_no,tx_post_relapse_yes,acute_GvHD_II_III_IV_no,acute_GvHD_II_III_IV_yes,acute_GvHD_III_IV_no,acute_GvHD_III_IV_yes,extensive_chronic_GvHD_no,extensive_chronic_GvHD_yes,relapse_no,relapse_yes
0,-1.289981,-0.062658,-0.041097,1.335283,-0.92276,-0.843864,-0.4745,0.167062,1.33876,1.246508,...,1,0,0,1,0,1,1,0,1,0
1,-1.227878,-1.120972,-0.779836,-1.428901,-0.92276,-0.843864,-0.747563,-1.141966,11.078295,0.23372,...,1,0,0,1,1,0,1,0,0,1
2,-0.857918,-0.629612,-0.636192,0.413889,-0.92276,-0.843864,-0.399661,-1.139332,19.01323,2.596891,...,1,0,0,1,1,0,1,0,0,1
3,0.753104,1.543712,0.728424,-0.507506,-0.92276,-0.843864,-0.772846,-1.21308,29.481647,2.596891,...,1,0,0,1,0,1,1,0,1,0
4,-0.013717,-1.631231,-1.374932,-1.428901,1.083706,0.283297,4.041147,2.187231,3.972255,-0.441471,...,1,0,1,0,1,0,1,0,1,0


In [4]:
red_df.head()

Unnamed: 0,recipient_age,recipient_body_mass,CD3_x1e8_per_kg,PLT_recovery,survival_time,donor_ABO_AB,recipient_rh_minus,recipient_rh_plus,disease_lymphoma,risk_group_high,risk_group_low,acute_GvHD_III_IV_no,acute_GvHD_III_IV_yes,relapse_no,relapse_yes
0,-0.062658,-0.041097,0.167062,0.548219,0.071115,0,0,1,0,1,0,0,1,1,0
1,-1.120972,-0.779836,-1.141966,0.150305,-0.915531,0,0,1,0,0,1,1,0,0,1
2,-0.629612,-0.636192,-1.139332,-0.332877,-0.594517,0,0,1,0,0,1,1,0,0,1
3,1.543712,0.728424,-1.21308,-0.077075,-1.045353,0,0,1,0,0,1,0,1,1,0
4,-1.631231,-1.374932,2.187231,-0.503412,1.303242,0,1,0,0,1,0,1,0,1,0


In [5]:
# split columns into features (X) & result (y)
X = df.drop('survival_status', axis = 1)
y = df['survival_status']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# split train and test data for both datasets

# full dataset
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size=.8)

# reduced dataset
red_X_train, red_X_test, red_y_train, red_y_test = train_test_split(red_df, y, train_size=.8)

## Building  Predictive Models

In order to judge the performance of the models, both accuracy and recall will be looked at. In this context, recall is especially important since we are interested in identifying the highest possible number truly positive (did not survive) cases.

### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

**Using Full Dataset**

L1 Regularization

In [67]:
%%capture

# parameters of logistic regression model
params = {'C':[0.01, 0.05, 0.1, 0.2],
          'solver':['liblinear','saga']}

# grid search
gscv = GridSearchCV(LogisticRegression(penalty='l1', max_iter=300),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [68]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.05, 'solver': 'liblinear'}


In [69]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.50 +- 0.09
recall: 0.80 +- 0.80


L2 Regularization

In [12]:
%%capture

# parameters of logistic regression model
params = {'C':[0.2, 0.3, 0.4],
          'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}

# grid search
gscv = GridSearchCV(LogisticRegression(penalty='l2', max_iter=300),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [13]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.3, 'solver': 'lbfgs'}


In [14]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.80 +- 0.46
recall: 0.75 +- 0.77


ElasticNet

In [70]:
%%capture

# parameters of logistic regression model
params = {'C':[0.01, 0.05, 0.1],
          'l1_ratio':[0.1, 0.2, 0.3]}

# grid search
gscv = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=300),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [71]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.05, 'l1_ratio': 0.3}


In [72]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.93 +- 0.20
recall: 0.95 +- 0.20




**Using Reduced Dataset**

L1 Regularization

In [76]:
%%capture

# parameters of logistic regression model
params = {'C':[0.1, 0.2, 0.3],
          'solver':['liblinear','saga']}

# grid search
gscv = GridSearchCV(LogisticRegression(penalty='l1', max_iter=300),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [77]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.2, 'solver': 'liblinear'}


In [78]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.92 +- 0.13
recall: 1.00 +- 0.00


L2 Regularization

In [85]:
%%capture

# parameters of logistic regression model
params = {'C':[0.05, 0.1, 0.2],
          'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}

# grid search
gscv = GridSearchCV(LogisticRegression(penalty='l2', max_iter=300),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [86]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.1, 'solver': 'lbfgs'}


In [87]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.89 +- 0.11
recall: 0.93 +- 0.27


ElasticNet

In [96]:
%%capture

# parameters of logistic regression model
params = {'C':[0.01, 0.05, 0.1],
          'l1_ratio':[0.05, 0.01, 0.05]}

# grid search
gscv = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=300),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [97]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.05, 'l1_ratio': 0.05}


In [98]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.87 +- 0.02
recall: 0.87 +- 0.33


### Support Vector Machine (SVM)

In [27]:
from sklearn.svm import SVC

**Using Full Dataset**

In [99]:
%%capture

# parameters of svm model
params = {'C':[0.05, 0.1, 0.2],
          'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
          'gamma':['scale', 'auto'],
          'degree':[2, 3, 4] # ignored if kernel is not poly
         }

# grid search
gscv = GridSearchCV(SVC(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [100]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


In [101]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.79 +- 0.33
recall: 0.73 +- 0.55


**Using Reduced Dataset**

In [105]:
%%capture

# parameters of svm model
params = {'C':[0.1, 0.2, 0.3],
          'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
          'gamma':['scale', 'auto'],
          'degree':[2, 3, 4, 5] # ignored if kernel is not poly
         }

# grid search
gscv = GridSearchCV(SVC(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [106]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'C': 0.2, 'degree': 2, 'gamma': 'auto', 'kernel': 'sigmoid'}


In [107]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.61 +- 0.21
recall: 0.13 +- 0.33


### K-Nearest Neighbors (KNN)

In [34]:
from sklearn.neighbors import KNeighborsClassifier

**Using Full Dataset**

In [35]:
%%capture

# parameters of knn model
params = {'n_neighbors':[19, 20, 21, 22, 23],
          'weights':['uniform', 'distance'],
          'metric':['manhattan', 'euclidean', 'cosine']}

# grid search
gscv = GridSearchCV(KNeighborsClassifier(n_jobs=-1),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [36]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'metric': 'cosine', 'n_neighbors': 21, 'weights': 'uniform'}


In [37]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.43 +- 0.34
recall: 0.38 +- 0.47


**Using Reduced Dataset**

In [111]:
%%capture

# parameters of knn model
params = {'n_neighbors':[3, 4, 5, 7],
          'weights':['uniform', 'distance'],
          'metric':['manhattan', 'euclidean', 'cosine']}

# grid search
gscv = GridSearchCV(KNeighborsClassifier(n_jobs=-1),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [112]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'metric': 'cosine', 'n_neighbors': 5, 'weights': 'uniform'}


In [113]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.82 +- 0.21
recall: 0.88 +- 0.29


### Decison Tree Classifier (DTC)

In [44]:
from sklearn.tree import DecisionTreeClassifier

**Using Full Dataset**

In [45]:
%%capture

# parameters of dtc model
params = {'criterion':['gini', 'entropy', 'log_loss'],
          'splitter':['best', 'random'],
          'max_depth':[4, 5, 6, 7, 8]}

# grid search
gscv = GridSearchCV(DecisionTreeClassifier(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [46]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'criterion': 'log_loss', 'max_depth': 6, 'splitter': 'random'}


In [47]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.77 +- 0.17
recall: 0.85 +- 0.40


**Using Reduced Dataset**

In [48]:
%%capture

# parameters of dtc model
params = {'criterion':['gini', 'entropy', 'log_loss'],
          'splitter':['best', 'random'],
          'max_depth':[2, 3, 4, 5, 6]}

# grid search
gscv = GridSearchCV(DecisionTreeClassifier(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [49]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'criterion': 'gini', 'max_depth': 2, 'splitter': 'best'}


In [50]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.87 +- 0.22
recall: 0.85 +- 0.40


### Random Forest Classifier

In [51]:
from sklearn.ensemble import RandomForestClassifier

**Using Full Dataset**

In [52]:
%%capture

# parameters of rfc model
params = {'n_estimators':[60, 70, 80, 90, 100],
          'criterion':['gini', 'entropy', 'log_loss'],
          'max_depth':[4, 6, 8, 10, 12]}

# grid search
gscv = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [53]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 80}


In [54]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.69 +- 0.62
recall: 0.65 +- 0.75


**Using Reduced Dataset**

In [55]:
%%capture

# parameters of rfc model
params = {'n_estimators':[40, 50, 60, 70],
          'criterion':['gini', 'entropy', 'log_loss'],
          'max_depth':[2, 4, 6, 8, 10]}

# grid search
gscv = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [56]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50}


In [57]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.93 +- 0.20
recall: 1.00 +- 0.00


### Gradient Boosting Classifier (GBC)

In [114]:
from sklearn.ensemble import GradientBoostingClassifier

**Using Full Dataset**

In [120]:
%%capture

# parameters of gbc model
params = {'learning_rate':[0.05, 0.1, 0.2, 0.3],
          'n_estimators':[30, 40, 50, 60],
          'max_depth':[2, 4, 6, 8]}

# grid search
gscv = GridSearchCV(GradientBoostingClassifier(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [121]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 60}


In [122]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.85 +- 0.29
recall: 0.80 +- 0.37


**Using Reduced Dataset**

In [216]:
%%capture

# parameters of gbc model
params = {'learning_rate':[0.005, 0.01, 0.05, 0.1],
          'n_estimators':[30, 40, 50, 60],
          'max_depth':[2, 4, 6, 8]}

# grid search
gscv = GridSearchCV(GradientBoostingClassifier(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [217]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 50}


In [218]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.84 +- 0.09
recall: 0.91 +- 0.22


### Extreme Gradient Boosting (XGB)

In [62]:
from xgboost import XGBClassifier

**Using Full Dataset**

In [221]:
%%capture

# parameters of xgb model
params = {'eta':[0.05, 0.1, 0.2, 0.4],
          'gamma':[1, 2, 3],
          'max_depth': [2, 3, 4, 5],
          'min_child_weight': [1, 2, 3]}

# grid search
gscv = GridSearchCV(XGBClassifier(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(X_train, y_train)

In [222]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'eta': 0.1, 'gamma': 2, 'max_depth': 3, 'min_child_weight': 2}


In [223]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,X_test,y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.95 +- 0.13
recall: 0.93 +- 0.27


**Using Reduced Dataset**

In [224]:
%%capture

# parameters of xgb model
params = {'eta':[0.05, 0.1, 0.15],
          'gamma':[1, 1.5, 2],
          'max_depth': [2, 3, 4],
          'min_child_weight': [1, 2, 3]}

# grid search
gscv = GridSearchCV(XGBClassifier(),
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=True)

# fit mdodel with full dataset
gscv.fit(red_X_train, red_y_train)

In [225]:
# what are the parameters of the best model?
print(gscv.best_params_)

{'eta': 0.1, 'gamma': 1.5, 'max_depth': 3, 'min_child_weight': 1}


In [226]:
# cross-val score of the best model
scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5)
rc_scores = cross_val_score(gscv.best_estimator_,red_X_test,red_y_test,cv=5, scoring='recall')

print(f'accuracy: {np.mean(scores):0.2f} +- {2*np.std(scores):0.2f}')
print(f'recall: {np.mean(rc_scores):0.2f} +- {2*np.std(rc_scores):0.2f}')

accuracy: 0.89 +- 0.11
recall: 0.91 +- 0.22
