In [46]:
import pandas as pd
import numpy as np
df = pd.read_csv('/home/jovyan/data1030/data1030-oscars-prediction-project/data/pre_training_data.csv')
print("The shape of the dataset is: ")
print(df.shape)
print("The balance of the dataset is: ")
label = 'Oscar_Best_Picture_won'
y = df[label]
print(y.value_counts()/len(y))
X = df.drop(columns=['movie'])

The shape of the dataset is: 
(1235, 1019)
The balance of the dataset is: 
0    0.985425
1    0.014575
Name: Oscar_Best_Picture_won, dtype: float64


In [47]:
print("Of the features that have missing values, the fraction of missing values is:")
nulls = df.isnull().sum(axis=0)/df.shape[0]
print(nulls[nulls > 0])
print("")
print("The total fraction of missing features in the data set is:")
print(sum(df.isnull().sum(axis=1)!=0)/df.shape[0])
print("")

Of the features that have missing values, the fraction of missing values is:
metascore         0.023482
gross             0.034008
user_reviews      0.011336
critic_reviews    0.008097
popularity        0.109312
dtype: float64

The total fraction of missing features in the data set is:
0.12874493927125505



In [160]:
from sklearn.metrics import accuracy_score, make_scorer, recall_score, \
    precision_score, confusion_matrix, fbeta_score, average_precision_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
import xgboost
import seaborn as sn
from matplotlib import pyplot as plt

num_cols = 'duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_nominations,Oscar_nominated,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Hollywood_Film_won,Hollywood_Film_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated'
num_cols = [x for x in num_cols.split(',')]
mis_cols = df.columns[df.isna().any()].tolist()

np.random.seed(0)

In [87]:
import time
start = time.clock()
random_state = 100
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
# continuous transformer setup
cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
# multivariate imputer setup
impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = RandomForestRegressor(n_estimators=100), random_state=random_state))])
# set up column transformer
preprocessor = ColumnTransformer(
    transformers = [
        ('impute', impute_transformer, mis_cols),
        ('num', cnts_transformer, num_cols)])
# create the pipeline: preprocessor + supervised ML method
pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                    LogisticRegression(penalty = 'l1', solver='liblinear', max_iter = 1000, multi_class = 'auto'))])
X_p = impute_transformer.fit_transform(cnts_transformer.fit_transform(X_other))
lr = LogisticRegression(penalty = 'l1', solver='liblinear', max_iter = 1000, multi_class = 'auto')
lr.fit(X_p, y_other)
lr_pred = lr.predict(impute_transformer.transform(cnts_transformer.transform(X_test)))
r_lr = recall_score(y_true = y_test, y_pred = lr_pred)
p_lr = precision_score(y_true = y_test, y_pred = lr_pred)
f_lr = fbeta_score(y_true = y_test, y_pred = lr_pred, beta = 1)
cm = confusion_matrix(y_true = y_test, y_pred = lr_pred)
print(r_lr)
print(p_lr)
print(f_lr)
print(cm)
stop = time.clock()
print(stop-start)

  


1.0
1.0
1.0
[[243   0]
 [  0   4]]
212.6930900000001




In [76]:
def plot_confusion_matrix(cm):
    data = cm
    df_cm = pd.DataFrame(data, columns=np.unique(y_test), index = np.unique(y_test))
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (10,7))
    sn.set(font_scale=1.4)#for label size
    ax = sn.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16}, fmt='.2f')
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)

## Logistic Regression

In [213]:
def LR_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    X = X.fillna(-999)
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols),
            ('impute', impute_transformer, mis_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        LogisticRegression(penalty = 'l1', solver='liblinear', max_iter = 1000, multi_class = 'auto'))])
    #parameters to tune
    param_grid = {'clf__C': np.logspace(-4,4,20)}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [None]:
lr_ap_scores = []
lr_precision_scores = []
lr_recall_scores = []
lr_f1_scores = []
lr_models = []

print("Training Logistic Regression Model")
for i in range(6):
    lr_grid,lr_ap_score, lr_best_model, lr_best_pred, lr_best_precision, lr_best_recall, lr_best_f1 = LR_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(lr_grid.best_params_)
    print('best CV score:',lr_grid.best_score_)
    print('average precision score:',lr_ap_score)
    print('precision: ', lr_best_precision)
    print('recall: ', lr_best_recall)
    print('f1: ', lr_best_f1)
    lr_ap_scores.append(lr_ap_score)
    lr_precision_scores.append(lr_best_precision)
    lr_recall_scores.append(lr_best_recall)
    lr_f1_scores.append(lr_best_f1)
    lr_models.append(lr_best_model)
    print("________________________________________________________")
print("________________________________________________________")
print('Logistic Regression Results')
print('test average precision :',np.around(np.mean(lr_ap_scores),2),'+/-',np.around(np.std(lr_ap_scores),2))
print('test precision :',np.around(np.mean(lr_precision_scores),2),'+/-',np.around(np.std(lr_precision_scores),2))
print('test recall: ',np.around(np.mean(lr_recall_scores),2),'+/-',np.around(np.std(lr_recall_scores),2))
print('test f1: ',np.around(np.mean(lr_f1_scores),2),'+/-',np.around(np.std(lr_f1_scores),2))

**LOGISTIC REGRESSION RESULTS**

Metric|Mean|St Dev
---|---|---
Average Precision|0.35|0.2
Precision|0.64|0.18
Recall|0.5|0.2
F1|0.54|0.18

Best hyperparameters: `C` $= 1.0$

## Random Forest Model

In [155]:
def RF_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    X = X.fillna(-999)
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols),
            ('impute', impute_transformer, mis_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        RandomForestClassifier(n_estimators = 100, random_state = random_state))])
    #parameters to tune
    param_grid = {'clf__max_depth': np.sort(np.concatenate([[2],  np.logspace(1,2,2)/2 , np.logspace(1,2,2)])),
                 'clf__min_samples_split': np.arange(2,23,5)}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [159]:
rf_ap_scores = []
rf_precision_scores = []
rf_recall_scores = []
rf_f1_scores = []

print("Training Random Forest Model")
for i in range(6):
    print("Iteration", i)
    rf_grid,rf_ap_score, rf_best_model, rf_best_pred, rf_best_precision, rf_best_recall, rf_best_f1 = RF_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(rf_grid.best_params_)
    print('best CV score:',rf_grid.best_score_)
    print('average precision score:',rf_ap_score)
    print('precision: ', rf_best_precision)
    print('recall: ', rf_best_recall)
    print('f1: ', rf_best_f1)
    rf_ap_scores.append(rf_ap_score)
    rf_precision_scores.append(rf_best_precision)
    rf_recall_scores.append(rf_best_recall)
    rf_f1_scores.append(rf_best_f1)
    print("________________________________________________________")
print("________________________________________________________")
print('Random Forest Model Results')
print('test average precision :',np.around(np.mean(rf_ap_scores),2),'+/-',np.around(np.std(rf_ap_scores),2))
print('test precision :',np.around(np.mean(rf_precision_scores),2),'+/-',np.around(np.std(rf_precision_scores),2))
print('test recall: ',np.around(np.mean(rf_recall_scores),2),'+/-',np.around(np.std(rf_recall_scores),2))
print('test f1: ',np.around(np.mean(rf_f1_scores),2),'+/-',np.around(np.std(rf_f1_scores),2))

Training Random Forest Model
Iteration 0
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.8min finished


________________________________________________________
Random State: 0
Best Parameters:
{'clf__max_depth': 5.0, 'clf__min_samples_split': 2}
best CV score: 0.07591093117408906
average precision score: 0.26214574898785425
precision:  1.0
recall:  0.25
f1:  0.4
________________________________________________________
Iteration 1
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.6min finished


________________________________________________________
Random State: 10
Best Parameters:
{'clf__max_depth': 5.0, 'clf__min_samples_split': 2}
best CV score: 0.18454790823211875
average precision score: 0.016194331983805668
precision:  0.0
recall:  0.0
f1:  0.0
________________________________________________________
Iteration 2
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.6min finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


________________________________________________________
Random State: 20
Best Parameters:
{'clf__max_depth': 2.0, 'clf__min_samples_split': 2}
best CV score: 0.01417004048582996
average precision score: 0.016194331983805668
precision:  0.0
recall:  0.0
f1:  0.0
________________________________________________________
Iteration 3
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.5min finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


________________________________________________________
Random State: 30
Best Parameters:
{'clf__max_depth': 5.0, 'clf__min_samples_split': 2}
best CV score: 0.07489878542510121
average precision score: 0.016194331983805668
precision:  0.0
recall:  0.0
f1:  0.0
________________________________________________________
Iteration 4
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.6min finished


________________________________________________________
Random State: 40
Best Parameters:
{'clf__max_depth': 5.0, 'clf__min_samples_split': 2}
best CV score: 0.2601214574898785
average precision score: 0.016194331983805668
precision:  0.0
recall:  0.0
f1:  0.0
________________________________________________________
Iteration 5
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.5min finished


________________________________________________________
Random State: 50
Best Parameters:
{'clf__max_depth': 5.0, 'clf__min_samples_split': 2}
best CV score: 0.12651821862348175
average precision score: 0.26214574898785425
precision:  1.0
recall:  0.25
f1:  0.4
________________________________________________________
________________________________________________________
Logistic Regression Results
test average precision : 0.1 +/- 0.12
test precision : 0.33 +/- 0.47
test recall:  0.08 +/- 0.12
test f1:  0.13 +/- 0.19


**RANDOM FOREST RESULTS**

Metric|Mean|St Dev
---|---|---
Average Precision|0.1|0.12
Precision|0.33|0.47
Recall|0.08|0.12
F1|0.13|0.19

Best hyperparameters: `max_depth` $= 5.0$, `min_samples_split` $= 2.0$

## XGBoost

In [197]:
from xgboost import XGBClassifier
def XGB_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    X = X.fillna(-999)
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols),
            ('impute', impute_transformer, mis_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        xgboost.XGBClassifier())])
    #parameters to tune
    param_grid = {'clf__colsample_bytree': [0.75, 1.0], 
                  'clf__max_depth':[4,6,8] , 
                  'clf__min_child_weight': [2,5,10]}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [199]:
xgb_ap_scores = []
xgb_precision_scores = []
xgb_recall_scores = []
xgb_f1_scores = []

print("Training XGBoost Model")
for i in range(6):
    print("Iteration", i)
    xgb_grid,xgb_ap_score, xgb_best_model, xgb_best_pred, xgb_best_precision, xgb_best_recall, xgb_best_f1 = XGB_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(xgb_grid.best_params_)
    print('best CV score:',xgb_grid.best_score_)
    print('average precision score:',xgb_ap_score)
    print('precision: ', xgb_best_precision)
    print('recall: ', xgb_best_recall)
    print('f1: ', xgb_best_f1)
    xgb_ap_scores.append(xgb_ap_score)
    xgb_precision_scores.append(xgb_best_precision)
    xgb_recall_scores.append(xgb_best_recall)
    xgb_f1_scores.append(xgb_best_f1)
    print("________________________________________________________")
print("________________________________________________________")
print('XGBoost Model Results')
print('test average precision :',np.around(np.mean(xgb_ap_scores),2),'+/-',np.around(np.std(xgb_ap_scores),2))
print('test precision :',np.around(np.mean(xgb_precision_scores),2),'+/-',np.around(np.std(xgb_precision_scores),2))
print('test recall: ',np.around(np.mean(xgb_recall_scores),2),'+/-',np.around(np.std(xgb_recall_scores),2))
print('test f1: ',np.around(np.mean(xgb_f1_scores),2),'+/-',np.around(np.std(xgb_f1_scores),2))

Training XGBoost Model
Iteration 0
Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.9min finished


________________________________________________________
Random State: 0
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__max_depth': 4, 'clf__min_child_weight': 2}
best CV score: 0.30802968960863697
average precision score: 0.7540485829959515
precision:  1.0
recall:  0.75
f1:  0.8571428571428571
________________________________________________________
Iteration 1
Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.4min finished


________________________________________________________
Random State: 10
Best Parameters:
{'clf__colsample_bytree': 0.75, 'clf__max_depth': 4, 'clf__min_child_weight': 2}
best CV score: 0.2462887989203779
average precision score: 0.13714574898785425
precision:  0.5
recall:  0.25
f1:  0.3333333333333333
________________________________________________________
Iteration 2
Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.0min finished


________________________________________________________
Random State: 20
Best Parameters:
{'clf__colsample_bytree': 0.75, 'clf__max_depth': 4, 'clf__min_child_weight': 2}
best CV score: 0.2800269905533063
average precision score: 0.13714574898785425
precision:  0.5
recall:  0.25
f1:  0.3333333333333333
________________________________________________________
Iteration 3
Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.3min finished


________________________________________________________
Random State: 30
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__max_depth': 4, 'clf__min_child_weight': 2}
best CV score: 0.2388663967611336
average precision score: 0.26214574898785425
precision:  1.0
recall:  0.25
f1:  0.4
________________________________________________________
Iteration 4
Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.3min finished


________________________________________________________
Random State: 40
Best Parameters:
{'clf__colsample_bytree': 0.75, 'clf__max_depth': 4, 'clf__min_child_weight': 2}
best CV score: 0.45175438596491224
average precision score: 0.016194331983805668
precision:  0.0
recall:  0.0
f1:  0.0
________________________________________________________
Iteration 5
Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.2min finished


________________________________________________________
Random State: 50
Best Parameters:
{'clf__colsample_bytree': 0.75, 'clf__max_depth': 4, 'clf__min_child_weight': 2}
best CV score: 0.29014844804318485
average precision score: 0.5080971659919028
precision:  1.0
recall:  0.5
f1:  0.6666666666666666
________________________________________________________
________________________________________________________
Random Forest Model Results
test average precision : 0.3 +/- 0.25
test precision : 0.67 +/- 0.37
test recall:  0.33 +/- 0.24
test f1:  0.43 +/- 0.27


**XGBoost RESULTS**

Metric|Mean|St Dev
---|---|---
Average Precision|0.3|0.25
Precision|0.67|0.37
Recall|0.33|0.24
F1|0.43|0.27

Best hyperparameters: `colsample_bytree` $= 0.75$, `max_depth` $= 4$, `min_child_weight` $=2$

### XGBoost No Imputation

In [206]:
from xgboost import XGBClassifier
def XGB_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        xgboost.XGBClassifier())])
    #parameters to tune
    param_grid = {'clf__colsample_bytree': [0.5, 0.75, 1.0], 
                  'clf__max_depth':[2,4,6,8] , 
                  'clf__min_child_weight': [2,5,10],
                  'clf__subsample': [0.5, 0.75, 1.0],
                  'clf__learning_rate': [0.01, 0.05,0.1]}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [217]:
xgb2_ap_scores = []
xgb2_precision_scores = []
xgb2_recall_scores = []
xgb2_f1_scores = []
xgb2_models = []

print("Training XGBoost Model")
for i in range(6):
    print("Iteration", i)
    xgb2_grid,xgb2_ap_score, xgb2_best_model, xgb2_best_pred, xgb2_best_precision, xgb2_best_recall, xgb2_best_f1 = XGB_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(xgb2_grid.best_params_)
    print('best CV score:',xgb2_grid.best_score_)
    print('average precision score:',xgb2_ap_score)
    print('precision: ', xgb2_best_precision)
    print('recall: ', xgb2_best_recall)
    print('f1: ', xgb2_best_f1)
    xgb2_ap_scores.append(xgb2_ap_score)
    xgb2_precision_scores.append(xgb2_best_precision)
    xgb2_recall_scores.append(xgb2_best_recall)
    xgb2_f1_scores.append(xgb2_best_f1)
    xgb2_models.append(xgb2_best_model)
    print("________________________________________________________")
print("________________________________________________________")
print('XGBoost No Impute Model Results')
print('test average precision :',np.around(np.mean(xgb2_ap_scores),2),'+/-',np.around(np.std(xgb2_ap_scores),2))
print('test precision :',np.around(np.mean(xgb2_precision_scores),2),'+/-',np.around(np.std(xgb2_precision_scores),2))
print('test recall: ',np.around(np.mean(xgb2_recall_scores),2),'+/-',np.around(np.std(xgb2_recall_scores),2))
print('test f1: ',np.around(np.mean(xgb2_f1_scores),2),'+/-',np.around(np.std(xgb2_f1_scores),2))

Training XGBoost Model
Iteration 0
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   

________________________________________________________
Random State: 0
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.416497975708502
average precision score: 0.45404858299595136
precision:  0.6
recall:  0.75
f1:  0.6666666666666665
________________________________________________________
Iteration 1
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1444s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elap

________________________________________________________
Random State: 10
Best Parameters:
{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.1, 'clf__max_depth': 4, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.328272604588394
average precision score: 0.26214574898785425
precision:  1.0
recall:  0.25
f1:  0.4
________________________________________________________
Iteration 2
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1269s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elap

________________________________________________________
Random State: 20
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.45607287449392714
average precision score: 0.5665485829959515
precision:  0.75
recall:  0.75
f1:  0.75
________________________________________________________
Iteration 3
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1333s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elap

________________________________________________________
Random State: 30
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.37044534412955465
average precision score: 0.45404858299595136
precision:  0.6
recall:  0.75
f1:  0.6666666666666665
________________________________________________________
Iteration 4
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1379s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elap

________________________________________________________
Random State: 40
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.5714237516869095
average precision score: 0.25809716599190285
precision:  0.5
recall:  0.5
f1:  0.5
________________________________________________________
Iteration 5
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1327s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elap

________________________________________________________
Random State: 50
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.5421052631578948
average precision score: 0.34143049932523617
precision:  0.6666666666666666
recall:  0.5
f1:  0.5714285714285715
________________________________________________________
________________________________________________________
XGBoost No Impute Model Results
test average precision : 0.39 +/- 0.11
test precision : 0.69 +/- 0.16
test recall:  0.58 +/- 0.19
test f1:  0.59 +/- 0.12


[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.5min finished


**XGBoost No Impute RESULTS**

Metric|Mean|St Dev
---|---|---
Average Precision|0.39|0.11
Precision|0.69|0.16
Recall|0.58|0.19
F1|0.59|0.12

Best hyperparameters: `colsample_bytree` $= 1.0$, `max_depth` $= 2$, `min_child_weight` $=2$, `learning_rate` $=0.01$, `subsample` $=1.0$.

test model
metrics
confusion matrix, accuracy, precision, recall, AUROC, AU-PR Curve

RandomForestClassification.feature_importances_  
https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

XGBoost.get_score(importance_type = ...)  
https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.get_score

In [221]:
import pickle
pickle.dump(xgb2_models, open('/home/jovyan/data1030/data1030-oscars-prediction-project/results/models.sav', 'wb')) 

XGBoost sources:

https://www.kaggle.com/phunter/xgboost-with-gridsearchcv  
https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost  
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/  
https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

In [196]:
param1 = {'colsample_bytree': 1.0,
 'eta': 0.01,
 'eval_metric': 'mae',
 'max_depth': 10,
 'min_child_weight': 6,
 'objective': 'reg:linear',
 'subsample': 0.8}
param2 = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}
param3 = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
set(param1.keys()).intersection(set(param2.keys())).intersection(set(param3.keys()))

{'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample'}

### SVM Classifier

In [215]:
def SVM_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    X = X.fillna(-999)
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols),
            ('impute', impute_transformer, mis_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', SVC())])
    #parameters to tune
    param_grid = {'clf__C': np.logspace(-4,4,5),
                 'clf__gamma': np.logspace(-4,4,5)}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [216]:
svm_ap_scores = []
svm_precision_scores = []
svm_recall_scores = []
svm_f1_scores = []
svm_models= []

print("Training SVM Model")
for i in range(1):
    print("Iteration", i)
    svm_grid, svm_ap_score, svm_best_model, svm_best_pred, svm_best_precision, svm_best_recall, svm_best_f1 = SVM_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(svm_grid.best_params_)
    print('best CV score:',svm_grid.best_score_)
    print('average precision score:',svm_ap_score)
    print('precision: ', svm_best_precision)
    print('recall: ', svm_best_recall)
    print('f1: ', svm_best_f1)
    svm_ap_scores.append(svm_ap_score)
    svm_precision_scores.append(svm_best_precision)
    svm_recall_scores.append(svm_best_recall)
    svm_f1_scores.append(svm_best_f1)
    svm_models.append(svm_best_model)
    print("________________________________________________________")
print("________________________________________________________")
print('SVM Results')
print('test average precision :',np.around(np.mean(svm_ap_scores),2),'+/-',np.around(np.std(svm_ap_scores),2))
print('test precision :',np.around(np.mean(svm_precision_scores),2),'+/svm_-',np.around(np.std(svm_precision_scores),2))
print('test recall: ',np.around(np.mean(svm_recall_scores),2),'+/-',np.around(np.std(svm_recall_scores),2))
print('test f1: ',np.around(np.mean(svm_f1_scores),2),'+/-',np.around(np.std(svm_f1_scores),2))

Training SVM Model
Iteration 0
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   17.3s
exception calling callback for <Future at 0x7f0519c5cf98 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 309, in __call__
    self.parallel.dispatch_next()
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 731, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/conda/lib/python3.7/site-pack

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}