In [16]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, recall_score, precision_score, \
    confusion_matrix, fbeta_score, average_precision_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
import xgboost
import seaborn as sn
from matplotlib import pyplot as plt

In [18]:
df = pd.read_csv('/home/jovyan/data1030/data1030-oscars-prediction-project/data/pre_training_data.csv')
label = 'Oscar_Best_Picture_won'
y = df[label]
X = df.drop(columns=['movie'])

num_cols = 'duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_nominations,Oscar_nominated,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Hollywood_Film_won,Hollywood_Film_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated'
num_cols = [x for x in num_cols.split(',')]
mis_cols = df.columns[df.isna().any()].tolist()

np.random.seed(0)

In [19]:
mv = pd.read_csv('/home/jovyan/data1030/data1030-oscars-prediction-project/data/movies.csv')
oscars_2019 = list(mv.index[mv['year'] == 2018])
X_2019 = X.loc[oscars_2019]
X_train = X.drop(oscars_2019)
y_train = y.drop(oscars_2019)

In [37]:
from xgboost import XGBClassifier
def XGB_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        xgboost.XGBClassifier())])
    #parameters to tune
    param_grid = {'clf__colsample_bytree': [0.5, 0.75, 1.0], 
                  'clf__max_depth':[2,4,6,8] , 
                  'clf__min_child_weight': [2,5,10],
                  'clf__subsample': [0.5, 0.75, 1.0],
                  'clf__learning_rate': [0.01, 0.05,0.1]}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=1, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [38]:
def train_models(X,y):    
    xgb2_ap_scores = []
    xgb2_precision_scores = []
    xgb2_recall_scores = []
    xgb2_f1_scores = []
    xgb2_models = []

    print("Training XGBoost Model")
    for i in range(6):
        print("Iteration", i)
        xgb2_grid,xgb2_ap_score, xgb2_best_model, xgb2_best_pred, xgb2_best_precision, xgb2_best_recall, xgb2_best_f1 = XGB_pipeline_kfold_GridSearchCV(X,y,10*i,4)
        print("________________________________________________________")
        print("Random State:",10*i)
        print("Best Parameters:")
        print(xgb2_grid.best_params_)
        print('best CV score:',xgb2_grid.best_score_)
        print('average precision score:',xgb2_ap_score)
        print('precision: ', xgb2_best_precision)
        print('recall: ', xgb2_best_recall)
        print('f1: ', xgb2_best_f1)
        xgb2_ap_scores.append(xgb2_ap_score)
        xgb2_precision_scores.append(xgb2_best_precision)
        xgb2_recall_scores.append(xgb2_best_recall)
        xgb2_f1_scores.append(xgb2_best_f1)
        xgb2_models.append(xgb2_best_model)
        print("________________________________________________________")
    print("________________________________________________________")
    print('XGBoost No Impute Model Results')
    print('test average precision :',np.around(np.mean(xgb2_ap_scores),2),'+/-',np.around(np.std(xgb2_ap_scores),2))
    print('test precision :',np.around(np.mean(xgb2_precision_scores),2),'+/-',np.around(np.std(xgb2_precision_scores),2))
    print('test recall: ',np.around(np.mean(xgb2_recall_scores),2),'+/-',np.around(np.std(xgb2_recall_scores),2))
    print('test f1: ',np.around(np.mean(xgb2_f1_scores),2),'+/-',np.around(np.std(xgb2_f1_scores),2))
    return xgb2_models

In [39]:
models = train_models(X_train, y_train)

Training XGBoost Model
Iteration 0
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 1293 out of 1296 | elapsed:  1.3min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.3min finished


________________________________________________________
Random State: 0
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.5643895348837209
average precision score: 0.25843881856540085
precision:  0.5
recall:  0.5
f1:  0.5
________________________________________________________
Iteration 1
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


________________________________________________________
Random State: 10
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.21433814893117217
average precision score: 0.8
precision:  0.8
recall:  1.0
f1:  0.888888888888889
________________________________________________________
Iteration 2
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


________________________________________________________
Random State: 20
Best Parameters:
{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.05, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 0.5}
best CV score: 0.1379492600422833
average precision score: 0.2626582278481013
precision:  1.0
recall:  0.25
f1:  0.4
________________________________________________________
Iteration 3
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


________________________________________________________
Random State: 30
Best Parameters:
{'clf__colsample_bytree': 0.75, 'clf__learning_rate': 0.1, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.2296996124031008
average precision score: 0.13765822784810128
precision:  0.5
recall:  0.25
f1:  0.3333333333333333
________________________________________________________
Iteration 4
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


________________________________________________________
Random State: 40
Best Parameters:
{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.05, 'clf__max_depth': 4, 'clf__min_child_weight': 2, 'clf__subsample': 1.0}
best CV score: 0.4861698379140239
average precision score: 0.13765822784810128
precision:  0.5
recall:  0.25
f1:  0.3333333333333333
________________________________________________________
Iteration 5
Fitting 4 folds for each of 324 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   47.6s


________________________________________________________
Random State: 50
Best Parameters:
{'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.01, 'clf__max_depth': 2, 'clf__min_child_weight': 2, 'clf__subsample': 0.75}
best CV score: 0.6345093375616632
average precision score: 0.13765822784810128
precision:  0.5
recall:  0.25
f1:  0.3333333333333333
________________________________________________________
________________________________________________________
XGBoost No Impute Model Results
test average precision : 0.29 +/- 0.24
test precision : 0.63 +/- 0.2
test recall:  0.42 +/- 0.28
test f1:  0.46 +/- 0.2


[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.2min finished


In [66]:
def predict(models):
    predictions = []
    for pipe in models:
        predictions.append(pipe.predict_proba(X_2019)[:,1])
    predictions = np.asarray(predictions)
    final_preds = np.mean(predictions, axis=0)
    win_score = [np.around(100*x,1) for x in final_preds]
    movies_19 = list(mv['movie'].loc[oscars_2019])
    pred_19 = pd.DataFrame(list(zip(movies_19,win_score)), columns = ['movie', 'win_prob'])
    best_pic_noms = ['Black Panther', 'BlacKkKlansman', 'Bohemian Rhapsody', 'The Favourite', 'Green Book', 'Roma', 'A Star Is Born', 'Vice']
    nom_preds = pred_19[pred_19['movie'].isin(best_pic_noms)]
    nom_preds = nom_preds.sort_values(by='win_prob',ascending=False)
    return nom_preds

In [67]:
predict(models)

Unnamed: 0,movie,win_prob
20,Roma,36.6
14,The Favourite,18.7
4,Black Panther,13.9
1,A Star Is Born,13.3
21,Vice,11.6
32,Green Book,10.8
36,BlacKkKlansman,10.3
3,Bohemian Rhapsody,9.9
