In [46]:
import pandas as pd
import numpy as np
df = pd.read_csv('/home/jovyan/data1030/data1030-oscars-prediction-project/data/pre_training_data.csv')
print("The shape of the dataset is: ")
print(df.shape)
print("The balance of the dataset is: ")
label = 'Oscar_Best_Picture_won'
y = df[label]
print(y.value_counts()/len(y))
X = df.drop(columns=['movie'])

The shape of the dataset is: 
(1235, 1019)
The balance of the dataset is: 
0    0.985425
1    0.014575
Name: Oscar_Best_Picture_won, dtype: float64


In [47]:
print("Of the features that have missing values, the fraction of missing values is:")
nulls = df.isnull().sum(axis=0)/df.shape[0]
print(nulls[nulls > 0])
print("")
print("The total fraction of missing features in the data set is:")
print(sum(df.isnull().sum(axis=1)!=0)/df.shape[0])
print("")

Of the features that have missing values, the fraction of missing values is:
metascore         0.023482
gross             0.034008
user_reviews      0.011336
critic_reviews    0.008097
popularity        0.109312
dtype: float64

The total fraction of missing features in the data set is:
0.12874493927125505



In [84]:
from sklearn.metrics import accuracy_score, make_scorer, recall_score, \
    precision_score, confusion_matrix, fbeta_score, average_precision_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import seaborn as sn
from matplotlib import pyplot as plt

num_cols = 'duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_nominations,Oscar_nominated,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Hollywood_Film_won,Hollywood_Film_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated'
num_cols = [x for x in num_cols.split(',')]
mis_cols = df.columns[df.isna().any()].tolist()

np.random.seed(0)

In [87]:
import time
start = time.clock()
random_state = 100
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
# continuous transformer setup
cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
# multivariate imputer setup
impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = RandomForestRegressor(n_estimators=100), random_state=random_state))])
# set up column transformer
preprocessor = ColumnTransformer(
    transformers = [
        ('impute', impute_transformer, mis_cols),
        ('num', cnts_transformer, num_cols)])
# create the pipeline: preprocessor + supervised ML method
pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                    LogisticRegression(penalty = 'l1', solver='liblinear', max_iter = 1000, multi_class = 'auto'))])
X_p = impute_transformer.fit_transform(cnts_transformer.fit_transform(X_other))
lr = LogisticRegression(penalty = 'l1', solver='liblinear', max_iter = 1000, multi_class = 'auto')
lr.fit(X_p, y_other)
lr_pred = lr.predict(impute_transformer.transform(cnts_transformer.transform(X_test)))
r_lr = recall_score(y_true = y_test, y_pred = lr_pred)
p_lr = precision_score(y_true = y_test, y_pred = lr_pred)
f_lr = fbeta_score(y_true = y_test, y_pred = lr_pred, beta = 1)
cm = confusion_matrix(y_true = y_test, y_pred = lr_pred)
print(r_lr)
print(p_lr)
print(f_lr)
print(cm)
stop = time.clock()
print(stop-start)

  


1.0
1.0
1.0
[[243   0]
 [  0   4]]
212.6930900000001




In [76]:
def plot_confusion_matrix(cm):
    data = cm
    df_cm = pd.DataFrame(data, columns=np.unique(y_test), index = np.unique(y_test))
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (10,7))
    sn.set(font_scale=1.4)#for label size
    ax = sn.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16}, fmt='.2f')
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)

## Logistic Regression

In [143]:
def LR_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    X = X.fillna(-999)
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols),
            ('impute', impute_transformer, mis_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        LogisticRegression(penalty = 'l1', solver='liblinear', max_iter = 1000, multi_class = 'auto'))])
    #parameters to tune
    param_grid = {'clf__C': np.logspace(-2,4,10)}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [142]:
ap_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

print("Training Logistic Regression Model")
for i in range(6):
    grid,ap_score, best_model, best_pred, best_precision, best_recall, best_f1 = LR_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('average precision score:',ap_score)
    print('precision: ', best_precision)
    print('recall: ', best_recall)
    print('f1: ', best_f1)
    ap_scores.append(ap_score)
    precision_scores.append(best_precision)
    recall_scores.append(best_recall)
    f1_scores.append(best_f1)
    print("________________________________________________________")
print("________________________________________________________")
print('Logistic Regression Results')
print('test average precision :',np.around(np.mean(ap_scores),2),'+/-',np.around(np.std(ap_scores),2))
print('test precision :',np.around(np.mean(precision_scores),2),'+/-',np.around(np.std(precision_scores),2))
print('test recall: ',np.around(np.mean(recall_scores),2),'+/-',np.around(np.std(recall_scores),2))
print('test f1: ',np.around(np.mean(f1_scores),2),'+/-',np.around(np.std(f1_scores),2))

Training Logistic Regression Model
Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]:



________________________________________________________
Random State: 0
Best Parameters:
{'clf__C': 1.0}
best CV score: 0.44797570850202434
average precision score: 0.7540485829959515
precision:  1.0
recall:  0.75
f1:  0.8571428571428571
________________________________________________________
Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   48.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel



________________________________________________________
Random State: 10
Best Parameters:
{'clf__C': 2154.4346900318824}
best CV score: 0.4647435897435897
average precision score: 0.3790485829959514
precision:  0.5
recall:  0.75
f1:  0.6
________________________________________________________
Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.0min
[Parallel



________________________________________________________
Random State: 20
Best Parameters:
{'clf__C': 0.21544346900318834}
best CV score: 0.43016194331983804
average precision score: 0.34143049932523617
precision:  0.6666666666666666
recall:  0.5
f1:  0.5714285714285715
________________________________________________________
Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   56.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   9 tasks  



________________________________________________________
Random State: 30
Best Parameters:
{'clf__C': 2154.4346900318824}
best CV score: 0.3827597840755735
average precision score: 0.09547908232118758
precision:  0.3333333333333333
recall:  0.25
f1:  0.28571428571428575
________________________________________________________
Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done   9 tasks  



________________________________________________________
Random State: 40
Best Parameters:
{'clf__C': 10000.0}
best CV score: 0.506578947368421
average precision score: 0.34143049932523617
precision:  0.6666666666666666
recall:  0.5
f1:  0.5714285714285715
________________________________________________________
Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   56.6s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:



________________________________________________________
Random State: 50
Best Parameters:
{'clf__C': 0.21544346900318834}
best CV score: 0.45006747638326583
average precision score: 0.26214574898785425
precision:  1.0
recall:  0.25
f1:  0.4
________________________________________________________
________________________________________________________
Logistic Regression Results
test average precision : 0.36 +/- 0.2
test precision : 0.69 +/- 0.24
test recall:  0.5 +/- 0.2
test f1:  0.55 +/- 0.18


**LOGISTIC REGRESSION RESULTS**

Metric|Mean|St Dev
---|---|---
Average Precision|0.36|0.2
Precision|0.69|0.24
Recall|0.5|0.2
F1|0.55|0.18


## Random Forest Model

In [155]:
def RF_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds):
    X = X.fillna(-999)
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # continuous transformer setup
    cnts_transformer = Pipeline(steps = [('ss', StandardScaler())])
    # multivariate imputer setup
    impute_transformer = Pipeline(steps = [('mi', IterativeImputer(estimator = 
                                RandomForestRegressor(n_estimators=100), random_state=random_state, missing_values = -999))])
    # set up column transformer
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', cnts_transformer, num_cols),
            ('impute', impute_transformer, mis_cols)])
    # create the pipeline: preprocessor + supervised ML method
    pipe = Pipeline(steps = [('preprocessor', preprocessor), ('clf', 
                        RandomForestClassifier(n_estimators = 100, random_state = random_state))])
    #parameters to tune
    param_grid = {'clf__max_depth': np.sort(np.concatenate([[2],  np.logspace(1,2,2)/2 , np.logspace(1,2,2)])),
                 'clf__min_samples_split': np.arange(2,23,5)}
    #prepare scorers
    scoring = {'recall': make_scorer(recall_score), 'precision' : make_scorer(precision_score), 'ap' : make_scorer(average_precision_score)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring = scoring, refit = 'ap', #scoring = make_scorer(average_precision_score),
                            cv=kf, return_train_score = True,iid=True, verbose=10, n_jobs=-1)
    # do kfold CV on _other
    grid.fit(X_other, y_other)
    best_model = grid.best_estimator_
    best_pred = best_model.predict(X_test)
    best_precision = precision_score(y_test, best_pred) 
    best_recall = recall_score(y_test, best_pred)
    best_f1 = fbeta_score(y_test, best_pred, 1)
    return grid, grid.score(X_test, y_test), best_model, best_pred, best_precision, best_recall, best_f1

In [None]:
ap_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

print("Training Random Forest Model")
for i in range(6):
    print("Iteration", i)
    grid,ap_score, best_model, best_pred, best_precision, best_recall, best_f1 = RF_pipeline_kfold_GridSearchCV(X,y,10*i,4)
    print("________________________________________________________")
    print("Random State:",10*i)
    print("Best Parameters:")
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('average precision score:',ap_score)
    print('precision: ', best_precision)
    print('recall: ', best_recall)
    print('f1: ', best_f1)
    ap_scores.append(ap_score)
    precision_scores.append(best_precision)
    recall_scores.append(best_recall)
    f1_scores.append(best_f1)
    print("________________________________________________________")
print("________________________________________________________")
print('Logistic Regression Results')
print('test average precision :',np.around(np.mean(ap_scores),2),'+/-',np.around(np.std(ap_scores),2))
print('test precision :',np.around(np.mean(precision_scores),2),'+/-',np.around(np.std(precision_scores),2))
print('test recall: ',np.around(np.mean(recall_scores),2),'+/-',np.around(np.std(recall_scores),2))
print('test f1: ',np.around(np.mean(f1_scores),2),'+/-',np.around(np.std(f1_scores),2))

Training Random Forest Model
Iteration 0
Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   33.1s


test model
metrics
confusion matrix, accuracy, precision, recall, AUROC, AU-PR Curve

RandomForestClassification.feature_importances_  
https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

XGBoost.get_score(importance_type = ...)  
https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.get_score