In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Import the cleansed dataset and use Feature selection to identify dummy/target values

In [3]:
## Read cleansed/combined table

file_path = "combined_table.csv"
oscars_df = pd.read_csv(file_path)
oscars_df

Unnamed: 0,budget,release_year,revenue,runtime,title,genres,nominated,winner,category
0,22000000,2003,95708457,121,Underworld,Fantasy,1,1.0,WRITING
1,110000000,2000,215294342,165,The Patriot,Drama,1,0.0,ACTOR
2,379000,1929,4358000,100,The Broadway Melody,Drama,1,0.0,ACTRESS
3,110000000,2000,215294342,165,The Patriot,Drama,1,0.0,ART DIRECTION
4,379000,1929,4358000,100,The Broadway Melody,Drama,1,0.0,DIRECTING
...,...,...,...,...,...,...,...,...,...
8416,250000,1918,8000000,93,Mickey,Comedy,0,,
8417,165000000,2010,752600867,93,Shrek Forever After,Comedy,0,,
8418,6000000,2004,293793,90,Mickey,Drama,0,,
8419,6500000,1991,57504069,112,Boyz n the Hood,Crime,1,,


In [4]:
oscars_df.dtypes

budget            int64
release_year      int64
revenue           int64
runtime           int64
title            object
genres           object
nominated         int64
winner          float64
category         object
dtype: object

In [5]:
# drop unecessary columns
X = pd.get_dummies(oscars_df, columns=["genres","category"]).drop(["budget","release_year","revenue","title","winner","nominated"], axis=1)

## Using the binary outcome "nominated" to create our target value
y = oscars_df["nominated"]


In [6]:
# Check the balance of our target values
y.value_counts()

0    4255
1    4166
Name: nominated, dtype: int64

## Split training/test data using the train_test_split model from sklearn

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## Naive RandomOverSampler Model

In [8]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 3191, 0: 3191})

In [9]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [10]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

## Initialize Balance Accuracy Score to store in list when aggregating summaries
BA_Score = []
BA_Score.append(balanced_accuracy_score(y_test, y_pred))

In [11]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[1036,   28],
       [  39, 1003]], dtype=int64)

In [12]:
## Initialize the confusion matrix to store in list when aggregating
cm_list=[]
cm_list.append(confusion_matrix(y_test, y_pred))

In [13]:
## Print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.97      0.96      0.97      0.97      0.94      1064
          1       0.97      0.96      0.97      0.97      0.97      0.94      1042

avg / total       0.97      0.97      0.97      0.97      0.97      0.94      2106



In [14]:
## Establish summary dataframe
summary_df = pd.DataFrame(columns=['pre', 'rec', 'spe', 'f1', 'geo', 'iba', 'sup'])
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup


In [15]:
#Add classification report to the summary table 
name = 'Naive Random Oversampling'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)

summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0


## Cluster Centroids Undersampling

In [16]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
ccus = ClusterCentroids(random_state=1)
X_resampled, y_resampled = ccus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 3124, 1: 3124})

In [17]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [18]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.9700574011804944

In [19]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1035,   29],
       [  34, 1008]], dtype=int64)

In [20]:
# Add confusion matrix to confusion matrix list
cm_list.append(confusion_matrix(y_test, y_pred))

In [21]:
## Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.97      0.97      0.97      0.97      0.94      1064
          1       0.97      0.97      0.97      0.97      0.97      0.94      1042

avg / total       0.97      0.97      0.97      0.97      0.97      0.94      2106



In [22]:
#Add classification report to summary table 
name = 'Cluster Centroids Undersampling'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0
Cluster Centroids Undersampling,0.97,0.97,0.97,0.97,0.97,0.94,1042.0


## SMOTE Oversampling

In [23]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter

Counter(y_resampled)

Counter({1: 3191, 0: 3191})

In [24]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [25]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.9633296292554804

In [26]:
# Display the SMTOE confusion matrix
confusion_matrix(y_test, y_pred)

array([[1036,   28],
       [  49,  993]], dtype=int64)

In [27]:
# Add SMOTE confusion matrix to a list
cm_list.append(confusion_matrix(y_test, y_pred))

In [28]:
## Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.97      0.95      0.96      0.96      0.93      1064
          1       0.97      0.95      0.97      0.96      0.96      0.93      1042

avg / total       0.96      0.96      0.96      0.96      0.96      0.93      2106



In [29]:
#Add classification report to summary table 
name = 'SMOTE'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0
Cluster Centroids Undersampling,0.97,0.97,0.97,0.97,0.97,0.94,1042.0
SMOTE,0.97,0.95,0.97,0.96,0.96,0.93,1042.0


## Combination (Over and Under) Sampling

In [31]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 3797, 1: 1266})

In [32]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.8755078074264355

In [34]:
# Display the SMTOE confusion matrix
confusion_matrix(y_test, y_pred)

array([[1037,   27],
       [ 233,  809]], dtype=int64)

In [35]:
# Add SMOTE confusion matrix to a list
cm_list.append(confusion_matrix(y_test, y_pred))

In [36]:
#Add classification report to summary table 
name = 'SMOTEENN'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0
Cluster Centroids Undersampling,0.97,0.97,0.97,0.97,0.97,0.94,1042.0
SMOTE,0.97,0.95,0.97,0.96,0.96,0.93,1042.0
SMOTEENN,0.97,0.78,0.97,0.86,0.87,0.74,1042.0


## Random Forest Model

In [37]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
brf_class = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf_class.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [38]:
# Calculated the balanced accuracy score
y_pred = brf_class.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.9714770972536908

In [39]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1037,   27],
       [  33, 1009]], dtype=int64)

In [40]:
# Add Random Forest confusion matrix to cm list
cm_list.append(confusion_matrix(y_test, y_pred))

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.97      0.97      0.97      0.97      0.94      1064
          1       0.97      0.97      0.97      0.97      0.97      0.94      1042

avg / total       0.97      0.97      0.97      0.97      0.97      0.94      2106



In [42]:
# List the features sorted in descending order by feature importance
features_df = pd.DataFrame(sorted(zip(brf_class.feature_importances_, X_test.columns), reverse=True))

features_df.rename(columns={0: "Feature Importance", 1: "Feature"},inplace=True)

features_df

Unnamed: 0,Feature Importance,Feature
0,1.671792e-01,runtime
1,8.353139e-02,category_MUSIC
2,7.305012e-02,category_WRITING
3,4.311900e-02,category_FILM EDITING
4,4.102933e-02,category_BEST PICTURE
...,...,...
59,3.072740e-04,category_OUTSTANDING PICTURE
60,2.587651e-04,category_WRITING
61,1.247866e-04,genres_Family
62,7.703651e-05,genres_Foreign


In [43]:
#Add classification report to summary table 
name = 'Random Forest'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0
Cluster Centroids Undersampling,0.97,0.97,0.97,0.97,0.97,0.94,1042.0
SMOTE,0.97,0.95,0.97,0.96,0.96,0.93,1042.0
SMOTEENN,0.97,0.78,0.97,0.86,0.87,0.74,1042.0
Random Forest,0.97,0.97,0.97,0.97,0.97,0.94,1042.0


## Random Forest using Easy Ensemble Classifier

In [44]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
brf_class = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [45]:
y_pred = brf_class.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9777349443666749

In [46]:
# Calculated the balanced accuracy score
y_pred = brf_class.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.9777349443666749

In [47]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1035,   29],
       [  18, 1024]], dtype=int64)

In [48]:
# Add RF EE Classifier confusion matrix to CM list
cm_list.append(confusion_matrix(y_test, y_pred))

In [49]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.97      0.98      0.98      0.98      0.95      1064
          1       0.97      0.98      0.97      0.98      0.98      0.96      1042

avg / total       0.98      0.98      0.98      0.98      0.98      0.96      2106



In [50]:
#Add classification report to summary table 
name = 'Random Forest w/ Easy Ensemble Classifier '
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0
Cluster Centroids Undersampling,0.97,0.97,0.97,0.97,0.97,0.94,1042.0
SMOTE,0.97,0.95,0.97,0.96,0.96,0.93,1042.0
SMOTEENN,0.97,0.78,0.97,0.86,0.87,0.74,1042.0
Random Forest,0.97,0.97,0.97,0.97,0.97,0.94,1042.0
Random Forest w/ Easy Ensemble Classifier,0.97,0.98,0.97,0.98,0.98,0.96,1042.0


## Summarized Table

In [51]:
#Add BAS and CM lists to the summary Dataframe
summary_df["BA_Score"] = BA_Score
summary_df["BA_Score"] = summary_df["BA_Score"].round(decimals=2)
summary_df["Confusion Matrix"] = cm_list

summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup,BA_Score,Confusion Matrix
Naive Random Oversampling,0.97,0.96,0.97,0.97,0.97,0.94,1042.0,0.97,"[[1036, 28], [39, 1003]]"
Cluster Centroids Undersampling,0.97,0.97,0.97,0.97,0.97,0.94,1042.0,0.97,"[[1035, 29], [34, 1008]]"
SMOTE,0.97,0.95,0.97,0.96,0.96,0.93,1042.0,0.96,"[[1036, 28], [49, 993]]"
SMOTEENN,0.97,0.78,0.97,0.86,0.87,0.74,1042.0,0.88,"[[1037, 27], [233, 809]]"
Random Forest,0.97,0.97,0.97,0.97,0.97,0.94,1042.0,0.97,"[[1037, 27], [33, 1009]]"
Random Forest w/ Easy Ensemble Classifier,0.97,0.98,0.97,0.98,0.98,0.96,1042.0,0.98,"[[1035, 29], [18, 1024]]"


In [53]:
#drop unneccessary columns
summary_df.drop(columns=["geo","iba","sup"],inplace=True)

#reorder the columns of the Summary df 
cols = summary_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
summary_df = summary_df[cols]

summary_df

Unnamed: 0,BA_Score,Confusion Matrix,pre,rec,spe,f1
Naive Random Oversampling,0.97,"[[1036, 28], [39, 1003]]",0.97,0.96,0.97,0.97
Cluster Centroids Undersampling,0.97,"[[1035, 29], [34, 1008]]",0.97,0.97,0.97,0.97
SMOTE,0.96,"[[1036, 28], [49, 993]]",0.97,0.95,0.97,0.96
SMOTEENN,0.88,"[[1037, 27], [233, 809]]",0.97,0.78,0.97,0.86
Random Forest,0.97,"[[1037, 27], [33, 1009]]",0.97,0.97,0.97,0.97
Random Forest w/ Easy Ensemble Classifier,0.98,"[[1035, 29], [18, 1024]]",0.97,0.98,0.97,0.98


In [54]:
#export csvs
summary_df.to_csv("summary.csv")
features_df.to_csv("features.csv")