In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Import the cleansed dataset and use Feature selection to identify dummy/target values

In [3]:
## Read cleansed/combined table

file_path = "Resources/Combination_Table.csv"
oscars_df = pd.read_csv(file_path)
oscars_df

Unnamed: 0,release_year,title,genres,nominated,budget_ranges,revenue_ranges,runtime_ranges,winner
0,2003,Underworld,Fantasy,1,Medium,Medium,High,1
1,2000,The Patriot,Drama,1,High,High,High,0
2,1929,The Broadway Melody,Drama,1,Low,Low,Medium,0
3,2000,The Patriot,Drama,1,High,High,High,0
4,1929,The Broadway Melody,Drama,1,Low,Low,Medium,0
...,...,...,...,...,...,...,...,...
4259,1982,Parasite,Horror,1,Low,Low,Low,1
4260,1982,Parasite,Horror,1,Low,Low,Low,1
4261,1982,Parasite,Horror,1,Low,Low,Low,0
4262,1994,The Lion King,Family,1,High,High,Low,0


In [4]:
oscars_df.dtypes

release_year       int64
title             object
genres            object
nominated          int64
budget_ranges     object
revenue_ranges    object
runtime_ranges    object
winner             int64
dtype: object

In [5]:
# drop unecessary columns
X = pd.get_dummies(oscars_df, columns=["genres","budget_ranges","revenue_ranges","runtime_ranges"]).drop(["release_year","title","winner","nominated"], axis=1)

## Using the binary outcome "nominated" to create our target value
y = oscars_df["nominated"]


In [6]:
oscars_df

Unnamed: 0,release_year,title,genres,nominated,budget_ranges,revenue_ranges,runtime_ranges,winner
0,2003,Underworld,Fantasy,1,Medium,Medium,High,1
1,2000,The Patriot,Drama,1,High,High,High,0
2,1929,The Broadway Melody,Drama,1,Low,Low,Medium,0
3,2000,The Patriot,Drama,1,High,High,High,0
4,1929,The Broadway Melody,Drama,1,Low,Low,Medium,0
...,...,...,...,...,...,...,...,...
4259,1982,Parasite,Horror,1,Low,Low,Low,1
4260,1982,Parasite,Horror,1,Low,Low,Low,1
4261,1982,Parasite,Horror,1,Low,Low,Low,0
4262,1994,The Lion King,Family,1,High,High,Low,0


In [7]:
# Check the balance of our target values
y.value_counts()

1    4147
0     117
Name: nominated, dtype: int64

## Split training/test data using the train_test_split model from sklearn

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## Naive RandomOverSampler Model

In [9]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 3110, 0: 3110})

In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

## Initialize Balance Accuracy Score to store in list when aggregating summaries
BA_Score = []
BA_Score.append(balanced_accuracy_score(y_test, y_pred))

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[ 18,  11],
       [290, 747]], dtype=int64)

In [13]:
## Initialize the confusion matrix to store in list when aggregating
cm_list=[]
cm_list.append(confusion_matrix(y_test, y_pred))

In [14]:
## Print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.62      0.72      0.11      0.67      0.44        29
          1       0.99      0.72      0.62      0.83      0.67      0.45      1037

avg / total       0.96      0.72      0.62      0.81      0.67      0.45      1066



In [15]:
## Establish summary dataframe
summary_df = pd.DataFrame(columns=['pre', 'rec', 'spe', 'f1', 'geo', 'iba', 'sup'])
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup


In [16]:
#Add classification report to the summary table 
name = 'Naive Random Oversampling'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)

summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0


## Cluster Centroids Undersampling

In [17]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
ccus = ClusterCentroids(random_state=1)
X_resampled, y_resampled = ccus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 88, 1: 88})

In [18]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.6622385528547201

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 27,   2],
       [629, 408]], dtype=int64)

In [21]:
# Add confusion matrix to confusion matrix list
cm_list.append(confusion_matrix(y_test, y_pred))

In [22]:
## Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.04      0.93      0.39      0.08      0.61      0.39        29
          1       1.00      0.39      0.93      0.56      0.61      0.35      1037

avg / total       0.97      0.41      0.92      0.55      0.61      0.35      1066



In [23]:
#Add classification report to summary table 
name = 'Cluster Centroids Undersampling'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0
Cluster Centroids Undersampling,1.0,0.39,0.93,0.56,0.61,0.35,1037.0


## SMOTE Oversampling

In [24]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter

Counter(y_resampled)

Counter({1: 3110, 0: 3110})

In [25]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [26]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.5662055664549596

In [27]:
# Display the SMTOE confusion matrix
confusion_matrix(y_test, y_pred)

array([[  7,  22],
       [113, 924]], dtype=int64)

In [28]:
# Add SMOTE confusion matrix to a list
cm_list.append(confusion_matrix(y_test, y_pred))

In [29]:
## Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.24      0.89      0.09      0.46      0.20        29
          1       0.98      0.89      0.24      0.93      0.46      0.23      1037

avg / total       0.95      0.87      0.26      0.91      0.46      0.23      1066



In [30]:
#Add classification report to summary table 
name = 'SMOTE'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0
Cluster Centroids Undersampling,1.0,0.39,0.93,0.56,0.61,0.35,1037.0
SMOTE,0.98,0.89,0.24,0.93,0.46,0.23,1037.0


## Combination (Over and Under) Sampling

In [31]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 2895, 1: 2646})

In [32]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.6299837063146344

In [34]:
# Display the SMTOE confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 10,  19],
       [ 88, 949]], dtype=int64)

In [35]:
# Add SMOTE confusion matrix to a list
cm_list.append(confusion_matrix(y_test, y_pred))

In [36]:
#Add classification report to summary table 
name = 'SMOTEENN'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0
Cluster Centroids Undersampling,1.0,0.39,0.93,0.56,0.61,0.35,1037.0
SMOTE,0.98,0.89,0.24,0.93,0.46,0.23,1037.0
SMOTEENN,0.98,0.92,0.34,0.95,0.56,0.33,1037.0


## Random Forest Model

In [37]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
brf_class = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf_class.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [38]:
# Calculated the balanced accuracy score
y_pred = brf_class.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.7780567286270076

In [39]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 22,   7],
       [210, 827]], dtype=int64)

In [40]:
# Add Random Forest confusion matrix to cm list
cm_list.append(confusion_matrix(y_test, y_pred))

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.09      0.76      0.80      0.17      0.78      0.60        29
          1       0.99      0.80      0.76      0.88      0.78      0.61      1037

avg / total       0.97      0.80      0.76      0.86      0.78      0.61      1066



In [42]:
# List the features sorted in descending order by feature importance
features_df = pd.DataFrame(sorted(zip(brf_class.feature_importances_, X_test.columns), reverse=True))

features_df.rename(columns={0: "Feature Importance", 1: "Feature"},inplace=True)

features_df

Unnamed: 0,Feature Importance,Feature
0,0.081591,budget_ranges_High
1,0.07407,runtime_ranges_High
2,0.069631,genres_Drama
3,0.068414,budget_ranges_Medium
4,0.063684,revenue_ranges_Medium
5,0.062839,genres_Comedy
6,0.061453,runtime_ranges_Medium
7,0.057514,revenue_ranges_Low
8,0.057505,revenue_ranges_High
9,0.055575,budget_ranges_Low


In [43]:
#Add classification report to summary table 
name = 'Random Forest'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0
Cluster Centroids Undersampling,1.0,0.39,0.93,0.56,0.61,0.35,1037.0
SMOTE,0.98,0.89,0.24,0.93,0.46,0.23,1037.0
SMOTEENN,0.98,0.92,0.34,0.95,0.56,0.33,1037.0
Random Forest,0.99,0.8,0.76,0.88,0.78,0.61,1037.0


## Random Forest using Easy Ensemble Classifier

In [44]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
brf_class = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [45]:
y_pred = brf_class.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.66642835766302

In [46]:
# Calculated the balanced accuracy score
y_pred = brf_class.predict(X_test)
BA_Score.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.66642835766302

In [47]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 20,   9],
       [370, 667]], dtype=int64)

In [48]:
# Add RF EE Classifier confusion matrix to CM list
cm_list.append(confusion_matrix(y_test, y_pred))

In [49]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.05      0.69      0.64      0.10      0.67      0.45        29
          1       0.99      0.64      0.69      0.78      0.67      0.44      1037

avg / total       0.96      0.64      0.69      0.76      0.67      0.44      1066



In [50]:
#Add classification report to summary table 
name = 'Random Forest w/ Easy Ensemble Classifier '
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0
Cluster Centroids Undersampling,1.0,0.39,0.93,0.56,0.61,0.35,1037.0
SMOTE,0.98,0.89,0.24,0.93,0.46,0.23,1037.0
SMOTEENN,0.98,0.92,0.34,0.95,0.56,0.33,1037.0
Random Forest,0.99,0.8,0.76,0.88,0.78,0.61,1037.0
Random Forest w/ Easy Ensemble Classifier,0.99,0.64,0.69,0.78,0.67,0.44,1037.0


## Summarized Table

In [51]:
#Add BAS and CM lists to the summary Dataframe
summary_df["BA_Score"] = BA_Score
summary_df["BA_Score"] = summary_df["BA_Score"].round(decimals=2)
summary_df["Confusion Matrix"] = cm_list

summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup,BA_Score,Confusion Matrix
Naive Random Oversampling,0.99,0.72,0.62,0.83,0.67,0.45,1037.0,0.67,"[[18, 11], [290, 747]]"
Cluster Centroids Undersampling,1.0,0.39,0.93,0.56,0.61,0.35,1037.0,0.66,"[[27, 2], [629, 408]]"
SMOTE,0.98,0.89,0.24,0.93,0.46,0.23,1037.0,0.57,"[[7, 22], [113, 924]]"
SMOTEENN,0.98,0.92,0.34,0.95,0.56,0.33,1037.0,0.63,"[[10, 19], [88, 949]]"
Random Forest,0.99,0.8,0.76,0.88,0.78,0.61,1037.0,0.78,"[[22, 7], [210, 827]]"
Random Forest w/ Easy Ensemble Classifier,0.99,0.64,0.69,0.78,0.67,0.44,1037.0,0.67,"[[20, 9], [370, 667]]"


In [52]:
#drop unneccessary columns
summary_df.drop(columns=["geo","iba","sup"],inplace=True)

#reorder the columns of the Summary df 
cols = summary_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
summary_df = summary_df[cols]

summary_df

Unnamed: 0,BA_Score,Confusion Matrix,pre,rec,spe,f1
Naive Random Oversampling,0.67,"[[18, 11], [290, 747]]",0.99,0.72,0.62,0.83
Cluster Centroids Undersampling,0.66,"[[27, 2], [629, 408]]",1.0,0.39,0.93,0.56
SMOTE,0.57,"[[7, 22], [113, 924]]",0.98,0.89,0.24,0.93
SMOTEENN,0.63,"[[10, 19], [88, 949]]",0.98,0.92,0.34,0.95
Random Forest,0.78,"[[22, 7], [210, 827]]",0.99,0.8,0.76,0.88
Random Forest w/ Easy Ensemble Classifier,0.67,"[[20, 9], [370, 667]]",0.99,0.64,0.69,0.78


In [53]:
#export csvs
summary_df.to_csv("summary.csv")
features_df.to_csv("features.csv")