# Imports / Reading Data

In [1]:
import numpy as np
import pandas as pd
import random

import statsmodels.api as sm
from sklearn import tree

from scipy.stats import pearsonr

import sklearn
from sklearn.model_selection import train_test_split

#Self-written helper functions
from helpers.ethnicity import *

In [2]:
# Seed both global random number generators (Python's `random` module and
# NumPy's legacy global generator) so repeated runs give the same results.
random.seed(42)
np.random.seed(42)

In [3]:
# Read the four cached parquet tables in one pass; the unpacking order matches
# the order of the paths below.
main_df, genres_df, languages_df, countries_df = (
    pd.read_parquet(path)
    for path in (
        'cache/data.parquet',
        'cache/genres.parquet',
        'cache/languages.parquet',
        'cache/countries.parquet',
    )
)

In [4]:
# Build the modelling table: a subset of the main table's columns
# ('category' is deliberately left out) placed side by side with the genre,
# language and country tables.
relevant_main_df = main_df[[
    'box_office_revenue',
    'runtime',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_age',
    'oscar_nominated',
    'year',
    'average_rating',
    'number_of_votes',
    'number_of_movies_starred_in',
    'average_box_office_revenue_previous_movies',
]]
modelling_df = pd.concat(
    [relevant_main_df, genres_df, languages_df, countries_df],
    axis="columns",
)

In [5]:
# Drop repeated column names, keeping the first occurrence of each
is_duplicate = modelling_df.columns.duplicated()
duplicate_columns = modelling_df.columns[is_duplicate]  # names of the dropped columns
modelling_df = modelling_df.loc[:, ~is_duplicate]

# Preprocessing Data

In [6]:
# Decode the ethnicity column into one-hot columns (helper from helpers.ethnicity)
modelling_df = decode_ethnicity(modelling_df, one_hot=True)

# Replace the gender column by a single 0/1 indicator (1 when the value is
# "F", 0 for anything else), then cast everything to float for consistency.
modelling_df = (
    modelling_df
    .assign(is_woman=lambda df: (df["actor_gender"] == "F").astype(int))
    .drop(columns=["actor_gender"])
    .astype(float)
)

# Correlations

In [8]:
# Pearson correlation (and its p-value) of every feature with the nomination flag
nomination_correllations = {}
for col in modelling_df.columns:
    if col == 'oscar_nominated':
        continue  # skip the target itself
    corr, p_value = pearsonr(modelling_df[col], modelling_df['oscar_nominated'])
    nomination_correllations[col] = {'correlation': corr, 'p_value': p_value}
correlations_df = pd.DataFrame.from_dict(nomination_correllations).transpose()

# Bonferroni correction: divide the 0.05 level by the number of tests performed
adjusted_p_value = 0.05 / len(correlations_df)
is_significant = correlations_df["p_value"] < adjusted_p_value

# Keep the significant rows, strongest absolute correlation first
significant_correllations_df = correlations_df.loc[is_significant]
significant_correllations_df = significant_correllations_df.sort_values(
    by="correlation", key=abs, ascending=False
)
significant_features = significant_correllations_df.index  # kept for later use
print(f"Nr of significant correllations with being nominated: {len(significant_correllations_df)}")
significant_correllations_df

Nr of significant correllations with being nominated: 55


Unnamed: 0,correlation,p_value
average_rating,0.169251,1.6404e-152
New Hollywood,0.149198,1.408489e-118
year,-0.13741,1.043556e-100
Drama,0.119307,3.252608e-76
runtime,0.094714,1.3961669999999998e-48
Biography,0.076978,1.247714e-32
Comedy,-0.070889,6.481862e-28
Tragedy,0.068748,2.388534e-26
Period piece,0.065439,5.078918e-24
Film adaptation,0.064827,1.329113e-23


# Modelling

In [9]:
def evaluate_predictions(y_test, output):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    output : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The four scores are printed, one per line.
    """
    # zero_division=0 makes the "undefined" case explicit: when a score's
    # denominator is zero (e.g. the model never predicts the positive class)
    # it is reported as 0.0 without scikit-learn's UndefinedMetricWarning.
    print('Accuracy:', sklearn.metrics.accuracy_score(y_test, output))
    print('Precision:', sklearn.metrics.precision_score(y_test, output, zero_division=0))
    print('Recall:', sklearn.metrics.recall_score(y_test, output, zero_division=0))
    print('F1:', sklearn.metrics.f1_score(y_test, output, zero_division=0))

## Being Nominated

### Data preparation

In [10]:
# Target is the nomination flag; every other column is a feature
y = modelling_df['oscar_nominated']
X = modelling_df.drop(columns='oscar_nominated')

In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Columns holding continuous quantities; only these are standardized
continuous_cols = ['box_office_revenue', 'runtime', 'actor_height', 'actor_age', 'year', 'average_rating', 'number_of_votes', 'number_of_movies_starred_in', 'average_box_office_revenue_previous_movies']

# train_test_split hands back selections of X. Take explicit copies so the
# column assignments below modify independent frames (this also avoids
# pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Scaler for features. Fit only to the training data so that no test-set
# statistics leak into the scaling.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(X_train[continuous_cols])

# Transform train and test data according to the scaler (only continuous columns)
X_train[continuous_cols] = scaler.transform(X_train[continuous_cols])
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

# Add constants to train and test set
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# add_constant (default has_constant='skip') adds nothing when a frame already
# holds a non-zero constant column, which could leave the two splits with
# different columns. Fail here with a clear message rather than at predict time.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ after add_constant"

### Logistic Regression

In [13]:
# Fitting on all columns results in a singular matrix, so restrict the features to the significant ones found in the correlation analysis
#X_train = X_train[significant_features]

In [14]:
# Build (but do not fit) a logistic regression on the full training feature set.
# The fit and summary stay commented out: the preceding cell notes that using
# the unfiltered columns results in a singular matrix.
model = sm.Logit(y_train, X_train.astype(float))
#result = model.fit()
#summary = result.summary()

### DecisionTree

In [15]:
# Fit a decision tree with default hyper-parameters (only random_state is set,
# for reproducibility). This rebinds `model`, replacing the Logit object created
# above. X_train still contains the 'const' column added by sm.add_constant.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [17]:
# Report the size of the fitted tree, then score its predictions on the held-out test split
print(f"Trained model depth is {model.get_depth()} ; Number of leaves {model.get_n_leaves()}")
preds = model.predict(X_test)
evaluate_predictions(y_test, preds)

Accuracy: 0.9647281125341172
Precision: 0.22727272727272727
Recall: 0.3125
F1: 0.2631578947368421


With a decision tree we can predict Oscar nominations with strikingly high accuracy. However, the low precision and recall show that correctly identifying the actual nominees (the true positives) remains hard.

## From personal features
Ideally, the Oscars should not discriminate: if nominations can be predicted from an actor's personal features alone, things are not ideal.

### Extract features

In [30]:
# Start from an independent copy of the personal attributes plus the target
personal_features_df = main_df[['actor_gender', 'actor_height', 'actor_ethnicity', 'actor_age', 'oscar_nominated']].copy()

# Decode the ethnicity column into one-hot columns (helper from helpers.ethnicity)
personal_features_df = decode_ethnicity(personal_features_df, one_hot=True)

# Replace the gender column by a single 0/1 indicator (1 when the value is
# "F", 0 for anything else), then cast everything to float.
personal_features_df = (
    personal_features_df
    .assign(is_woman=lambda df: (df["actor_gender"] == "F").astype(int))
    .drop(columns=["actor_gender"])
    .astype(float)
)

# Target is the nomination flag; every other column is a feature
y = personal_features_df['oscar_nominated']
X = personal_features_df.drop(columns='oscar_nominated')

### Modelling

#### Preparing data

In [31]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train_test_split hands back selections of X. Take explicit copies so the
# column assignments below modify independent frames (this also avoids
# pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

continuous_cols = ['actor_height', 'actor_age']
# Scaler for features. Fit only to the training data so that no test-set
# statistics leak into the scaling.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(X_train[continuous_cols])

# Transform train and test data according to the scaler (only continuous columns)
X_train[continuous_cols] = scaler.transform(X_train[continuous_cols])
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

# Add constants to train and test set
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# add_constant (default has_constant='skip') adds nothing when a frame already
# holds a non-zero constant column, which could leave the two splits with
# different columns. Fail here with a clear message rather than at predict time.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ after add_constant"

#### Logistic Regression

In [32]:
# Fit a logistic regression of the nomination flag on the personal features.
# `summary` is stored but not displayed by this cell.
# NOTE(review): the recorded output reports 35 iterations, which equals
# statsmodels' default `maxiter` for Logit.fit — the optimizer may have stopped
# at the iteration cap instead of converging; confirm before interpreting
# the coefficients.
model = sm.Logit(y_train, X_train.astype(float))
result = model.fit()
summary = result.summary()

         Current function value: 0.104665
         Iterations: 35




In [35]:
# Turn the model's predictions for the test split into hard 0/1 labels using
# a 0.5 cut-off, then score them.
predicted_values = result.predict(X_test)
preds = (predicted_values >= 0.5).astype(int)
evaluate_predictions(y_test, preds)

Accuracy: 0.9798446357337812
Precision: 0.0
Recall: 0.0
F1: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic regression is not able to predict Oscar nominations from personal features alone (precision and recall are both 0), which we take to be a good result.

#### Decision Tree

In [36]:
# Fit a decision tree with default hyper-parameters (only random_state is set,
# for reproducibility) on the personal features. This rebinds `model`,
# replacing the Logit model object from the previous section.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [37]:
# Report the size of the fitted tree, then score its predictions on the held-out test split
print(f"Trained model depth is {model.get_depth()} ; Number of leaves {model.get_n_leaves()}")
preds = model.predict(X_test)
evaluate_predictions(y_test, preds)

Trained model depth is 30 ; Number of leaves 1123
Accuracy: 0.9716565190006299
Precision: 0.0
Recall: 0.0
F1: 0.0


Neither is the decision tree.
These poor results make sense: in the earlier correlation analysis, gender was the only personal feature with a relevant correlation with being nominated.

## Predicting oscar categories