In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stats
import IPython.display as ipy
from ipywidgets import IntProgress
import warnings
import time

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/FutureWarnings raised by the
# cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Opt in to pandas copy-on-write behaviour.
pd.set_option('mode.copy_on_write', True)
plt.style.use('fivethirtyeight')

# Earliest year kept in the analysis; parsed to a Timestamp so it can be
# compared against the parsed "Year" column of the Oscars table.
year_from = pd.to_datetime("2000")
# NOTE(review): not referenced by any of the visible cells.
random_seed = 42

def update_years(value):
    """Parse one ``Year`` entry into a timestamp.

    Entries that contain a slash (e.g. a value like ``"1927/28"``) are not
    parsed and yield ``None``; every other entry is passed to
    ``pd.to_datetime`` and the result is returned.
    """
    return None if "/" in value else pd.to_datetime(value)

# OSCARS

# Load the tab-separated Oscars table and parse the "Year" column.
# update_years() returns None for entries containing "/", which show up as
# missing values; those rows are dropped.
full_oscars = pd.read_csv("data/oscars.csv", sep="\t")
full_oscars["Year"] = full_oscars["Year"].apply(update_years)
oscars = full_oscars.drop(full_oscars[full_oscars["Year"].isna()].index)

# One frame per category, restricted to years from `year_from` onwards.
is_recent = oscars["Year"] >= year_from
oscar_films = oscars[(oscars["Category"] == "BEST PICTURE") & is_recent][["Year", "Film", "Winner"]]
oscar_actresses = oscars[(oscars["Category"] == "ACTRESS IN A LEADING ROLE") & is_recent][["Year", "Name", "Film", "Winner"]]
oscar_actors = oscars[(oscars["Category"] == "ACTOR IN A LEADING ROLE") & is_recent][["Year", "Name", "Film", "Winner"]]

# For best-picture rows the nominee is the film itself, so "Name" is a copy of
# "Film". "Film" is kept as well so all three frames share the same columns.
oscar_films["Name"] = oscar_films["Film"]
oscar_films["Type"] = "film"
oscar_actresses["Type"] = "actress"
oscar_actors["Type"] = "actor"

oscar_dfs = [oscar_films, oscar_actresses, oscar_actors]
all_oscars = pd.concat(oscar_dfs).rename(columns={"Winner": "Oscar-win"})
# .str.strip() returns a new Series; the result must be assigned back,
# otherwise the names are left unstripped.
all_oscars["Name"] = all_oscars["Name"].str.strip()
# Encode the outcome as explicit 0/1 integers here instead of mapping only
# True -> 1 and relying on a later fillna(0) to coerce the rest of the column.
# Assumes "Winner" is True for winners and False/missing otherwise —
# TODO confirm against data/oscars.csv.
all_oscars["Oscar-win"] = (all_oscars["Oscar-win"] == True).astype(int)

# OTHER AWARD CEREMONIES

award_ceremonies = ["Bafta", "Sag", "Gg-dram", "Gg-com"]
award_categories = ["film", "actress", "actor"]
unwanted_columns = ["Director(s)", "Producer(s)", "Country", "Cast members", "Role(s)", "Ref.", "Character", "Director", "Producers", "Producer"]
# Object Series used as an ordered "<ceremony>/<category>" -> DataFrame store.
# Entries are appended category by category, one per ceremony, and later cells
# slice the Series by position, so the insertion order matters.
list_df = pd.Series(dtype=object)

for award_type in award_categories:
    for award_name in award_ceremonies:
        df = pd.read_csv("data/{}-{}.csv".format(award_name.lower(), award_type))

        # "Year" is either an integer column or a string column; for strings
        # only the first four characters are parsed (presumably the year —
        # verify against the CSV files).
        if pd.api.types.is_integer_dtype(df["Year"]):
            df["Year"] = pd.to_datetime(df["Year"], format="%Y")
        else:
            df["Year"] = pd.to_datetime(df["Year"].str[:4])

        # The nominee column is named after the category ("Film", "Actress",
        # "Actor"); normalise it to "Name". Film tables keep a "Film" copy.
        df = df.rename(columns={award_type.capitalize(): "Name"})
        if award_type == "film":
            df["Film"] = df["Name"]

        df["Type"] = award_type
        # .str.strip() returns a new Series, so assign it back.
        df["Name"] = df["Name"].str.strip()

        nom_col = "{}-nom".format(award_name)
        win_col = "{}-win".format(award_name)
        df[nom_col] = 1.0
        # The first row listed for each year is flagged as that year's winner;
        # every other row is left as NaN.
        is_first_of_year = df["Year"].notna() & ~df["Year"].duplicated()
        df[win_col] = np.where(is_first_of_year, 1.0, np.nan)

        # Drop whichever of the unwanted columns this particular file has.
        df = df.drop(columns=unwanted_columns, errors="ignore")

        list_df.loc["{}/{}".format(award_name, award_type)] = df

def merge_list(list_df, output=False):
    """Outer-merge a sequence of DataFrames into a single DataFrame.

    The frames are merged left to right with ``pd.merge(..., how="outer")``.
    No ``on`` is given, so pandas joins on the columns the two frames share.

    Parameters
    ----------
    list_df : pandas.Series or iterable of DataFrame
        Frames to merge, in order. A Series is converted with ``to_list()``.
    output : bool, default False
        When True, display the first row of every frame (except the first)
        just before it is merged. Uses the notebook's ``display``.

    Returns
    -------
    DataFrame
        The merged frame. With a single input frame, that frame is returned
        unchanged.

    Raises
    ------
    IndexError
        If ``list_df`` is empty.
    """
    # isinstance (rather than an exact type comparison) also accepts Series
    # subclasses; any other iterable is materialised into a list.
    if isinstance(list_df, pd.Series):
        frames = list_df.to_list()
    else:
        frames = list(list_df)

    if not frames:
        raise IndexError("merge_list needs at least one DataFrame to merge")

    merged = frames[0]
    for frame in frames[1:]:
        if output:
            display(frame.head(1))
        merged = pd.merge(merged, frame, how="outer")
    return merged

# list_df is filled category by category (film, actress, actor) with one entry
# per ceremony, so each category occupies a consecutive run of
# len(award_ceremonies) entries. Derive the slice bounds from that length
# instead of hard-coding them.
n_ceremonies = len(award_ceremonies)
films_merged = merge_list(list_df.iloc[:n_ceremonies])
actresses_merged = merge_list(list_df.iloc[n_ceremonies:2 * n_ceremonies])
actors_merged = merge_list(list_df.iloc[2 * n_ceremonies:3 * n_ceremonies])
all_other = pd.concat([films_merged, actresses_merged, actors_merged])

# Set to False to leave the "Film" column out of the final table.
show_films = True

# Left join keeps every Oscar row; rows with no match in the other ceremonies
# get missing values, which are filled with 0.
FULL_TABLE = pd.merge(all_oscars, all_other, how="left").fillna(0)
# Move the outcome column to position 4, ahead of the predictor columns.
column_to_move = FULL_TABLE.pop("Oscar-win")
FULL_TABLE.insert(4, "Oscar-win", column_to_move)

if not show_films:
    FULL_TABLE = FULL_TABLE.drop(columns="Film")
FULL_TABLE

Unnamed: 0,Year,Film,Name,Type,Oscar-win,Bafta-nom,Bafta-win,Sag-nom,Sag-win,Gg-dram-nom,Gg-dram-win,Gg-com-nom,Gg-com-win
0,2000-01-01,Chocolat,Chocolat,film,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2000-01-01,"Crouching Tiger, Hidden Dragon","Crouching Tiger, Hidden Dragon",film,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2000-01-01,Erin Brockovich,Erin Brockovich,film,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2000-01-01,Gladiator,Gladiator,film,1,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,2000-01-01,Traffic,Traffic,film,0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,2024-01-01,The Brutalist,Adrien Brody,actor,1,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
437,2024-01-01,A Complete Unknown,Timothée Chalamet,actor,0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
438,2024-01-01,Sing Sing,Colman Domingo,actor,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
439,2024-01-01,Conclave,Ralph Fiennes,actor,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [85]:
# Format FULL_TABLE ready for multiple regression: "-" is an operator in
# statsmodels/patsy formula strings, so replace it with "_" in every column
# name. Done in one vectorised step on the column index (still in place).
FULL_TABLE.columns = FULL_TABLE.columns.str.replace("-", "_", regex=False)

In [86]:
# Runs a multiple regression and returns a summary table
def run_MR(fstring, df):
    """Fit an ordinary-least-squares model and return its summary.

    fstring -- formula string describing the model, passed as ``formula``.
    df      -- data passed as ``data``; it holds the variables the formula names.

    Returns the ``summary()`` of the fitted model.
    """
    fitted_model = stats.formula.ols(formula=fstring, data=df).fit()
    return fitted_model.summary()

In [87]:
# For assessing the p-value of each award ceremony against the Oscars:
# regress Oscar_win on every nomination and win indicator, using all rows
# (films, actresses and actors together).
fstring = "Oscar_win ~ Bafta_nom + Bafta_win + Sag_nom + Sag_win + Gg_dram_nom + Gg_dram_win + Gg_com_nom + Gg_com_win"
print(run_MR(fstring,FULL_TABLE))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.465
Model:                            OLS   Adj. R-squared:                  0.455
Method:                 Least Squares   F-statistic:                     46.92
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           3.82e-54
Time:                        16:27:18   Log-Likelihood:                -56.127
No. Observations:                 441   AIC:                             130.3
Df Residuals:                     432   BIC:                             167.1
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.0193      0.031      0.615      

In [88]:
# Multiple regression on all rows, restricted to the predictors the author
# reports as having p < 0.05 in the full model.
fstring = "Oscar_win ~ Bafta_win + Sag_win + Gg_dram_win"
print(run_MR(fstring,FULL_TABLE))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.459
Model:                            OLS   Adj. R-squared:                  0.455
Method:                 Least Squares   F-statistic:                     123.7
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           5.38e-58
Time:                        16:27:18   Log-Likelihood:                -58.494
No. Observations:                 441   AIC:                             125.0
Df Residuals:                     437   BIC:                             141.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.0303      0.015      1.997      

In [89]:
# Restrict the table to the best-picture rows.
film_table = FULL_TABLE.loc[FULL_TABLE["Type"].eq("film")]

# Full model on the film rows only, to inspect the p-value of each predictor.
fstring = "Oscar_win ~ Bafta_nom + Bafta_win + Sag_nom + Sag_win + Gg_dram_nom + Gg_dram_win + Gg_com_nom + Gg_com_win"
print(run_MR(fstring, film_table))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.303
Model:                            OLS   Adj. R-squared:                  0.272
Method:                 Least Squares   F-statistic:                     9.883
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           2.22e-11
Time:                        16:27:18   Log-Likelihood:                -28.977
No. Observations:                 191   AIC:                             75.95
Df Residuals:                     182   BIC:                             105.2
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -0.0113      0.045     -0.253      

In [113]:
# Running a multiple regression on the predictors where p < 0.05, within the film-only dataset.
fstring = "Oscar_win ~ Bafta_nom + Sag_win"
print(run_MR(fstring,film_table))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.247
Model:                            OLS   Adj. R-squared:                  0.239
Method:                 Least Squares   F-statistic:                     30.78
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           2.73e-12
Time:                        17:34:05   Log-Likelihood:                -36.378
No. Observations:                 191   AIC:                             78.76
Df Residuals:                     188   BIC:                             88.51
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0100      0.032     -0.316      0.7

In [91]:
# Restrict the table to the leading-actor rows.
actor_table = FULL_TABLE.loc[FULL_TABLE["Type"].eq("actor")]

# Full model on the actor rows only, to inspect the p-value of each predictor.
fstring = "Oscar_win ~ Bafta_nom + Bafta_win + Sag_nom + Sag_win + Gg_dram_nom + Gg_dram_win + Gg_com_nom + Gg_com_win"
print(run_MR(fstring, actor_table))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.584
Model:                            OLS   Adj. R-squared:                  0.555
Method:                 Least Squares   F-statistic:                     20.34
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           6.19e-19
Time:                        16:27:18   Log-Likelihood:                -8.0485
No. Observations:                 125   AIC:                             34.10
Df Residuals:                     116   BIC:                             59.55
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -0.0140      0.066     -0.211      

In [114]:
# Running a multiple regression on the predictors where p < 0.05, within the actor-only dataset.
fstring = "Oscar_win ~ Bafta_win + Sag_win + Gg_com_win"
print(run_MR(fstring,actor_table))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.531
Method:                 Least Squares   F-statistic:                     47.84
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           1.85e-20
Time:                        17:34:43   Log-Likelihood:                -13.951
No. Observations:                 125   AIC:                             35.90
Df Residuals:                     121   BIC:                             47.22
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0474      0.029      1.643      0.1

In [93]:
# Restrict the table to the leading-actress rows.
actress_table = FULL_TABLE.loc[FULL_TABLE["Type"].eq("actress")]

# Full model on the actress rows only, to inspect the p-value of each predictor.
fstring = "Oscar_win ~ Bafta_nom + Bafta_win + Sag_nom + Sag_win + Gg_dram_nom + Gg_dram_win + Gg_com_nom + Gg_com_win"
print(run_MR(fstring, actress_table))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.648
Model:                            OLS   Adj. R-squared:                  0.623
Method:                 Least Squares   F-statistic:                     26.66
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           5.27e-23
Time:                        16:27:18   Log-Likelihood:                 2.3774
No. Observations:                 125   AIC:                             13.25
Df Residuals:                     116   BIC:                             38.70
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.1105      0.058      1.902      

In [115]:
# Running a multiple regression on the predictors where p < 0.05, within the actress-only dataset.
fstring = "Oscar_win ~ Bafta_win + Sag_win"
print(run_MR(fstring,actress_table))

                            OLS Regression Results                            
Dep. Variable:              Oscar_win   R-squared:                       0.628
Model:                            OLS   Adj. R-squared:                  0.622
Method:                 Least Squares   F-statistic:                     102.9
Date:                Thu, 03 Apr 2025   Prob (F-statistic):           6.54e-27
Time:                        17:35:03   Log-Likelihood:                -1.0565
No. Observations:                 125   AIC:                             8.113
Df Residuals:                     122   BIC:                             16.60
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0163      0.026      0.635      0.5