In [13]:
import pandas as pd
import statsmodels.formula.api as smf

In [2]:

# Directory prefix for input files. The file name is appended directly to it,
# so the trailing slash is required.
PATH = "data/"

# Load the Oscar movies table and preview its first rows.
data = pd.read_csv(f"{PATH}oscar_movies.csv")
data.head()

Unnamed: 0,tconst,primaryTitle,IMDB_genres,averageRating,numVotes,release,revenue,runtime,countries,oscar_category,oscar_year,winner
0,tt0036775,double indemnity,"Crime,Drama,Film-Noir",8.3,170475,1944.0,5000000.0,108.0,United States of America,best motion picture,1945,False
1,tt0036855,gaslight,"Crime,Drama,Mystery",7.8,34945,1944.0,4613000.0,114.0,United States of America,best motion picture,1945,False
2,tt0036868,the best years of our lives,"Drama,Romance,War",8.1,72109,1946.0,23650000.0,168.0,United States of America,best motion picture,1947,True
3,tt0036872,going my way,"Comedy,Drama,Music",7.0,13723,1944.0,,126.0,United States of America,best motion picture,1945,True
4,tt0037280,since you went away,"Drama,Romance,War",7.5,5182,1944.0,,172.0,United States of America,best motion picture,1945,False


In [3]:
def parse_str_to_list(column):
    """
    Convert a Series of comma-separated strings into a Series of lists.

    Each non-null entry is split on "," and every resulting piece has its
    surrounding whitespace stripped. Null entries become an empty list.

    Parameters:
    column (pd.Series): A pandas Series whose non-null entries are strings
                        with items separated by commas.

    Returns:
    pd.Series: A Series with the same index as `column`, where each element
               is a list of stripped strings (empty list for null entries).
    """
    def _split_entry(entry):
        # Null entries have nothing to split; represent them as "no items".
        if pd.notnull(entry):
            return [piece.strip() for piece in entry.split(",")]
        return []

    return column.apply(_split_entry)

In [4]:
# Keep only the outcome and the candidate predictors, discard films with a
# missing value in any of them, then turn the comma-separated country and
# genre strings into lists.
data = (
    data[['countries', 'winner', 'averageRating', 'revenue', 'runtime', 'IMDB_genres']]
    .dropna()
    .assign(
        countries=lambda frame: parse_str_to_list(frame['countries']),
        IMDB_genres=lambda frame: parse_str_to_list(frame['IMDB_genres']),
    )
)

In [5]:
# Give every (film, country) pair its own row. The original row label is
# repeated on each of those rows, which is what lets the groupby below fold
# them back into one row per film.
exploded = data.explode('countries')

# One-hot encode the countries and keep only the USA indicator.
# drop_first is deliberately not used: it removes the first dummy level, so
# selecting a specific level by name afterwards raises KeyError whenever that
# level happens to come first, and it serves no purpose when a single
# indicator column is kept.
df_one_hot = pd.get_dummies(exploded['countries'], prefix='country').astype(int)
df_one_hot = df_one_hot['country_United States of America']

# Attach the indicator and collapse back to one row per film: the max per
# original row label is 1 if any of the film's countries is the USA.
# Note that every other column is max-reduced too, so 'countries' ends up
# holding a single value per film rather than the full list.
df = pd.concat([exploded, df_one_hot], axis=1).groupby(level=0).max()

# statsmodels needs a numeric 0/1 outcome rather than booleans.
df['winner'] = df['winner'].astype(int)

In [6]:
# Give every (film, genre) pair its own row; the repeated row label lets the
# groupby below fold the rows back into one per film.
exploded = df.explode('IMDB_genres')

# One-hot encode the genres and keep only the Drama indicator.
# drop_first is deliberately not used: it removes the first dummy level, so
# selecting 'genre_Drama' by name would raise KeyError if Drama happened to be
# that level, and it serves no purpose when a single indicator is kept.
df_one_hot = pd.get_dummies(exploded['IMDB_genres'], prefix='genre').astype(int)
df_one_hot = df_one_hot['genre_Drama']

# The max per original row label is 1 if any of the film's genres is Drama.
# Every other column is max-reduced as well, so 'IMDB_genres' ends up holding
# a single value per film.
df = pd.concat([exploded, df_one_hot], axis=1).groupby(level=0).max()

In [7]:
# Preview the first five films after both indicator columns have been added.
df.head(n=5)

Unnamed: 0,countries,winner,averageRating,revenue,runtime,IMDB_genres,country_United States of America,genre_Drama
0,United States of America,0,8.3,5000000.0,108.0,Film-Noir,1,1
1,United States of America,0,7.8,4613000.0,114.0,Mystery,1,1
2,United States of America,1,8.1,23650000.0,168.0,War,1,1
7,United States of America,0,7.2,21333333.0,126.0,Drama,1,1
10,United States of America,0,7.5,7000000.0,110.0,Romance,1,0


In [8]:
# The raw country and genre columns are no longer needed once their
# indicator columns exist.
df = df.drop(columns=['countries', 'IMDB_genres'])

In [9]:
# Give the indicator columns names without spaces so they can be used as
# plain terms in the regression formula.
df = df.rename(
    columns={
        'country_United States of America': 'fromUSA',
        'genre_Drama': 'isDrama',
    }
)


In [10]:
# Model formula: the binary outcome on the left, the five predictors on the
# right.
formula = (
    'winner ~ '
    'averageRating + fromUSA + revenue + runtime + isDrama'
)

In [11]:
# Fit the logistic regression described by `formula` on the prepared frame.
logit_model = (
    smf.logit(data=df, formula=formula)
    .fit()
)

Optimization terminated successfully.
         Current function value: 0.458398
         Iterations 6


In [12]:
# Show the coefficient table of the fitted model as plain text.
print(logit_model.summary().as_text())

                           Logit Regression Results                           
Dep. Variable:                 winner   No. Observations:                  226
Model:                          Logit   Df Residuals:                      220
Method:                           MLE   Df Model:                            5
Date:                Tue, 26 Nov 2024   Pseudo R-squ.:                 0.08169
Time:                        15:25:00   Log-Likelihood:                -103.60
converged:                       True   LL-Null:                       -112.81
Covariance Type:            nonrobust   LLR p-value:                  0.002452
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       -11.3287      3.025     -3.746      0.000     -17.257      -5.401
averageRating     1.1193      0.380      2.945      0.003       0.374       1.864
fromUSA          -0.7837      0.585     

The average rating is positively and significantly associated with the chance of winning. The coefficient for being from the USA is negative, but its confidence interval includes 0 and its p-value is greater than 0.05, so it is not statistically significant. Revenue, runtime, and belonging to the Drama genre are not significant either.