In [1]:
import pandas as pd
import ast
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict
from tqdm import tqdm
import numpy as np
PATH_IN = './datasets/moviedata/'

df = pd.read_csv(PATH_IN + 'final_dataset.csv')
to_convert = ['Languages', 'Countries', 'Genres', 'Directors']
for elem in to_convert:
    for x in range(len(df)):
        try:
            df[elem][x] = ast.literal_eval(df[elem][x])
        except ValueError:
            df[elem][x] = df[elem][x]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[elem][x] = ast.literal_eval(df[elem][x])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[elem][x] = df[elem][x]


In [2]:
"""
    Eliminate empty values from the selected features.
    In order to do so, empty values are replaced by None,
    which is then effeciently removed.
"""

features_to_encode = ['Languages', 'Countries', 'Genres']
encoding_map = defaultdict()
encoded_df = df.copy()

for feature in features_to_encode:
    encoding_map[feature] = set()
    encoded_df[feature] = encoded_df[feature].apply(lambda x : None if len(x) == 0 else x)
    encoded_df = encoded_df[encoded_df[feature].notna()]
    encoded_df[feature].apply(lambda x : encoding_map[feature].update(x))

for feature in features_to_encode:
    encoding_map[feature] = list(encoding_map[feature])

print(encoded_df.shape[0], df.shape[0])

54884 65791


In [3]:
"""
    Preprocess a common genre, which is a combination of 2 other.
"""

def replace_values_in_list(df, col, old_values, new_value):
    df[col] = df[col].apply(lambda x: [new_value if check_common_elements(x, old_values) else i for i in x])
    return df

# function that checks if two lists have elements in common
def check_common_elements(list1, list2):
    return bool(set(list1) & set(list2))

encoded_df = replace_values_in_list(encoded_df, 'Genres', 'Action/Adventure', ['Action', 'Adventure'])
encoded_df['Genres'] = encoded_df['Genres'].apply(lambda x : list(set(x)))

In [4]:
"""
    Adjust the Data format for the features below, in order to be able to create columns
    out of the lists of categories.
"""
small_features = ['Languages', 'Genres', 'Countries']
feature_map = defaultdict()

chars_to_remove = set(['\'', '[', ']', ' ', '\"'])

for feature in tqdm(small_features):
    feature_map[feature] = set()
    encoded_df[feature] = encoded_df[feature].apply(lambda x : ''.join([c for c in str(x) if c not in chars_to_remove]))
    new_columns = encoded_df[feature].str.get_dummies(',').astype(bool)
    feature_map[feature].update(new_columns.columns)
    new_columns = new_columns.drop(new_columns.columns[new_columns.apply(lambda col: col.sum() < 100)], axis=1)
    encoded_df = pd.concat([encoded_df, new_columns], axis = 1)


100%|██████████| 3/3 [00:07<00:00,  2.49s/it]


In [5]:
"""
    Adjust the Director data, in order to be usefull for the Feature selection.
    Threshold is chosen, otherwise, due to the big amount of Directors,
    the encoding is very costly.
"""
directors = defaultdict(int)
def add_to_map(current_map, word_list):
    for word in word_list:
        directors[word] += 1

encoded_df['Directors'] = encoded_df['Directors'].apply(lambda x : ''.join([c for c in str(x) if c not in chars_to_remove]))
encoded_df['Directors'].apply(lambda x : add_to_map(directors, x.split(',')))
for director, value in tqdm(directors.items()):
    if value >= 20:
        new_column = encoded_df['Directors'].apply(lambda x : director in x.split(','))
        new_column = new_column.rename(director)
        encoded_df = pd.concat([encoded_df, new_column], axis=1)

100%|██████████| 20780/20780 [00:16<00:00, 1255.93it/s]


In [6]:
"""
    Obtain feature importance by utilizing a Random Forest
"""

#Drop columns, which are unnecessary
importance_df = encoded_df.drop(columns=['Wikipedia ID', 'Freebase ID', 'Name', 'Languages', 'Countries', 'Genres', 'IMDb ID', 'Release date', 'averageRating', 'numVotes',"Mpaas", 'Box offices',
                               "Directors", "Writers", "Producers", "Composers", "Cinematographers", "Editors"])
importance_df = importance_df.dropna()
wr = importance_df["Weighted Rating"]

importance_df = importance_df.drop(columns = ['Weighted Rating'])
X_train, X_test, y_train, y_test = train_test_split(importance_df, wr, random_state=42)


#Fit the Random Forest Regressor
feature_names = [f"{importance_df.columns[i]}" for i in range(importance_df.shape[1])]
forest = RandomForestRegressor(random_state=0)
forest.fit(X_train, y_train)

#Obtain imprortance
importances = forest.feature_importances_
forest_importances = pd.Series(importances, index=feature_names)
forest_importances.sort_values(ascending = False)[:20]

Runtime                  0.197985
Budgets                  0.141801
Drama                    0.024678
Animation                0.017045
English                  0.013417
CrimeFiction             0.012525
ScienceFiction           0.011584
Mystery                  0.011476
Comedy                   0.011154
Cult                     0.010757
UnitedStatesofAmerica    0.010540
Horror                   0.010457
Thriller                 0.010401
Adventure                0.010330
Action                   0.010186
RomanceFilm              0.010112
Comingofage              0.009792
Fantasy                  0.009728
UnitedKingdom            0.008861
Indie                    0.008244
dtype: float64

In [7]:
"""
    Get the most descriptive categories of a feature,
    as outputed by the Random Forest.
"""
def get_most_descriptive_values_in_category(feature, n):
    count = 0
    most_descriptive = []
    for index, value in forest_importances.sort_values(ascending = False).iteritems():
        if index.replace(" ", "") in feature_map[feature]:
            count += 1
            most_descriptive.append(index)
            if count == n:
                break
    return most_descriptive

In [8]:
"""
    Combine the categorical values into one
"""
importance_sum = defaultdict(float)
enc = ['Languages', 'Countries', 'Genres', 'Directors']
feature_map['Directors'] = set()
feature_map['Directors'].update(directors.keys())

for index, value in forest_importances.iteritems():
    for feature in enc:
        if index.replace(" ", "") in feature_map[feature]:
            importance_sum[feature] += value

  for index, value in forest_importances.iteritems():


In [9]:
importance_sum['Budgets'] = forest_importances['Budgets']
importance_sum['Runtime'] = forest_importances['Runtime']
importance_sum

defaultdict(float,
            {'Languages': 0.026422302871953855,
             'Genres': 0.4876929478825191,
             'Countries': 0.0627707790939454,
             'Directors': 0.08528662392775475,
             'Budgets': 0.14180094126988632,
             'Runtime': 0.19798478633984434})

In [10]:
# Therefore top Genres are Drama, Animation, Crime Fiction, Mystery, Romance , Science Fiction, Comedy, Adventure, Thriller, Action
most_descriptive_genres = ['Drama', 'Animation', 'CrimeFiction', 'Mystery', 'RomanceFilm', 'ScienceFiction', 'Comedy', 'Adventure', 'Thriller', 'Action']
most_descriptive_countries = ['UnitedStatesofAmerica', 'UnitedKingdom', 'Japan', 'NewZealand', 'India', 'France', 'Germany', 'Italy']
lr_columns = most_descriptive_genres
lr_columns.extend(most_descriptive_countries)
lr_columns.append('Budgets')
lr_columns.append('Runtime')
lr_columns.append('Weighted Rating')
lr_df = encoded_df[lr_columns]
lr_columns.remove('Weighted Rating')
lr_df = lr_df.dropna()
lr_df['Budgets'] = (lr_df['Budgets'] - lr_df['Budgets'].mean()) / lr_df['Budgets'].std()
lr_df['Runtime'] = (lr_df['Runtime'] - lr_df['Runtime'].mean()) / lr_df['Runtime'].std()
lr_df = lr_df.rename(columns={"Weighted Rating": "WR"})

In [11]:
"""
    Get Directors over decades, sorted based on their Weighted Rating.
"""
def get_directors_per_decade(df, genre = None):
    director_df = df.copy()
    director_df['Release date'] = director_df['Release date'].apply(lambda x : x//10000)
    
    if genre is not None:
        director_df = director_df.explode('Genres')
        director_df = director_df[director_df['Genres'] == genre]
    
    director_df = director_df.explode('Directors')
    bins = np.arange(1900, 2030, 10)
    labels = ['{}-{}'.format(i, j) for i, j in zip(bins[:-1], bins[1:])]
    time_intervals = pd.cut(director_df['Release date'], bins=bins, labels=labels, include_lowest=True)

    director_decade_group = director_df[['Directors', 'Weighted Rating']].groupby([time_intervals, 'Directors'], dropna=True).mean().dropna()

    return director_decade_group.sort_values('Weighted Rating', ascending=False).sort_index(level = ['Release date'], sort_remaining=False)

get_directors_per_decade(df, 'Animation')

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted Rating
Release date,Directors,Unnamed: 2_level_1
1900-1910,Segundo de Chomón,6.244280
1900-1910,Edwin S. Porter,6.231431
1910-1920,Winsor McCay,6.283572
1910-1920,Quirino Cristiani,6.231468
1910-1920,Wladyslaw Starewicz,6.229471
...,...,...
2010-2020,Simon Wells,5.834747
2010-2020,Tim Hill,5.747212
2010-2020,Zeljko Mitrovic,5.700687
2010-2020,Raja Gosnell,5.611337


In [12]:
"""
    This function performs a linear regression, using least squared error measurement.
    The time period of interest could be specified by adding the lower and higher bounds as an argument.
"""
def regression(df, target, features, year_lower = 0, year_higher = 0):
    if year_lower != 0 and year_higher != 0:
        df['Release date'] = df['Release date'].apply(lambda x : x//10000)
        df = df[df['Release date'] >= year_lower and df['Release date'] < year_higher]

    model = smf.ols(formula='{} ~ {}'.format(target, " + ".join(features)), data = df)
    result = model.fit()
    return result.summary()

In [13]:
regression(lr_df.copy(),"WR", lr_columns)

0,1,2,3
Dep. Variable:,WR,R-squared:,0.135
Model:,OLS,Adj. R-squared:,0.134
Method:,Least Squares,F-statistic:,112.7
Date:,"Fri, 23 Dec 2022",Prob (F-statistic):,0.0
Time:,17:46:16,Log-Likelihood:,-8287.4
No. Observations:,14444,AIC:,16620.0
Df Residuals:,14423,BIC:,16780.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.2072,0.011,550.015,0.000,6.185,6.229
Drama[T.True],0.1586,0.008,19.791,0.000,0.143,0.174
Animation[T.True],0.3497,0.022,15.684,0.000,0.306,0.393
CrimeFiction[T.True],0.1039,0.011,9.148,0.000,0.082,0.126
Mystery[T.True],0.1009,0.015,6.593,0.000,0.071,0.131
RomanceFilm[T.True],0.0021,0.010,0.211,0.833,-0.017,0.021
ScienceFiction[T.True],0.0065,0.014,0.477,0.633,-0.020,0.033
Comedy[T.True],0.0061,0.008,0.720,0.471,-0.011,0.023
Adventure[T.True],0.0087,0.012,0.712,0.476,-0.015,0.033

0,1,2,3
Omnibus:,2963.832,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24969.832
Skew:,0.751,Prob(JB):,0.0
Kurtosis:,9.264,Cond. No.,17.2


In [14]:
animation = get_directors_per_decade(df, 'Animation')
drama = get_directors_per_decade(df, 'Drama')

In [15]:
animation[animation.index.get_level_values(0) == "2010-2020"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted Rating
Release date,Directors,Unnamed: 2_level_1
2010-2020,Rich Moore,7.617872
2010-2020,Mamoru Hosoda,7.434025
2010-2020,Pierre Coffin,7.237197
2010-2020,Steven Spielberg,7.195816
2010-2020,Dan Scanlon,7.136992
2010-2020,...,...
2010-2020,Simon Wells,5.834747
2010-2020,Tim Hill,5.747212
2010-2020,Zeljko Mitrovic,5.700687
2010-2020,Raja Gosnell,5.611337


In [16]:
best_period = animation['Weighted Rating'].idxmax()
b = animation[animation.index.get_level_values(0) == best_period[0]]
b[b.index.get_level_values(1) == best_period[1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted Rating
Release date,Directors,Unnamed: 2_level_1
1990-2000,Roger Allers,8.447283


In [17]:
drama[drama.index.get_level_values(0) == "2010-2020"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted Rating
Release date,Directors,Unnamed: 2_level_1
2010-2020,Éric Toledano,8.435343
2010-2020,Olivier Nakache,8.435343
2010-2020,Christopher Nolan,8.368448
2010-2020,Quentin Tarantino,8.365343
2010-2020,Thomas Vinterberg,8.153862
2010-2020,...,...
2010-2020,John Singleton,5.366701
2010-2020,Christian E. Christiansen,5.356511
2010-2020,Angelina Jolie,5.344510
2010-2020,Bill Condon,5.024805


In [20]:
best_period = drama['Weighted Rating'].idxmax()
b = drama[drama.index.get_level_values(0) == best_period[0]]
b[b.index.get_level_values(1) == best_period[1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted Rating
Release date,Directors,Unnamed: 2_level_1
1990-2000,Frank Darabont,8.913211


In [21]:
"""
    Get Directors based on Genre.
"""
def get_directors_by_genre(df, genre):
    director_df = df.copy()
    director_df = director_df[['Directors', 'Genres', 'Weighted Rating']]
    director_df = director_df.explode('Genres')
    director_df = director_df.explode('Directors')
    director_df = director_df[director_df['Genres'] == genre]
    
    return director_df[director_df['Directors'].groupby(director_df['Directors']).transform('size') > 5].groupby('Directors', as_index=False)['Weighted Rating'].mean()


In [22]:
directors_animation = get_directors_by_genre(df, 'Animation')
directors_animation.max()

Directors          Yoshiaki Kawajiri
Weighted Rating              7.35902
dtype: object

In [23]:
directors_drama = get_directors_by_genre(df, 'Drama')
directors_drama.max()

Directors          Éric Rohmer
Weighted Rating       8.150876
dtype: object