In [67]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Turn off pandas' chained-assignment (SettingWithCopy) check: later cells assign new columns
# to frames obtained by selecting from other frames.
pd.set_option('mode.chained_assignment', None)

## Preprocessing

In [3]:
# Load the Oscar awards table and rename its columns to the names used by the other datasets
oscar_dataset = pd.read_csv('the_oscar_award.csv').rename(
    columns={
        'name': 'Actor Name',
        'film': 'Movie name',
        'year_film': 'Movie release year',
    }
)
# Keep only the acting awards: any category whose label contains ACTOR or ACTRESS
oscar_nominees = oscar_dataset.loc[
    oscar_dataset['category'].str.contains('ACTOR')
    | oscar_dataset['category'].str.contains('ACTRESS')
].reset_index(drop=True)
# Nominations that actually won
oscar_winners = oscar_nominees.loc[oscar_nominees['winner'] == True]
oscar_winners

Unnamed: 0,Movie release year,year_ceremony,ceremony,category,Actor Name,Movie name,winner
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
6,1928,1929,2,ACTOR,Warner Baxter,In Old Arizona,True
15,1928,1929,2,ACTRESS,Mary Pickford,Coquette,True
16,1929,1930,3,ACTOR,George Arliss,Disraeli,True
...,...,...,...,...,...,...,...
1725,2018,2019,91,ACTRESS IN A SUPPORTING ROLE,Regina King,If Beale Street Could Talk,True
1731,2019,2020,92,ACTOR IN A LEADING ROLE,Joaquin Phoenix,Joker,True
1737,2019,2020,92,ACTOR IN A SUPPORTING ROLE,Brad Pitt,Once upon a Time...in Hollywood,True
1742,2019,2020,92,ACTRESS IN A LEADING ROLE,Renée Zellweger,Judy,True


In [4]:
def count_previous_nominations(actor_name,ceremony,oscar_dataset):
    """Count an actor's nominations with a ceremony year up to and including `ceremony`.

    Args:
        - actor_name: value matched against the 'Actor Name' column
        - ceremony: upper bound (inclusive) compared with the 'year_ceremony' column
        - oscar_dataset: DataFrame with 'Actor Name' and 'year_ceremony' columns

    Returns:
        - number of matching rows
    """
    is_actor = oscar_dataset['Actor Name'] == actor_name
    not_after_ceremony = oscar_dataset['year_ceremony'] <= ceremony
    return (is_actor & not_after_ceremony).sum()
def count_previous_wins(actor_name,ceremony,oscar_dataset):
    """Count an actor's winning nominations with a ceremony year up to and including `ceremony`.

    Args:
        - actor_name: value matched against the 'Actor Name' column
        - ceremony: upper bound (inclusive) compared with the 'year_ceremony' column
        - oscar_dataset: DataFrame with 'Actor Name', 'year_ceremony' and 'winner' columns

    Returns:
        - number of matching rows whose 'winner' value is True
    """
    actor_wins = oscar_dataset[
        (oscar_dataset['Actor Name'] == actor_name) & (oscar_dataset['winner'] == True)
    ]
    return (actor_wins['year_ceremony'] <= ceremony).sum()


In [5]:
## For every Oscar nomination, count the actor's nominations and wins up to and including
## that ceremony year (the nomination of the row itself is part of the count).
# Row values are read by column label: integer keys on a label-indexed row are deprecated in pandas.
oscar_nominees['nominations so far'] = oscar_nominees[['Actor Name','year_ceremony']].apply(
    lambda row: count_previous_nominations(row['Actor Name'], row['year_ceremony'], oscar_nominees),
    axis=1)
oscar_nominees['wins so far'] = oscar_nominees[['Actor Name','year_ceremony']].apply(
    lambda row: count_previous_wins(row['Actor Name'], row['year_ceremony'], oscar_nominees),
    axis=1)
# Keep only the keys needed to join with the character data, plus the two counters
final_oscar_nominees = oscar_nominees[['Actor Name','Movie name','nominations so far','wins so far']]
final_oscar_nominees


Unnamed: 0,Actor Name,Movie name,nominations so far,wins so far
0,Richard Barthelmess,The Noose,1,0
1,Emil Jannings,The Last Command,1,1
2,Louise Dresser,A Ship Comes In,1,0
3,Janet Gaynor,7th Heaven,1,1
4,Gloria Swanson,Sadie Thompson,1,0
...,...,...,...,...
1743,Kathy Bates,Richard Jewell,4,1
1744,Laura Dern,Marriage Story,3,1
1745,Scarlett Johansson,Jojo Rabbit,2,0
1746,Florence Pugh,Little Women,1,0


In [6]:
# Column names for the character metadata file; they are supplied explicitly via `names=`
character_metadata_header = [
    'ID', 'Freebase movie ID', 'Movie release date',
    'Character Name',
    'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity', 'Actor Name',
    'Actor age at movie release',
    'Freebase character map', 'Freebase character ID', 'Freebase actor ID',
]
# Tab-separated character metadata
character_metadata = pd.read_csv('MovieSummaries/character.metadata.tsv', delimiter='\t', names=character_metadata_header)
# Movie metadata with ratings
movie_metadata = pd.read_csv('movies_with_rating.csv')


In [7]:
# Attach movie name, release year and average rating to every character row.
# The inner join keeps only characters whose movie ID appears in the movie table.
movie_ratings = movie_metadata[['ID','Movie name','Movie release year','averageRating']]
character_metadata_with_rating = character_metadata.merge(movie_ratings, how='inner', on='ID')

In [9]:
## Load the classified characters and attach the release year of their movie
filtered_characters = pd.read_csv('filtered_characters.csv')
final_characters = filtered_characters.merge(
    movie_ratings[['ID','Movie release year']],
    how='inner',
    left_on='movie_id',
    right_on='ID',
)

In [10]:
def count_similar_previous_roles(actor_name,movie_release_year,classification,characters_dataset):
    """Count the actor's roles of a given classification released strictly before a year.

    Args:
        - actor_name: value matched against the 'Actor Name' column
        - movie_release_year: exclusive upper bound compared with 'Movie release year'
        - classification: value matched against the 'classification' column
        - characters_dataset: DataFrame with the three columns above

    Returns:
        - number of matching rows
    """
    same_actor = characters_dataset['Actor Name'] == actor_name
    same_classification = characters_dataset['classification'] == classification
    released_before = characters_dataset['Movie release year'] < movie_release_year
    return (same_actor & same_classification & released_before).sum()

In [11]:
# For every classified role, count how many roles with the same classification the actor
# played in movies released strictly before this one.
# Row values are read by column label: integer keys on a label-indexed row are deprecated in pandas.
final_characters['played same character'] = final_characters[['Actor Name','Movie release year','classification']].apply(
    lambda row: count_similar_previous_roles(
        row['Actor Name'], row['Movie release year'], row['classification'], final_characters),
    axis=1)

In [12]:
# Example: the classified characters played by Tom Hanks
final_characters.loc[final_characters['Actor Name'] == 'Tom Hanks']

Unnamed: 0,movie_id,Actor Name,Character Name,name,classification,ID,Movie release year,played same character
1397,543433,Tom Hanks,Jimmy Dugan,Dugan,22,543433,1992.0,0
2173,53085,Tom Hanks,Sheriff Woody,Woody,3,53085,1995.0,0
2352,4186631,Tom Hanks,Richard Harlan Drew,Richard,8,4186631,1985.0,0
2569,1565181,Tom Hanks,"Walter Fielding, Jr.",Fielding,6,1565181,1986.0,0
2647,1724301,Tom Hanks,Det. Scott Turner,Turner,6,1724301,1989.0,1
6591,176489,Tom Hanks,Joe,Joe,6,176489,1990.0,2
8347,4186781,Tom Hanks,Lawrence Whatley Bourne III,III,20,4186781,1985.0,0


In [13]:
# Merge the classified characters with their character metadata and rating
final_filtered_characters = final_characters.drop(columns=['movie_id','name','Movie release year'])
character_metadata_with_role_count = character_metadata_with_rating.merge(
    final_filtered_characters,
    how='left',
    on=['ID','Actor Name','Character Name'],
)
# The left merge keeps characters that were never classified; their role count is set to 0
character_metadata_with_role_count['played same character'] = (
    character_metadata_with_role_count['played same character'].fillna(0)
)
# Add the Oscar counters, matched on actor and movie name
final_character_metadata = character_metadata_with_role_count.merge(
    final_oscar_nominees,
    how='left',
    on=['Actor Name','Movie name'],
)
# Roles without a matching nomination get 0 nominations and 0 wins
final_character_metadata[['nominations so far','wins so far']] = (
    final_character_metadata[['nominations so far','wins so far']].fillna(0)
)

In [18]:
## Derive the actor's year of birth (release year minus age at release); it is used later
## to compute a candidate actor's age for another movie
final_character_metadata['actor year Of birth'] = final_character_metadata['Movie release year'].sub(
    final_character_metadata['Actor age at movie release']
)

In [19]:
# One row of attributes per actor name found in final_character_metadata
# (drop_duplicates keeps the first row encountered for each name)
actors = final_character_metadata[
    ['Actor Name','Actor gender','Actor ethnicity','Actor height','actor year Of birth']
].drop_duplicates(subset=['Actor Name'])
actors

Unnamed: 0,Actor Name,Actor gender,Actor ethnicity,Actor height,actor year Of birth
0,Frank Krog,M,,,1955.0
1,Kristin Kajander,F,,,1960.0
2,Vidar Sandem,M,,,1948.0
3,Anne Krigsvoll,F,,,1958.0
4,Erwin Geschonneck,M,,,1907.0
...,...,...,...,...,...
79694,Marge Champion,F,,,1920.0
79699,Tony Bickley,M,,,
79700,Bill Fiore,,,,
79701,Janet Landgard,,,,


In [20]:
def standardize(x):
    """Center a data set and scale it to unit standard deviation along axis 0.

    Returns:
        - the standardized data
        - the mean that was subtracted
        - the standard deviation that was divided by (computed on the centered data)
    """
    center = np.mean(x, axis = 0)
    centered = x - center
    spread = np.std(centered, axis = 0)
    return centered / spread, center, spread

In [21]:
# Keep the response and the candidate features only, then standardize height and age.
# The means and standard deviations are kept so that new rows can be put on the same scale later.
final_character_metadata_normalized = final_character_metadata[[
    'averageRating',
    'played same character',
    'nominations so far',
    'wins so far',
    'Actor gender',
    'Actor age at movie release',
    'Actor ethnicity',
    'Actor height',
]]
final_character_metadata_normalized['Actor height'], mean_height, std_height = standardize(
    final_character_metadata_normalized['Actor height']
)
final_character_metadata_normalized['Actor age at movie release'], mean_age, std_age = standardize(
    final_character_metadata_normalized['Actor age at movie release']
)

In [22]:
# Drop rows that have fewer than 4 non-missing values, then fill what is still missing:
# 0 for the standardized continuous variables, 'Unspecified' for the categorical variables
final_character_metadata_normalized = final_character_metadata_normalized.dropna(thresh=4)
final_character_metadata_normalized[['Actor height','Actor age at movie release']] = (
    final_character_metadata_normalized[['Actor height','Actor age at movie release']].fillna(0)
)
final_character_metadata_normalized[['Actor ethnicity','Actor gender']] = (
    final_character_metadata_normalized[['Actor ethnicity','Actor gender']].fillna('Unspecified')
)

# Model

In [23]:
# Give the features short, space-free names; the column names are later interpolated
# into the regression formula string
final_character_metadata_normalized = final_character_metadata_normalized.rename(
    columns={
        'played same character': 'sameCharacterCount',
        'nominations so far': 'nominations',
        'wins so far': 'wins',
        'Actor gender': 'gender',
        'Actor age at movie release': 'age',
        'Actor ethnicity': 'ethnicity',
        'Actor height': 'height',
    }
)

In [24]:
def forward_selected(data, y_label):
    """Design a linear model by picking predictors using forward selection evaluated by adjusted R-squared.

    Starting from an empty selection, each round fits one OLS model per remaining column
    (current selection + that column) and keeps the column giving the highest adjusted
    R-squared. Selection stops as soon as the best candidate no longer strictly improves
    the score (the starting score is 0.0) or no column is left.

    Column names are interpolated directly into the formula string, so they must be
    usable as formula terms.

    Args:
        - data : DataFrame with all possible predictors and the response
        - y_label: string, name of the response column in data

    Returns:
        - model: fitted statsmodels OLS results for the selected predictors plus an
          intercept (intercept-only model if no predictor improved the score)
    """
    remaining = set(data.columns)
    remaining.remove(y_label)
    selected = []
    current_radj_score = 0.0
    while remaining:
        # Evaluate all possibilities of next predictor
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(y_label,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        # Highest score wins; ties are broken by candidate name
        best_new_radj_score, best_candidate = sorted(scores_with_candidates)[-1]
        # Stop on "no strict improvement". Breaking explicitly also covers an exact tie,
        # which previously left the loop condition true forever without selecting anything.
        if not current_radj_score < best_new_radj_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_radj_score = best_new_radj_score
    # Appending '1' to the joined terms yields the same "a + b + 1" text as before and
    # a valid "y ~ 1" formula when nothing was selected.
    formula = "{} ~ {}".format(y_label, ' + '.join(selected + ['1']))
    model = smf.ols(formula, data).fit()
    return model

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows; ethnicity and gender are dropped before splitting.
# A fixed random_state makes the split, and therefore the selected model, reproducible
# across kernel restarts.
train, test = train_test_split(
    final_character_metadata_normalized.drop(columns=['ethnicity', 'gender']),
    test_size=0.1,
    random_state=42,
)

In [26]:

# Select the predictors of averageRating on the training split by forward selection
model = forward_selected(train, 'averageRating')
# Keep the formula string of the selected model
formula = model.model.formula
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          averageRating   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     36.05
Date:                Fri, 23 Dec 2022   Prob (F-statistic):           5.24e-37
Time:                        20:46:08   Log-Likelihood:            -1.1378e+05
No. Observations:               71760   AIC:                         2.276e+05
Df Residuals:                   71754   BIC:                         2.276e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              6.2380      0

In [28]:
# Show the formula of the selected model
model.model.formula

'averageRating ~ nominations + sameCharacterCount + height + wins + age + 1'

In [30]:
def get_potential_actors(test_movie):
    """List candidate actors for every classified character of a movie.

    A candidate must have the same gender as the original actor, the same ethnicity
    (unless the original actor's ethnicity is missing) and fall in the same decade of
    age at the movie's release year.

    Reads the notebook-level frames `final_character_metadata`, `actors`,
    `oscar_nominees` and `final_characters`.

    Args:
        - test_movie: value matched against the 'ID' column of final_character_metadata

    Returns:
        - DataFrame with one row per (character, candidate actor) pair and the columns
          'Character Name', 'Actor Name', 'classification', 'Movie release year',
          'Actor height', 'actor_age_potential', 'nominations', 'wins',
          'sameCharacterCount'
    """
    # get details about the characters we want to cast for (classified characters only)
    test_characters = final_character_metadata[final_character_metadata['ID'] == test_movie]
    test_characters = test_characters[test_characters['classification'].notna()]
    test_characters_details = test_characters[['Character Name','Actor gender','Actor ethnicity','Actor age at movie release','classification','Movie release year']].copy()
    # Decade of the original actor's age. Vectorized floor division leaves a missing age
    # as NaN instead of raising on int(NaN); such a character matches no candidate below.
    test_characters_details['Actor range of age'] = test_characters_details['Actor age at movie release'] // 10

    # get potential actors: every character paired with every known actor
    potential_actors = pd.merge(test_characters_details,actors,how = 'cross',suffixes=('_original','_potential'))
    # filter potential actors by gender
    potential_actors = potential_actors[potential_actors['Actor gender_original'] == potential_actors['Actor gender_potential']]
    # filter potential actors by ethnicity (no constraint when the original ethnicity is unknown)
    potential_actors = potential_actors[
        potential_actors['Actor ethnicity_original'].isna()
        | (potential_actors['Actor ethnicity_original'] == potential_actors['Actor ethnicity_potential'])
    ].copy()
    # filter potential actors by age range
    potential_actors['actor_age_potential'] = potential_actors['Movie release year'] - potential_actors['actor year Of birth']
    potential_actors['Actor range of age_potential'] = potential_actors['actor_age_potential'] // 10
    potential_actors = potential_actors[potential_actors['Actor range of age'] == potential_actors['Actor range of age_potential']]

    # add rest of metadata to potential actors
    filtered_potential_actors = potential_actors[['Character Name','Actor Name','classification','Movie release year','Actor height','actor_age_potential']].copy()
    candidate_names = filtered_potential_actors['Actor Name']
    release_years = filtered_potential_actors['Movie release year']
    classifications = filtered_potential_actors['classification']
    # Plain comprehensions read values by column (no positional indexing of a labelled row)
    # and also work when no candidate is left.
    filtered_potential_actors['nominations'] = [
        count_previous_nominations(name, year, oscar_nominees)
        for name, year in zip(candidate_names, release_years)
    ]
    filtered_potential_actors['wins'] = [
        count_previous_wins(name, year, oscar_nominees)
        for name, year in zip(candidate_names, release_years)
    ]
    filtered_potential_actors['sameCharacterCount'] = [
        count_similar_previous_roles(name, year, classification, final_characters)
        for name, year, classification in zip(candidate_names, release_years, classifications)
    ]

    return filtered_potential_actors

In [68]:
def sort_potential_actors(potential_actors):
    """Score candidate actors with the fitted model and sort them by predicted rating.

    Uses the notebook-level `model` and the `mean_age`, `std_age`, `mean_height`,
    `std_height` values computed when the training data was standardized.

    Args:
        - potential_actors: DataFrame with at least the columns 'actor_age_potential'
          and 'Actor height', plus whatever other predictors the model formula uses

    Returns:
        - a new DataFrame with 'age' and 'height' standardized, a 'predicted_rating'
          column added, sorted by decreasing predicted rating
    """
    # Rename columns to the predictor names used by the model
    potential_actors = potential_actors.rename(columns = {
        'actor_age_potential': 'age',
        'Actor height': 'height'
    })

    # Normalize age and height with the statistics of the training data
    potential_actors['age'] = (potential_actors['age'] - mean_age)/std_age
    potential_actors['height'] = (potential_actors['height'] - mean_height)/std_height
    # Missing values become 0 after standardization. The target column must be 'age':
    # a misspelled 'age ' would create a new column and leave NaN in the real predictor.
    potential_actors[['height','age']] = potential_actors[['height','age']].fillna(0)

    # Predict the rating of every (character, candidate) pair
    potential_actors['predicted_rating'] = model.predict(potential_actors)

    # Sort result by predicted rating
    return potential_actors.sort_values(['predicted_rating'], ascending=False)

In [80]:
# Generate tables of actors suggested per character

test_movies = [13950959,24314116,1059701]
# Make sure the output folder exists before any table is written
os.makedirs("plots", exist_ok=True)
for movie_id in test_movies:
    test_df = sort_potential_actors(get_potential_actors(movie_id))
    characters = test_df['Character Name'].unique().tolist()
    for character in characters:
        # Candidates for this character, already sorted by decreasing predicted rating
        character_df = test_df.loc[test_df['Character Name'] == character, ['Actor Name', 'predicted_rating']]
        fig = go.Figure(data=[go.Table(
            header=dict(values=['Actor Name', 'Predicted Rating'], align='left'),
            cells=dict(values=[character_df[k].tolist() for k in character_df.columns], align='left'),
        )])
        # One HTML file per (movie, character)
        fig.write_html("plots/RQ3-"+ str(movie_id)+"-'"+character+"'.html")