In [2]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Preprocessing

In [3]:
# Load the Oscar award table and rename its columns to the names used by the later merges
oscar_dataset = pd.read_csv('the_oscar_award.csv').rename(
    columns={
        'name': 'Actor Name',
        'film': 'Movie name',
        'year_film': 'Movie release year',
    }
)
# Acting awards only: any category whose label contains ACTOR or ACTRESS
category_labels = oscar_dataset['category']
is_acting_category = category_labels.str.contains('ACTOR') | category_labels.str.contains('ACTRESS')
oscar_nominees = oscar_dataset[is_acting_category].reset_index(drop=True)
# Winning nominations, a row subset of oscar_nominees
oscar_winners = oscar_nominees[oscar_nominees['winner'] == True]
oscar_winners

Unnamed: 0,Movie release year,year_ceremony,ceremony,category,Actor Name,Movie name,winner
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
6,1928,1929,2,ACTOR,Warner Baxter,In Old Arizona,True
15,1928,1929,2,ACTRESS,Mary Pickford,Coquette,True
16,1929,1930,3,ACTOR,George Arliss,Disraeli,True
...,...,...,...,...,...,...,...
1725,2018,2019,91,ACTRESS IN A SUPPORTING ROLE,Regina King,If Beale Street Could Talk,True
1731,2019,2020,92,ACTOR IN A LEADING ROLE,Joaquin Phoenix,Joker,True
1737,2019,2020,92,ACTOR IN A SUPPORTING ROLE,Brad Pitt,Once upon a Time...in Hollywood,True
1742,2019,2020,92,ACTRESS IN A LEADING ROLE,Renée Zellweger,Judy,True


In [4]:
def count_previous_nominations(actor_name,ceremony,oscar_dataset):
    """Count an actor's nominations up to and including a given ceremony year.

    Args:
        - actor_name: value matched exactly against the 'Actor Name' column
        - ceremony: ceremony year, compared against the 'year_ceremony' column
        - oscar_dataset: DataFrame with 'Actor Name' and 'year_ceremony' columns

    Returns:
        - number of rows for this actor with year_ceremony <= ceremony; the
          comparison is inclusive, so rows from the given year are counted too
    """
    rows_for_actor = oscar_dataset['Actor Name'] == actor_name
    held_by_then = oscar_dataset['year_ceremony'] <= ceremony
    return (rows_for_actor & held_by_then).sum()
def count_previous_wins(actor_name,ceremony,oscar_dataset):
    """Count an actor's wins up to and including a given ceremony year.

    Args:
        - actor_name: value matched exactly against the 'Actor Name' column
        - ceremony: ceremony year, compared against the 'year_ceremony' column
        - oscar_dataset: DataFrame with 'Actor Name', 'year_ceremony' and
          'winner' columns

    Returns:
        - number of rows for this actor with winner == True and
          year_ceremony <= ceremony (inclusive of the given year)
    """
    rows_for_actor = oscar_dataset['Actor Name'] == actor_name
    held_by_then = oscar_dataset['year_ceremony'] <= ceremony
    is_win = oscar_dataset['winner'] == True
    return (rows_for_actor & held_by_then & is_win).sum()


In [5]:
## For every acting nomination, count the nominations and wins the nominee has
## accumulated up to and including that ceremony year.
# Row fields are read by column label: positional access such as x[0] on a
# label-indexed Series is deprecated in pandas.
# NOTE(review): the counts are taken over the full oscar_dataset (every award
# category), not only the acting rows in oscar_nominees - confirm this is intended.
actor_and_ceremony = oscar_nominees[['Actor Name','year_ceremony']]
oscar_nominees['nominations so far'] = actor_and_ceremony.apply(
    lambda row: count_previous_nominations(row['Actor Name'], row['year_ceremony'], oscar_dataset),
    axis=1,
)
oscar_nominees['wins so far'] = actor_and_ceremony.apply(
    lambda row: count_previous_wins(row['Actor Name'], row['year_ceremony'], oscar_dataset),
    axis=1,
)
# Columns needed to attach the counts to the character metadata later on
final_oscar_nominees = oscar_nominees[['Actor Name','Movie name','nominations so far','wins so far']]
final_oscar_nominees


Unnamed: 0,Actor Name,Movie name,nominations so far,wins so far
0,Richard Barthelmess,The Noose,1,0
1,Emil Jannings,The Last Command,1,1
2,Louise Dresser,A Ship Comes In,1,0
3,Janet Gaynor,7th Heaven,1,1
4,Gloria Swanson,Sadie Thompson,1,0
...,...,...,...,...
1743,Kathy Bates,Richard Jewell,4,1
1744,Laura Dern,Marriage Story,3,1
1745,Scarlett Johansson,Jojo Rabbit,2,0
1746,Florence Pugh,Little Women,1,0


In [35]:
# Column names assigned to the character metadata TSV when it is read
character_metadata_header = [
    'ID', 'Freebase movie ID', 'Movie release date', 'Character Name',
    'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity',
    'Actor Name', 'Actor age at movie release', 'Freebase character map',
    'Freebase character ID', 'Freebase actor ID',
]
# Tab-separated character table, read with the names above
character_metadata = pd.read_csv('MovieSummaries/character.metadata.tsv', delimiter='\t', names=character_metadata_header)
# Movie table that carries the averageRating column used below
movie_metadata = pd.read_csv('movies_with_rating.csv')


In [36]:
# Attach movie name, release year and average rating to each character row (matched on ID)
rating_columns = ['ID', 'Movie name', 'Movie release year', 'averageRating']
movie_ratings = movie_metadata[rating_columns]
character_metadata_with_rating = character_metadata.merge(movie_ratings, on='ID', how='inner')

In [37]:
# Characters with a classification, joined to their movie's release year (movie_id matches ID)
filtered_characters = pd.read_csv('filtered_characters.csv')
release_years = movie_ratings[['ID', 'Movie release year']]
final_characters = filtered_characters.merge(release_years, left_on='movie_id', right_on='ID', how='inner')

In [38]:
def count_similar_previous_roles(actor_name,movie_release_year,classification,characters_dataset):
    """Count an actor's earlier roles that share a given classification.

    Args:
        - actor_name: value matched exactly against the 'Actor Name' column
        - movie_release_year: only roles released strictly before this year count
        - classification: value matched against the 'classification' column
        - characters_dataset: DataFrame with 'Actor Name', 'classification'
          and 'Movie release year' columns

    Returns:
        - number of rows for this actor and classification whose
          'Movie release year' is strictly less than movie_release_year
    """
    same_actor = characters_dataset['Actor Name'] == actor_name
    same_class = characters_dataset['classification'] == classification
    released_earlier = characters_dataset['Movie release year'] < movie_release_year
    return (same_actor & same_class & released_earlier).sum()

In [39]:
# For each character row, count the actor's earlier roles with the same classification.
# Row fields are read by column label: positional access such as x[0] on a
# label-indexed Series is deprecated in pandas.
final_characters['played same character'] = final_characters[['Actor Name','Movie release year','classification']].apply(
    lambda row: count_similar_previous_roles(
        row['Actor Name'], row['Movie release year'], row['classification'], final_characters),
    axis=1)

In [40]:
# Spot-check the role counts for a single actor
final_characters.loc[final_characters['Actor Name'] == 'Tom Hanks']

Unnamed: 0,movie_id,Actor Name,Character Name,name,classification,ID,Movie release year,played same character
1397,543433,Tom Hanks,Jimmy Dugan,Dugan,22,543433,1992.0,0
2173,53085,Tom Hanks,Sheriff Woody,Woody,3,53085,1995.0,0
2352,4186631,Tom Hanks,Richard Harlan Drew,Richard,8,4186631,1985.0,0
2569,1565181,Tom Hanks,"Walter Fielding, Jr.",Fielding,6,1565181,1986.0,0
2647,1724301,Tom Hanks,Det. Scott Turner,Turner,6,1724301,1989.0,1
6591,176489,Tom Hanks,Joe,Joe,6,176489,1990.0,2
8347,4186781,Tom Hanks,Lawrence Whatley Bourne III,III,20,4186781,1985.0,0


In [41]:
# Drop helper columns before merging back; 'Movie release year' already exists in character_metadata_with_rating
final_characters = final_characters.drop(['movie_id','name','Movie release year'], axis=1)

In [42]:
# Left join: every character row is kept, and rows without a match get NaN in the added columns
role_merge_keys = ['ID','Actor Name','Character Name']
character_metadata_with_role_count = character_metadata_with_rating.merge(final_characters, on=role_merge_keys, how='left')
character_metadata_with_role_count.columns

Index(['ID', 'Freebase movie ID', 'Movie release date', 'Character Name',
       'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity',
       'Actor Name', 'Actor age at movie release', 'Freebase character map',
       'Freebase character ID', 'Freebase actor ID', 'Movie name',
       'Movie release year', 'averageRating', 'classification',
       'played same character'],
      dtype='object')

In [43]:
# Missing role counts become 0
role_count_col = 'played same character'
character_metadata_with_role_count[role_count_col] = (
    character_metadata_with_role_count[role_count_col].fillna(0)
)

In [44]:
# Attach Oscar counts by actor and movie name; rows without a matching nomination get 0 for both counts
final_character_metadata = (
    character_metadata_with_role_count
    .merge(final_oscar_nominees, on=['Actor Name','Movie name'], how='left')
    .fillna({'nominations so far': 0, 'wins so far': 0})
)

In [48]:
# Rows with at least one nomination counted
final_character_metadata.loc[final_character_metadata['nominations so far'] > 0]

Unnamed: 0,ID,Freebase movie ID,Movie release date,Character Name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor Name,Actor age at movie release,Freebase character map,Freebase character ID,Freebase actor ID,Movie name,Movie release year,averageRating,classification,played same character,nominations so far,wins so far
61,1369204,/m/04x8zs,1939,,1902-05-02,M,1.880,,Brian Aherne,36.0,/m/02vcld0,,/m/04x8_g,Juarez,1939.0,6.9,,0.0,1.0,0.0
1057,142443,/m/011yl_,1996-01-21,Peter,1930-12-17,M,,/m/013xrm,Armin Mueller-Stahl,65.0,/m/0k36jv,/m/0bnr0v1,/m/02my3z,Shine,1996.0,7.2,21.0,0.0,1.0,0.0
1062,142443,/m/011yl_,1996-01-21,David Helfgott,1951-07-06,M,1.830,,Geoffrey Rush,44.0,/m/0k36jj,/m/02nw8qb,/m/0170pk,Shine,1996.0,7.2,,0.0,1.0,1.0
1348,142457,/m/011yqc,1997-05-14,Lynn Bracken,1953-12-08,F,1.710,/m/01qhm_,Kim Basinger,43.0,/m/0j_n18,/m/0bh33z3,/m/01d0fp,L.A. Confidential,1997.0,8.7,15.0,0.0,1.0,1.0
1512,1482785,/m/054_2g,1938-02-11,,1888-05-03,F,,,Beulah Bondi,,/m/02tb7jx,,/m/04c98w,Of Human Hearts,1938.0,6.8,,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78948,171618,/m/01716t,2001-02-16,Lee Krasner,1959-08-14,F,1.638,,Marcia Gay Harden,41.0,/m/0k201p,/m/0cgzsxj,/m/022411,Pollock,2001.0,7.0,6.0,0.0,1.0,1.0
79652,43452,/m/0bx0l,1962-12-10,T. E. Lawrence,1932-08-02,M,1.910,/m/02g7sp,Peter O'Toole,30.0,/m/0j_tdt,/m/02nw8h0,/m/0h0jz,Lawrence of Arabia,1962.0,8.3,6.0,0.0,1.0,0.0
79653,43452,/m/0bx0l,1962-12-10,Sherif Ali,1932-04-10,M,1.800,,Omar Sharif,30.0,/m/0j_tdz,/m/0c1mf0z,/m/019_1h,Lawrence of Arabia,1962.0,8.3,1.0,0.0,1.0,0.0
79673,6707631,/m/0gjk1d,1995-12-29,Sister Helen Prejean,1946-10-04,F,1.700,/m/09vc4s,Susan Sarandon,49.0,/m/0jx1qp,/m/02nwczk,/m/01vwllw,Dead Man Walking,1995.0,5.0,,0.0,5.0,1.0


In [49]:
def standardize(x):
    """Center and scale data along axis 0.

    Returns:
        - the data after subtracting its mean and dividing by its standard
          deviation, followed by that mean and that standard deviation
          (both computed with numpy along axis 0)
    """
    mean_x = np.mean(x, axis=0)
    centered = x - mean_x
    std_x = np.std(centered, axis=0)
    return centered / std_x, mean_x, std_x

In [50]:
# Keep relevant features and normalize height and age
final_character_metadata_normalized = final_character_metadata[['averageRating', 'played same character', 'nominations so far', 'wins so far', 'Actor gender', 'Actor age at movie release', 'Actor ethnicity', 'Actor height']]
final_character_metadata_normalized['Actor height'], mean_height, std_height = standardize(final_character_metadata_normalized['Actor height'])
final_character_metadata_normalized['Actor age at movie release'], mean_age, std_age = standardize(final_character_metadata_normalized['Actor age at movie release'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_character_metadata_normalized['Actor height'], mean_height, std_height = standardize(final_character_metadata_normalized['Actor height'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_character_metadata_normalized['Actor age at movie release'], mean_age, std_age = standardize(final_character_metadata_normalized['Actor age at movie release'])


In [52]:
# fill NA values with 0 for normalized continuous variables, with Unspecified for categorical variables
final_character_metadata_normalized = final_character_metadata_normalized.dropna(thresh=4)
final_character_metadata_normalized[['Actor height','Actor age at movie release']] = final_character_metadata_normalized[['Actor height','Actor age at movie release']].fillna(0)
final_character_metadata_normalized[['Actor ethnicity','Actor gender']] = final_character_metadata_normalized[['Actor ethnicity','Actor gender']].fillna('Unspecified')

# Model

In [53]:
# Rename columns
final_character_metadata_normalized = final_character_metadata_normalized.rename(columns = {
    'played same character': 'sameCharacterCount', 
    'nominations so far': 'nominations', 
    'wins so far': 'wins', 
    'Actor gender': 'gender', 
    'Actor age at movie release': 'age', 
    'Actor ethnicity': 'ethnicity', 
    'Actor height': 'height'
})

In [54]:
def forward_selected(data, y_label):
    """Design a linear model by greedy forward selection on adjusted R-squared.

    Each round fits one OLS model per remaining column (current selection plus
    that column plus an intercept) and keeps the column giving the highest
    adjusted R-squared, provided it strictly beats the score reached so far.
    Selection stops when no column is left or no column improves the score.

    Args:
        - data : DataFrame with all possible predictors and the response
        - y_label: string, name of the response column in data

    Returns:
        - model: fitted statsmodels OLS model on the selected predictors; an
          intercept-only model if no predictor scores above 0.0

    Raises:
        - KeyError: if y_label is not a column of data
    """
    remaining = set(data.columns)
    remaining.remove(y_label)
    chosen = []
    score_so_far = 0.0  # baseline a first predictor has to beat
    while remaining:
        # Evaluate every one-predictor extension of the current selection
        trials = []
        for candidate in remaining:
            rhs = ' + '.join(chosen + [candidate, '1'])
            trial_formula = "{} ~ {}".format(y_label, rhs)
            trial_score = smf.ols(trial_formula, data).fit().rsquared_adj
            trials.append((trial_score, candidate))
        # Largest (score, name) tuple: ties on score are broken by column name
        top_score, top_candidate = sorted(trials)[-1]
        # Stop unless the score strictly improves. Breaking here also covers an
        # exact tie with the current score, which must not re-run the same round
        # forever.
        if not (score_so_far < top_score):
            break
        remaining.remove(top_candidate)
        chosen.append(top_candidate)
        score_so_far = top_score
    # With nothing selected this yields "<y> ~ 1", a plain intercept-only formula
    final_formula = "{} ~ {}".format(y_label, ' + '.join(chosen + ['1']))
    return smf.ols(final_formula, data).fit()

In [55]:
# Select predictors for the rating by forward selection, then show the resulting formula
model = forward_selected(data=final_character_metadata_normalized, y_label='averageRating')
model.model.formula

'averageRating ~ ethnicity + nominations + gender + height + sameCharacterCount + wins + age + 1'