In [138]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine writes to temporary slices; prefer an explicit
# `.copy()` wherever a filtered/sub-selected frame is modified afterwards.
pd.set_option('mode.chained_assignment', None)

## Preprocessing

In [139]:
# Load the Oscar awards table and rename its columns to match the movie/character data
oscar_dataset = pd.read_csv('the_oscar_award.csv').rename(
    columns={
        'name': 'Actor Name',
        'film': 'Movie name',
        'year_film': 'Movie release year',
    }
)
# Keep only the acting awards: any category whose label mentions ACTOR or ACTRESS
oscar_nominees = oscar_dataset[
    oscar_dataset['category'].str.contains('ACTOR|ACTRESS')
].reset_index(drop=True)
# Winners are the nominees whose `winner` flag is set
oscar_winners = oscar_nominees[oscar_nominees['winner'].eq(True)]
oscar_winners

Unnamed: 0,Movie release year,year_ceremony,ceremony,category,Actor Name,Movie name,winner
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
6,1928,1929,2,ACTOR,Warner Baxter,In Old Arizona,True
15,1928,1929,2,ACTRESS,Mary Pickford,Coquette,True
16,1929,1930,3,ACTOR,George Arliss,Disraeli,True
...,...,...,...,...,...,...,...
1725,2018,2019,91,ACTRESS IN A SUPPORTING ROLE,Regina King,If Beale Street Could Talk,True
1731,2019,2020,92,ACTOR IN A LEADING ROLE,Joaquin Phoenix,Joker,True
1737,2019,2020,92,ACTOR IN A SUPPORTING ROLE,Brad Pitt,Once upon a Time...in Hollywood,True
1742,2019,2020,92,ACTRESS IN A LEADING ROLE,Renée Zellweger,Judy,True


In [140]:
# count the actor's nominations at ceremonies held up to and including the given year
def count_previous_nominations(actor_name,ceremony,oscar_dataset):
    """Count how many rows of `oscar_dataset` belong to `actor_name` with
    `year_ceremony` <= `ceremony`.

    The comparison is inclusive, so a nomination in the `ceremony` year itself is counted.
    """
    is_actor = oscar_dataset['Actor Name'] == actor_name
    ceremony_years = oscar_dataset.loc[is_actor, 'year_ceremony']
    return ceremony_years.le(ceremony).sum()
# count the actor's wins at ceremonies held up to and including the given year
def count_previous_wins(actor_name,ceremony,oscar_dataset):
    """Count how many rows of `oscar_dataset` belong to `actor_name`, have
    `year_ceremony` <= `ceremony` and have `winner` equal to True.

    The comparison is inclusive, so a win in the `ceremony` year itself is counted.
    """
    actor_rows = oscar_dataset.loc[oscar_dataset['Actor Name'] == actor_name]
    held_by_then = actor_rows['year_ceremony'].le(ceremony)
    is_win = actor_rows['winner'].eq(True)
    return (held_by_then & is_win).sum()


In [141]:
## For every nomination, count the actor's nominations and wins at ceremonies held up to
## and including that nomination's ceremony year (so the nomination itself is counted).
# Row values are read by column label: integer keys such as x[0] on a label-indexed row
# are deprecated positional lookups in recent pandas releases.
oscar_nominees['nominations so far'] = oscar_nominees[['Actor Name','year_ceremony']].apply(
    lambda row: count_previous_nominations(row['Actor Name'], row['year_ceremony'], oscar_nominees),
    axis=1,
)
oscar_nominees['wins so far'] = oscar_nominees[['Actor Name','year_ceremony']].apply(
    lambda row: count_previous_wins(row['Actor Name'], row['year_ceremony'], oscar_nominees),
    axis=1,
)
# Only these columns are needed to attach the counts to the character metadata later on.
final_oscar_nominees = oscar_nominees[['Actor Name','Movie name','nominations so far','wins so far']]
final_oscar_nominees


Unnamed: 0,Actor Name,Movie name,nominations so far,wins so far
0,Richard Barthelmess,The Noose,1,0
1,Emil Jannings,The Last Command,1,1
2,Louise Dresser,A Ship Comes In,1,0
3,Janet Gaynor,7th Heaven,1,1
4,Gloria Swanson,Sadie Thompson,1,0
...,...,...,...,...
1743,Kathy Bates,Richard Jewell,4,1
1744,Laura Dern,Marriage Story,3,1
1745,Scarlett Johansson,Jojo Rabbit,2,0
1746,Florence Pugh,Little Women,1,0


In [142]:
# Column names applied to the character metadata TSV, in file order
# (passing `names=` without `header=` makes pandas treat the first line as data).
character_metadata_header = [
    'ID', 'Freebase movie ID', 'Movie release date', 'Character Name',
    'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity',
    'Actor Name', 'Actor age at movie release',
    'Freebase character map', 'Freebase character ID', 'Freebase actor ID',
]
character_metadata = pd.read_csv(
    'MovieSummaries/character.metadata.tsv', sep='\t', names=character_metadata_header
)
# Movie-level table; the columns used below are ID, Movie name, Movie release year and averageRating.
movie_metadata = pd.read_csv('movies_with_rating.csv')


In [143]:
# Attach each movie's name, release year and average rating to its character rows.
# Inner join on ID: characters whose movie is absent from movie_metadata are dropped.
movie_ratings = movie_metadata[['ID','Movie name','Movie release year','averageRating']]
character_metadata_with_rating = character_metadata.merge(movie_ratings, how='inner', on='ID')

In [144]:
# Inspect the character rows enriched with movie name, release year and rating.
character_metadata_with_rating

Unnamed: 0,ID,Freebase movie ID,Movie release date,Character Name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor Name,Actor age at movie release,Freebase character map,Freebase character ID,Freebase actor ID,Movie name,Movie release year,averageRating
0,28463795,/m/0crgdbh,1988,,1954-10-05,M,,,Frank Krog,33.0,/m/0gct1bn,,/m/053j7xf,Brun bitter,1988.0,5.7
1,28463795,/m/0crgdbh,1988,,1959-02-28,F,,,Kristin Kajander,28.0,/m/0gct1bv,,/m/0gct1by,Brun bitter,1988.0,5.7
2,28463795,/m/0crgdbh,1988,,1947,M,,,Vidar Sandem,40.0,/m/0gct1c5,,/m/0bwh7d8,Brun bitter,1988.0,5.7
3,28463795,/m/0crgdbh,1988,,1957-02-04,F,,,Anne Krigsvoll,30.0,/m/0gct1cb,,/m/04ghdvq,Brun bitter,1988.0,5.7
4,32456683,/m/0gyryjt,1967,,1906-12-27,M,,,Erwin Geschonneck,60.0,/m/0h40xcx,,/m/02qyp1n,Die Fahne von Kriwoj Rog,1967.0,7.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79728,6456053,/m/0g605h,1964-12-31,Lady Angela St. Simeon,1923-08-06,F,,,Moira Lister,41.0,/m/0cg4r4r,/m/0hnw0m8,/m/07pfcc,The Yellow Rolls-Royce,1964.0,6.4
79729,6456053,/m/0g605h,1964-12-31,Assistant Car Salesman,1933-07-26,M,,,Lance Percival,31.0,/m/0cg01yt,/m/0hnw0ly,/m/052k8b,The Yellow Rolls-Royce,1964.0,6.4
79730,6456053,/m/0g605h,1964-12-31,Norwood,1900-08-31,M,,,Roland Culver,64.0,/m/0cs4ftp,/m/0hnw0kl,/m/0412scr,The Yellow Rolls-Royce,1964.0,6.4
79731,6456053,/m/0g605h,1964-12-31,Harnsworth,1911-10-03,M,,,Michael Hordern,53.0,/m/0csk5b4,/m/0hnw0mv,/m/015vql,The Yellow Rolls-Royce,1964.0,6.4


In [173]:
# Load the characters that carry a role `classification` and attach their movie's release year.
# Inner join: `movie_id` on the character side is matched against `ID` on the movie side.
filtered_characters = pd.read_csv('filtered_characters.csv')
final_characters = filtered_characters.merge(
    movie_ratings[['ID','Movie release year']],
    how='inner',
    left_on='movie_id',
    right_on='ID',
)

In [174]:
def count_similar_previous_roles(actor_name,movie_release_year,classification,characters_dataset):
    """Count the actor's roles of the given classification released strictly before a year.

    Rows of `characters_dataset` are matched on 'Actor Name' and 'classification';
    of those, the ones whose 'Movie release year' is < `movie_release_year` are counted.
    """
    same_actor = characters_dataset['Actor Name'].eq(actor_name)
    same_classification = characters_dataset['classification'].eq(classification)
    release_years = characters_dataset.loc[same_actor & same_classification, 'Movie release year']
    return release_years.lt(movie_release_year).sum()

In [175]:
# For every role, count the actor's earlier roles (strictly earlier release year) that
# share the same classification.
# Row values are read by column label: integer keys such as x[0] on a label-indexed row
# are deprecated positional lookups in recent pandas releases.
final_characters['played same character'] = final_characters[['Actor Name','Movie release year','classification']].apply(
    lambda row: count_similar_previous_roles(
        row['Actor Name'], row['Movie release year'], row['classification'], final_characters
    ),
    axis=1,
)

In [176]:
# Spot-check the role counts on one actor.
final_characters.loc[final_characters['Actor Name'].eq('Tom Hanks')]

Unnamed: 0,movie_id,Actor Name,Character Name,name,classification,ID,Movie release year,played same character
1397,543433,Tom Hanks,Jimmy Dugan,Dugan,22,543433,1992.0,0
2173,53085,Tom Hanks,Sheriff Woody,Woody,3,53085,1995.0,0
2352,4186631,Tom Hanks,Richard Harlan Drew,Richard,8,4186631,1985.0,0
2569,1565181,Tom Hanks,"Walter Fielding, Jr.",Fielding,6,1565181,1986.0,0
2647,1724301,Tom Hanks,Det. Scott Turner,Turner,6,1724301,1989.0,1
6591,176489,Tom Hanks,Joe,Joe,6,176489,1990.0,2
8347,4186781,Tom Hanks,Lawrence Whatley Bourne III,III,20,4186781,1985.0,0


In [177]:
# Keep only the merge keys plus `classification` and `played same character`.
final_filtered_characters = final_characters.drop(
    ['movie_id', 'name', 'Movie release year'], axis='columns'
)

In [178]:
# Left join: every character row is kept; classification and role count are filled in
# only where movie, actor and character name all match.
character_metadata_with_role_count = character_metadata_with_rating.merge(
    final_filtered_characters,
    how='left',
    on=['ID', 'Actor Name', 'Character Name'],
)
character_metadata_with_role_count.columns

Index(['ID', 'Freebase movie ID', 'Movie release date', 'Character Name',
       'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity',
       'Actor Name', 'Actor age at movie release', 'Freebase character map',
       'Freebase character ID', 'Freebase actor ID', 'Movie name',
       'Movie release year', 'averageRating', 'classification',
       'played same character'],
      dtype='object')

In [179]:
# Characters without a match in the classified set count as zero similar previous roles.
character_metadata_with_role_count = character_metadata_with_role_count.fillna(
    {'played same character': 0}
)

In [180]:
# Attach the Oscar counts to each (actor, movie) row; rows without a nomination get 0.
# NOTE(review): the join key is actor + movie *name* only, so two movies sharing a title
# would both pick up the counts — confirm this is acceptable.
final_character_metadata = (
    character_metadata_with_role_count
    .merge(final_oscar_nominees, how='left', on=['Actor Name', 'Movie name'])
    .fillna({'nominations so far': 0, 'wins so far': 0})
)

In [153]:
# Inspect the rows that picked up at least one nomination.
final_character_metadata.loc[final_character_metadata['nominations so far'].gt(0)]

Unnamed: 0,ID,Freebase movie ID,Movie release date,Character Name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor Name,Actor age at movie release,Freebase character map,Freebase character ID,Freebase actor ID,Movie name,Movie release year,averageRating,classification,played same character,nominations so far,wins so far
61,1369204,/m/04x8zs,1939,,1902-05-02,M,1.880,,Brian Aherne,36.0,/m/02vcld0,,/m/04x8_g,Juarez,1939.0,6.9,,0.0,1.0,0.0
1057,142443,/m/011yl_,1996-01-21,Peter,1930-12-17,M,,/m/013xrm,Armin Mueller-Stahl,65.0,/m/0k36jv,/m/0bnr0v1,/m/02my3z,Shine,1996.0,7.2,21.0,0.0,1.0,0.0
1062,142443,/m/011yl_,1996-01-21,David Helfgott,1951-07-06,M,1.830,,Geoffrey Rush,44.0,/m/0k36jj,/m/02nw8qb,/m/0170pk,Shine,1996.0,7.2,,0.0,1.0,1.0
1348,142457,/m/011yqc,1997-05-14,Lynn Bracken,1953-12-08,F,1.710,/m/01qhm_,Kim Basinger,43.0,/m/0j_n18,/m/0bh33z3,/m/01d0fp,L.A. Confidential,1997.0,8.7,15.0,0.0,1.0,1.0
1512,1482785,/m/054_2g,1938-02-11,,1888-05-03,F,,,Beulah Bondi,,/m/02tb7jx,,/m/04c98w,Of Human Hearts,1938.0,6.8,,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78948,171618,/m/01716t,2001-02-16,Lee Krasner,1959-08-14,F,1.638,,Marcia Gay Harden,41.0,/m/0k201p,/m/0cgzsxj,/m/022411,Pollock,2001.0,7.0,6.0,0.0,1.0,1.0
79652,43452,/m/0bx0l,1962-12-10,T. E. Lawrence,1932-08-02,M,1.910,/m/02g7sp,Peter O'Toole,30.0,/m/0j_tdt,/m/02nw8h0,/m/0h0jz,Lawrence of Arabia,1962.0,8.3,6.0,0.0,1.0,0.0
79653,43452,/m/0bx0l,1962-12-10,Sherif Ali,1932-04-10,M,1.800,,Omar Sharif,30.0,/m/0j_tdz,/m/0c1mf0z,/m/019_1h,Lawrence of Arabia,1962.0,8.3,1.0,0.0,1.0,0.0
79673,6707631,/m/0gjk1d,1995-12-29,Sister Helen Prejean,1946-10-04,F,1.700,/m/09vc4s,Susan Sarandon,49.0,/m/0jx1qp,/m/02nwczk,/m/01vwllw,Dead Man Walking,1995.0,5.0,,0.0,5.0,1.0


In [154]:
def standardize(x):
    """Standardize a data set along axis 0 (zero mean, unit standard deviation).

    Args:
        - x: array-like (NumPy array or pandas Series/DataFrame) of numeric values

    Returns:
        - x: the centered data divided by its standard deviation; a feature with zero
          standard deviation is returned as all zeros instead of being divided by zero
        - mean_x: mean of the input along axis 0
        - std_x: standard deviation of the input along axis 0 (0 for a constant feature)
    """
    mean_x = np.mean(x, axis = 0)
    centered = x - mean_x
    std_x = np.std(centered, axis = 0)
    # Where the spread is exactly 0, divide by 1 instead: the centered values are already
    # 0, so this avoids 0/0 = NaN (and the runtime warning) while leaving std_x untouched.
    divisor = std_x + (std_x == 0)
    return centered / divisor, mean_x, std_x

In [155]:
# Keep relevant features and normalize height and age
final_character_metadata_normalized = final_character_metadata[['averageRating', 'played same character', 'nominations so far', 'wins so far', 'Actor gender', 'Actor age at movie release', 'Actor ethnicity', 'Actor height']]
final_character_metadata_normalized['Actor height'], mean_height, std_height = standardize(final_character_metadata_normalized['Actor height'])
final_character_metadata_normalized['Actor age at movie release'], mean_age, std_age = standardize(final_character_metadata_normalized['Actor age at movie release'])

In [156]:
# 1) drop rows that have fewer than 4 non-missing values,
# 2) fill the standardized continuous variables with 0,
# 3) fill the categorical variables with 'Unspecified'
final_character_metadata_normalized = (
    final_character_metadata_normalized
    .dropna(thresh=4)
    .fillna({
        'Actor height': 0,
        'Actor age at movie release': 0,
        'Actor ethnicity': 'Unspecified',
        'Actor gender': 'Unspecified',
    })
)

## Model

In [157]:
# Rename columns to short names without spaces.
# forward_selected() pastes column names verbatim into a formula string, so the names
# used as predictors must be usable as formula terms.
final_character_metadata_normalized = final_character_metadata_normalized.rename(columns = {
    'played same character': 'sameCharacterCount', 
    'nominations so far': 'nominations', 
    'wins so far': 'wins', 
    'Actor gender': 'gender', 
    'Actor age at movie release': 'age', 
    'Actor ethnicity': 'ethnicity', 
    'Actor height': 'height'
})

In [158]:
def forward_selected(data, y_label):
    """Design a linear model by picking predictors using forward selection evaluated by adjusted R-squared.

    Starting from an empty model, the predictor whose addition gives the highest adjusted
    R-squared is added at each step. Selection stops as soon as no remaining predictor
    strictly improves the current score (the baseline score is 0.0), or when every
    predictor has been selected.

    Column names are inserted verbatim into the formula, so they must be valid formula terms.

    Args:
        - data : DataFrame with all possible predictors and the response
        - y_label: string, name of the response column in data

    Returns:
        - model: an optimal fitted statsmodels linear model; if no predictor improves on
          the baseline, an intercept-only model is returned

    Raises:
        - KeyError: if y_label is not a column of data
    """
    remaining = set(data.columns)
    remaining.remove(y_label)
    selected = []
    current_radj_score = 0.0
    while remaining:
        # Evaluate all possibilities of next predictor
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(y_label,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        best_new_radj_score, best_candidate = max(scores_with_candidates)
        # Stop on "no strict improvement". Written as `not >` so that a tie (which would
        # otherwise re-evaluate the same candidates forever) and a NaN score both stop.
        if not best_new_radj_score > current_radj_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_radj_score = best_new_radj_score
    # Joining with the intercept term yields "y ~ a + b + 1" as before, and a well-formed
    # "y ~ 1" when nothing was selected.
    formula = "{} ~ {}".format(y_label, ' + '.join(selected + ['1']))
    model = smf.ols(formula, data).fit()
    return model

In [159]:
# Hold out 10% of the rows for evaluation; ethnicity and gender are left out of the model.
# A fixed random_state makes the split, and every number derived from it, reproducible
# across kernel restarts.
train, test = train_test_split(
    final_character_metadata_normalized.drop(columns=['ethnicity', 'gender']),
    test_size=0.1,
    random_state=42,
)

In [160]:

# Run forward selection on the training rows, keep the chosen formula and print the fit report.
model = forward_selected(data=train, y_label='averageRating')
formula = model.model.formula
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          averageRating   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     36.17
Date:                Fri, 23 Dec 2022   Prob (F-statistic):           3.89e-37
Time:                        19:30:14   Log-Likelihood:            -1.1386e+05
No. Observations:               71760   AIC:                         2.277e+05
Df Residuals:                   71754   BIC:                         2.278e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              6.2364      0

In [161]:
# Predicted average rating for every held-out row (a Series indexed like `test`).
a= model.predict(test)

In [185]:
# Inspect the predictions for the held-out rows.
a

11298    6.256642
23672    6.241882
11207    6.228961
7888     6.247879
75441    6.596224
           ...   
10649    6.236418
76399    6.229822
73750    6.246346
5569     6.236418
63742    6.226072
Length: 7974, dtype: float64

In [162]:
# Formula of the selected model (predictors appear in the order they were added).
model.model.formula

'averageRating ~ nominations + sameCharacterCount + height + wins + age + 1'

In [163]:
# Movies (by ID) whose cast is examined below; other IDs tried: 24314116, 1059701
test_movies = [13950959]
test_characters = final_character_metadata[final_character_metadata['ID'].isin(test_movies)]
# Keep only the characters that have a role classification.
test_characters = test_characters.dropna(subset=['classification'])

# .copy() makes this an independent frame: a later cell adds a column to it, which must
# not be a write into a slice of final_character_metadata.
test_characters_details = test_characters[['Character Name','Actor gender','Actor ethnicity','Actor age at movie release','classification','Movie release year']].copy()


In [164]:
# Age decade of the original actor (42 -> 4). Like the element-wise int() conversion,
# the integer cast fails if an age is missing.
test_characters_details['Actor range of age'] = (
    test_characters_details['Actor age at movie release'].floordiv(10).astype('int64')
)
test_characters_details

Unnamed: 0,Character Name,Actor gender,Actor ethnicity,Actor age at movie release,classification,Movie release year,Actor range of age
35910,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4
35911,Dr. John Cawley,M,/m/0dryh9k,66.0,4.0,2010.0,6
35912,Rachel Solando,F,,38.0,18.0,2010.0,3
35913,Dolores Chanal,F,,29.0,11.0,2010.0,2
35915,George Noyce,M,,48.0,4.0,2010.0,4
35916,Andrew Laeddis,M,/m/067lrj,48.0,3.0,2010.0,4
35921,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3


In [165]:
# Approximate birth year: release year of the movie minus the actor's age at release.
final_character_metadata['actor year Of birth'] = final_character_metadata['Movie release year'].sub(
    final_character_metadata['Actor age at movie release']
)

In [166]:
# One row per actor name with the actor-level attributes.
# drop_duplicates keeps the first occurrence, so the attributes come from the actor's
# first row in final_character_metadata (which may contain missing values).
actors = (
    final_character_metadata[['Actor Name','Actor gender','Actor ethnicity','Actor height','actor year Of birth']]
    .drop_duplicates(subset=['Actor Name'])
)

actors

Unnamed: 0,Actor Name,Actor gender,Actor ethnicity,Actor height,actor year Of birth
0,Frank Krog,M,,,1955.0
1,Kristin Kajander,F,,,1960.0
2,Vidar Sandem,M,,,1948.0
3,Anne Krigsvoll,F,,,1958.0
4,Erwin Geschonneck,M,,,1907.0
...,...,...,...,...,...
79694,Marge Champion,F,,,1920.0
79699,Tony Bickley,M,,,
79700,Bill Fiore,,,,
79701,Janet Landgard,,,,


In [167]:
# Pair every test character with every known actor (cartesian product); columns present
# on both sides get the `_original` (character's actor) / `_potential` (candidate) suffix.
potential_actors = test_characters_details.merge(
    actors,
    how='cross',
    suffixes=('_original', '_potential'),
)
potential_actors

Unnamed: 0,Character Name,Actor gender_original,Actor ethnicity_original,Actor age at movie release,classification,Movie release year,Actor range of age,Actor Name,Actor gender_potential,Actor ethnicity_potential,Actor height,actor year Of birth
0,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Frank Krog,M,,,1955.0
1,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Kristin Kajander,F,,,1960.0
2,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Vidar Sandem,M,,,1948.0
3,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Anne Krigsvoll,F,,,1958.0
4,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Erwin Geschonneck,M,,,1907.0
...,...,...,...,...,...,...,...,...,...,...,...,...
263916,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Marge Champion,F,,,1920.0
263917,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Tony Bickley,M,,,
263918,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Bill Fiore,,,,
263919,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Janet Landgard,,,,


In [168]:
## keep only candidates of the same gender as the original actor
## (pairs where either gender is missing never compare equal and are dropped)
potential_actors = potential_actors.loc[
    potential_actors['Actor gender_original'].eq(potential_actors['Actor gender_potential'])
]
potential_actors

Unnamed: 0,Character Name,Actor gender_original,Actor ethnicity_original,Actor age at movie release,classification,Movie release year,Actor range of age,Actor Name,Actor gender_potential,Actor ethnicity_potential,Actor height,actor year Of birth
0,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Frank Krog,M,,,1955.0
2,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Vidar Sandem,M,,,1948.0
4,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Erwin Geschonneck,M,,,1907.0
6,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Tyrese Gibson,M,/m/0x67,1.797,1979.0
8,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Anthony Johnson,M,/m/0x67,,1965.0
...,...,...,...,...,...,...,...,...,...,...,...,...
263909,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Gamil Ratib,M,,,1926.0
263910,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Herbert Jeffreys,M,/m/0x67,1.870,1913.0
263913,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Steve Boles,M,,,
263914,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Ray Aranha,M,/m/0x67,,1939.0


In [169]:
# keep same ethnicity if possible: when the original actor's ethnicity is unknown every
# candidate is kept, otherwise the candidate's ethnicity has to match
potential_actors = potential_actors.loc[
    potential_actors['Actor ethnicity_original'].isna()
    | potential_actors['Actor ethnicity_original'].eq(potential_actors['Actor ethnicity_potential'])
]
potential_actors

Unnamed: 0,Character Name,Actor gender_original,Actor ethnicity_original,Actor age at movie release,classification,Movie release year,Actor range of age,Actor Name,Actor gender_potential,Actor ethnicity_potential,Actor height,actor year Of birth
128,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Frankie Avalon,M,/m/0xnvg,1.730,1940.0
139,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,John Travolta,M,/m/0xnvg,1.830,1954.0
498,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Robert Blake,M,/m/0xnvg,1.630,
805,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Sonny Bono,M,/m/0xnvg,1.651,1936.0
844,Chuck Aule,M,/m/0xnvg,42.0,7.0,2010.0,4,Robert Loggia,M,/m/0xnvg,1.790,1930.0
...,...,...,...,...,...,...,...,...,...,...,...,...
229159,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Kirk Douglas,M,/m/09kr66,1.750,1917.0
229751,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Jerry Lewis,M,/m/09kr66,1.830,1926.0
240142,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Abe Vigoda,M,/m/09kr66,1.842,1921.0
244196,Teddy Daniels,M,/m/09kr66,35.0,5.0,2010.0,3,Leonardo DiCaprio,M,/m/09kr66,1.830,1975.0


In [170]:
# keep candidates who would be in the same age decade as the original actor at release:
# candidate age = release year - candidate birth year, decade = age // 10
potential_actors['actor_age_potential'] = potential_actors['Movie release year'].sub(
    potential_actors['actor year Of birth']
)
potential_actors['Actor range of age_potential'] = potential_actors['actor_age_potential'] // 10
potential_actors = potential_actors.loc[
    potential_actors['Actor range of age'].eq(potential_actors['Actor range of age_potential'])
]


In [171]:
# List the columns available after the gender / ethnicity / age-range filters.
potential_actors.columns

Index(['Character Name', 'Actor gender_original', 'Actor ethnicity_original',
       'Actor age at movie release', 'classification', 'Movie release year',
       'Actor range of age', 'Actor Name', 'Actor gender_potential',
       'Actor ethnicity_potential', 'Actor height', 'actor year Of birth',
       'actor_age_potential', 'Actor range of age_potential'],
      dtype='object')

In [181]:
# Build the model features for every remaining (character, candidate actor) pair.
# .copy() makes this an independent frame before columns are added to it.
filtered_potential_actors = potential_actors[['Character Name','Actor Name','classification','Movie release year','Actor height','actor_age_potential']].copy()
# Row values are read by column label: integer keys such as x[0] on a label-indexed row
# are deprecated positional lookups in recent pandas releases.
# The movie's release year is passed as the ceremony year, so nominations and wins are
# counted for ceremonies held up to and including that year.
filtered_potential_actors['nominations'] = filtered_potential_actors[['Actor Name','Movie release year']].apply(
    lambda row: count_previous_nominations(row['Actor Name'], row['Movie release year'], oscar_nominees),
    axis=1,
)
filtered_potential_actors['wins'] = filtered_potential_actors[['Actor Name','Movie release year']].apply(
    lambda row: count_previous_wins(row['Actor Name'], row['Movie release year'], oscar_nominees),
    axis=1,
)
# Earlier roles (strictly before the release year) of the same classification.
filtered_potential_actors['sameCharacterCount'] = filtered_potential_actors[['Actor Name','Movie release year','classification']].apply(
    lambda row: count_similar_previous_roles(
        row['Actor Name'], row['Movie release year'], row['classification'], final_characters
    ),
    axis=1,
)


In [184]:
# Inspect the candidate actors together with their computed model features.
filtered_potential_actors

Unnamed: 0,Character Name,Actor Name,classification,Movie release year,Actor height,actor_age_potential,nominations,wins,sameCharacterCount
1144,Chuck Aule,Mark Ruffalo,7.0,2010.0,1.730,42.0,0,0,0
1444,Chuck Aule,Matthew Fox,7.0,2010.0,1.880,43.0,0,0,0
3551,Chuck Aule,Max Martini,7.0,2010.0,1.850,40.0,0,0,0
5020,Chuck Aule,Adam Carolla,7.0,2010.0,1.880,45.0,0,0,0
5674,Chuck Aule,James Gandolfini,7.0,2010.0,1.848,49.0,0,0,0
...,...,...,...,...,...,...,...,...,...
188495,George Noyce,John Cameron Mitchell,4.0,2010.0,,46.0,0,0,0
188496,George Noyce,Danny Quinn,4.0,2010.0,1.870,45.0,0,0,0
188500,George Noyce,Velizar Binev,4.0,2010.0,,43.0,0,0,0
190715,Andrew Laeddis,Elias Koteas,3.0,2010.0,1.791,48.0,0,0,0
