In [1]:
import pandas as pd 

## Preprocessing

In [4]:
# Load the Oscar awards CSV and rename the columns that later cells refer to
oscar_dataset = pd.read_csv('the_oscar_award.csv').rename(
    columns={
        'name': 'Actor Name',
        'film': 'Movie name',
        'year_film': 'Movie release year',
    }
)
# Keep only the rows whose category mentions ACTOR or ACTRESS, with a fresh 0..n-1 index
oscar_nominees = oscar_dataset.loc[
    lambda awards: awards['category'].str.contains('ACTOR') | awards['category'].str.contains('ACTRESS')
].reset_index(drop=True)
# Subset of those rows where the `winner` flag is True
oscar_winners = oscar_nominees.loc[oscar_nominees['winner'] == True]
oscar_winners

Unnamed: 0,Movie release year,year_ceremony,ceremony,category,Actor Name,Movie name,winner
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
6,1928,1929,2,ACTOR,Warner Baxter,In Old Arizona,True
15,1928,1929,2,ACTRESS,Mary Pickford,Coquette,True
16,1929,1930,3,ACTOR,George Arliss,Disraeli,True
...,...,...,...,...,...,...,...
1725,2018,2019,91,ACTRESS IN A SUPPORTING ROLE,Regina King,If Beale Street Could Talk,True
1731,2019,2020,92,ACTOR IN A LEADING ROLE,Joaquin Phoenix,Joker,True
1737,2019,2020,92,ACTOR IN A SUPPORTING ROLE,Brad Pitt,Once upon a Time...in Hollywood,True
1742,2019,2020,92,ACTRESS IN A LEADING ROLE,Renée Zellweger,Judy,True


In [80]:
def count_previous_nominations(actor_name,ceremony,oscar_dataset):
    """Count an actor's nominations up to and including a ceremony year.

    Note that the comparison is inclusive (`<=`): rows whose `year_ceremony`
    equals `ceremony` are counted as well.

    Parameters
    ----------
    actor_name : value matched exactly against the 'Actor Name' column.
    ceremony : ceremony year compared against the 'year_ceremony' column.
    oscar_dataset : DataFrame with 'Actor Name' and 'year_ceremony' columns.

    Returns
    -------
    Number of rows for `actor_name` with `year_ceremony <= ceremony`.
    """
    same_actor = oscar_dataset['Actor Name'].eq(actor_name)
    up_to_ceremony = oscar_dataset['year_ceremony'].le(ceremony)
    return (same_actor & up_to_ceremony).sum()
def count_previous_wins(actor_name,ceremony,oscar_dataset):
    """Count an actor's wins up to and including a ceremony year.

    Note that the comparison is inclusive (`<=`): a win at the ceremony year
    passed in is counted as well.

    Parameters
    ----------
    actor_name : value matched exactly against the 'Actor Name' column.
    ceremony : ceremony year compared against the 'year_ceremony' column.
    oscar_dataset : DataFrame with 'Actor Name', 'year_ceremony' and 'winner' columns.

    Returns
    -------
    Number of rows for `actor_name` with `year_ceremony <= ceremony` and
    `winner == True`.
    """
    same_actor = oscar_dataset['Actor Name'].eq(actor_name)
    up_to_ceremony = oscar_dataset['year_ceremony'].le(ceremony)
    has_won = oscar_dataset['winner'].eq(True)
    return (same_actor & up_to_ceremony & has_won).sum()


In [164]:
## For every acting nomination, count how many nominations and wins the nominee
## has accumulated up to and including that ceremony year (the helpers use `<=`).
# NOTE(review): the counts are taken over the full `oscar_dataset`, not only the
# acting categories, so same-named rows from other categories are included — confirm intended.
# Row values are read by column label: integer keys such as x[0] on a label-indexed
# Series rely on a deprecated positional fallback.
oscar_nominees['nominations so far'] = oscar_nominees[['Actor Name','year_ceremony']].apply(
    lambda row: count_previous_nominations(row['Actor Name'], row['year_ceremony'], oscar_dataset),
    axis=1,
)
oscar_nominees['wins so far'] = oscar_nominees[['Actor Name','year_ceremony']].apply(
    lambda row: count_previous_wins(row['Actor Name'], row['year_ceremony'], oscar_dataset),
    axis=1,
)
# Keep only the columns needed to join the counts onto the character table
final_oscar_nominees = oscar_nominees[['Actor Name','Movie name','nominations so far','wins so far']]
final_oscar_nominees


Unnamed: 0,Actor Name,Movie name,nominations so far,wins so far
0,Richard Barthelmess,The Noose,1,0
1,Emil Jannings,The Last Command,1,1
2,Louise Dresser,A Ship Comes In,1,0
3,Janet Gaynor,7th Heaven,1,1
4,Gloria Swanson,Sadie Thompson,1,0
...,...,...,...,...
1743,Kathy Bates,Richard Jewell,4,1
1744,Laura Dern,Marriage Story,3,1
1745,Scarlett Johansson,Jojo Rabbit,2,0
1746,Florence Pugh,Little Women,1,0


In [150]:
# Column labels applied, in order, to the character metadata TSV
# (passed via `names=`, so the file is read without a header row)
character_metadata_header = [
    'ID', 'Freebase movie ID', 'Movie release date', 'Character Name',
    'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity',
    'Actor Name', 'Actor age at movie release',
    'Freebase character map', 'Freebase character ID', 'Freebase actor ID',
]
# Tab-separated character table
character_metadata = pd.read_csv(
    'MovieSummaries/character.metadata.tsv', sep='\t', names=character_metadata_header
)
# Movie table; later cells read its 'ID', 'Movie name', 'Movie release year' and 'averageRating' columns
movie_metadata = pd.read_csv('movies_with_rating.csv')


In [151]:
# Attach each movie's name, release year and average rating to its character rows.
# The inner join on 'ID' keeps only characters whose movie appears in the ratings table.
movie_ratings = movie_metadata.loc[:, ['ID','Movie name','Movie release year','averageRating']]
character_metadata_with_rating = character_metadata.merge(movie_ratings, how='inner', on='ID')

In [152]:
# Load filtered_characters.csv and attach each movie's release year
# (inner join: its `movie_id` against the ratings table's `ID`)
filtered_characters = pd.read_csv('filtered_characters.csv')
final_characters = filtered_characters.merge(
    movie_ratings[['ID','Movie release year']],
    how='inner',
    left_on='movie_id',
    right_on='ID',
)

In [153]:
def count_similar_previous_roles(actor_name,movie_release_year,classification,characters_dataset):
    """Count an actor's earlier roles that share a classification.

    Only rows released strictly before `movie_release_year` are counted, so
    roles from the same year are excluded.

    Parameters
    ----------
    actor_name : value matched exactly against the 'Actor Name' column.
    movie_release_year : year compared against the 'Movie release year' column.
    classification : value matched exactly against the 'classification' column.
    characters_dataset : DataFrame with 'Actor Name', 'classification' and
        'Movie release year' columns.

    Returns
    -------
    Number of rows for this actor and classification with
    `Movie release year < movie_release_year`.
    """
    same_actor = characters_dataset['Actor Name'].eq(actor_name)
    same_class = characters_dataset['classification'].eq(classification)
    released_earlier = characters_dataset['Movie release year'].lt(movie_release_year)
    return (same_actor & same_class & released_earlier).sum()

In [154]:
# For every character row, count the actor's earlier roles (strictly earlier release
# year) with the same classification. Row values are read by column label: integer
# keys such as x[0] on a label-indexed Series rely on a deprecated positional fallback.
final_characters['played same character'] = final_characters[['Actor Name','Movie release year','classification']].apply(
    lambda row: count_similar_previous_roles(
        row['Actor Name'], row['Movie release year'], row['classification'], final_characters),
    axis=1)

In [155]:
# Spot check: show the rows for one actor
final_characters.loc[final_characters['Actor Name'].eq('Tom Hanks')]

Unnamed: 0,movie_id,Actor Name,Character Name,name,classification,ID,Movie release year,played same character
1397,543433,Tom Hanks,Jimmy Dugan,Dugan,22,543433,1992.0,0
2173,53085,Tom Hanks,Sheriff Woody,Woody,3,53085,1995.0,0
2352,4186631,Tom Hanks,Richard Harlan Drew,Richard,8,4186631,1985.0,0
2569,1565181,Tom Hanks,"Walter Fielding, Jr.",Fielding,6,1565181,1986.0,0
2647,1724301,Tom Hanks,Det. Scott Turner,Turner,6,1724301,1989.0,1
6591,176489,Tom Hanks,Joe,Joe,6,176489,1990.0,2
8347,4186781,Tom Hanks,Lawrence Whatley Bourne III,III,20,4186781,1985.0,0


In [156]:
# Drop the columns not needed for the join below. `errors='ignore'` keeps this cell
# re-runnable: because it reassigns `final_characters`, a second run would otherwise
# raise KeyError on the already-removed columns.
final_characters = final_characters.drop(columns=['movie_id','name','Movie release year'], errors='ignore')

In [157]:
# Inner join: keep only character rows present in both tables, matched on movie ID,
# actor name and character name
character_metadata_with_role_count = character_metadata_with_rating.merge(
    final_characters,
    how='inner',
    on=['ID','Actor Name','Character Name'],
)

In [175]:
# Left join the Oscar counts onto the character table by actor and movie title.
# The right-hand side is first reduced to one row per (Actor Name, Movie name):
# duplicate keys there (e.g. the same actor nominated twice for one title) would
# otherwise multiply the matching character rows in a left join.
# NOTE(review): keep='last' assumes the nominee table is in chronological order, so
# the kept row carries the latest counts — confirm.
final_character_metadata = pd.merge(
    character_metadata_with_role_count,
    final_oscar_nominees.drop_duplicates(subset=['Actor Name','Movie name'], keep='last'),
    on = ['Actor Name','Movie name'],
    how = 'left',
)
# Roles with no matching nomination get 0 instead of NaN
final_character_metadata[['nominations so far','wins so far']] = final_character_metadata[['nominations so far','wins so far']].fillna(0)

In [177]:
# Show the column labels of the merged table
final_character_metadata.columns

Index(['ID', 'Freebase movie ID', 'Movie release date', 'Character Name',
       'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity',
       'Actor Name', 'Actor age at movie release', 'Freebase character map',
       'Freebase character ID', 'Freebase actor ID', 'Movie name',
       'Movie release year', 'averageRating', 'classification',
       'played same character', 'nominations so far', 'wins so far'],
      dtype='object')