In [1]:
import pandas as pd

In [32]:
# Column names for 'MovieSummaries/movie.metadata.tsv'; the list is passed as
# `names=` when that file is read, so the order here must match the file's
# column order. 'ID' is the key later used to join per-movie Oscar counts.
movie_metadata_header = [
    "ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages",
    "Movie countries",
    "Movie genres",
]

# Column names for 'MovieSummaries/character.metadata.tsv'; the list is passed
# as `names=` when that file is read, so the order here must match the file's
# column order. 'ID' identifies the movie a character row belongs to.
character_metadata_header = [
    "ID",
    "Freebase movie ID",
    "Movie release date",
    "Character Name",
    "Actor DOB",
    "Actor gender",
    "Actor height",
    "Actor ethnicity",
    "Actor Name",
    "Actor age at movie release",
    "Freebase character map",
    "Freebase character ID",
    "Freebase actor ID",
]

In [201]:
# Read the Oscar dataset and rename its columns to the names used by the
# movie / character metadata so the frames can be joined on 'Actor Name'.
oscar_dataset = pd.read_csv('the_oscar_award.csv').rename(
    columns={'name': 'Actor Name', 'film': 'Movie name', 'year_film': 'Movie release year'}
)

# Only keep the acting awards out of all categories. A single pattern covers
# both spellings; na=False keeps the mask strictly boolean so a row with a
# missing category is dropped instead of making the boolean indexing raise.
is_acting_category = oscar_dataset['category'].str.contains('ACTOR|ACTRESS', na=False)
oscar_nominees = oscar_dataset[is_acting_category].reset_index(drop=True)
# .eq(True) (rather than using the column as a mask directly) keeps rows only
# where `winner` is exactly True, whatever dtype the CSV parser inferred.
oscar_winners = oscar_nominees[oscar_nominees['winner'].eq(True)]

# Earliest ceremony year in which each actor was nominated / won.
# Only 'year_ceremony' is aggregated: it is the only column kept in the result.
actor_first_nomination = (
    oscar_nominees.groupby('Actor Name')['year_ceremony']
    .min()
    .rename('first nomination date')
    .reset_index()
)
actor_first_win = (
    oscar_winners.groupby('Actor Name')['year_ceremony']
    .min()
    .rename('first win date')
    .reset_index()
)

# One row per nominated actor; 'first win date' is NaN for actors who were
# nominated but never won (left join from nominations to wins).
best_actors = pd.merge(actor_first_nomination, actor_first_win, on='Actor Name', how='left')

# Display only: the filled copy is NOT assigned, so `best_actors` itself keeps
# NaN for never-won actors. 9999 is a "never happened" placeholder year.
best_actors.fillna(9999)

Unnamed: 0,Actor Name,first nomination date,first win date
0,Abigail Breslin,2007,9999.0
1,Adam Driver,2019,9999.0
2,Adolph Caesar,1985,9999.0
3,Adolphe Menjou,1931,9999.0
4,Adriana Barraza,2007,9999.0
...,...,...,...
930,Winona Ryder,1994,9999.0
931,Woody Allen,1978,9999.0
932,Woody Harrelson,1997,9999.0
933,Yalitza Aparicio,2019,9999.0


In [202]:
# The character file is tab-separated and is read with the column names
# supplied by character_metadata_header.
character_metadata = pd.read_csv(
    'MovieSummaries/character.metadata.tsv',
    sep='\t',
    names=character_metadata_header,
)

# Numeric release year = the text before the first '-' of the release date.
# Anything that cannot be parsed as a number (including missing dates)
# becomes NaN because of errors='coerce'.
character_metadata['Movie release year'] = pd.to_numeric(
    character_metadata['Movie release date'].str.split('-').str[0],
    errors='coerce',
)

# Narrow view with just the columns needed to match actors against Oscar data.
actors_in_movies = character_metadata[
    ['ID', 'Character Name', 'Actor Name', 'Movie release year']
]

In [235]:
# Attach each actor's first nomination / first win year to every cast row.
# best_actors is built with one row per 'Actor Name', so this is a many-to-one
# join; validate= makes pandas raise instead of silently duplicating cast rows
# (and inflating the per-movie counts) if that ever stops being true.
oscars_in_movies = pd.merge(
    actors_in_movies,
    best_actors,
    on='Actor Name',
    how='left',
    validate='many_to_one',
)

# Actors without a nomination / win get the placeholder year 9999 so that a
# later "release year > first nomination/win year" comparison is False for
# them and the columns can be cast to int. Only these two columns exist in the
# merged frame, so they are the only ones filled; a new frame is bound instead
# of mutating in place.
oscars_in_movies = oscars_in_movies.fillna(
    {'first nomination date': 9999, 'first win date': 9999}
)
oscars_in_movies

Unnamed: 0,ID,Character Name,Actor Name,Movie release year,first nomination date,first win date
0,975900,Akooshay,Wanda De Jesus,2001.0,9999.0,9999.0
1,975900,Lieutenant Melanie Ballard,Natasha Henstridge,2001.0,9999.0,9999.0
2,975900,Desolation Williams,Ice Cube,2001.0,9999.0,9999.0
3,975900,Sgt Jericho Butler,Jason Statham,2001.0,9999.0,9999.0
4,975900,Bashira Kincaid,Clea DuVall,2001.0,9999.0,9999.0
...,...,...,...,...,...,...
450664,913762,Elensh,Dorothy Elias-Fahn,1992.0,9999.0,9999.0
450665,913762,Hibiki,Jonathan Fahn,1992.0,9999.0,9999.0
450666,28308153,,David Hemmings,1957.0,9999.0,9999.0
450667,28308153,,Roberta Paterson,1957.0,9999.0,9999.0


In [262]:
# Per cast row: was the movie released strictly after the year of the actor's
# first nomination / first win? Rows with a missing release year compare as
# False, and so do actors carrying the 9999 placeholder year.
oscars_in_movies['nominated'] = oscars_in_movies['Movie release year'].gt(
    oscars_in_movies['first nomination date'].astype(int)
)
oscars_in_movies['winner'] = oscars_in_movies['Movie release year'].gt(
    oscars_in_movies['first win date'].astype(int)
)

# Per movie: number of cast rows flagged as prior winner / prior nominee
# (summing booleans counts the True values).
oscars_per_movie = (
    oscars_in_movies
    .groupby('ID')[['winner', 'nominated']]
    .sum()
    .reset_index()
)
oscars_per_movie.sort_values(by='winner', ascending=False)

Unnamed: 0,ID,winner,nominated
57346,31232694,11,12
24873,10821674,10,22
37485,19187200,7,13
1563,327427,6,15
631,101398,5,8
...,...,...,...
23223,9894771,0,0
23224,9894921,0,0
23225,9894962,0,0
23226,9895416,0,0


In [163]:
# The movie file is tab-separated and is read with the column names supplied
# by movie_metadata_header.
movie_metadata = pd.read_csv(
    'MovieSummaries/movie.metadata.tsv',
    sep='\t',
    names=movie_metadata_header,
)

# Numeric release year = the text before the first '-' of the release date.
# Anything that cannot be parsed as a number (including missing dates)
# becomes NaN because of errors='coerce'.
movie_metadata['Movie release year'] = pd.to_numeric(
    movie_metadata['Movie release date'].str.split('-').str[0],
    errors='coerce',
)


In [263]:
# Add the per-movie 'winner' / 'nominated' counts to the movie metadata.
# Inner join: movies whose 'ID' has no row in oscars_per_movie are dropped.
movies = movie_metadata.merge(oscars_per_movie, how='inner', on='ID')

In [264]:
# Split the movies into three groups by their cast counts:
#   - at least one cast row flagged 'winner'
#   - at least one cast row flagged 'nominated' but none flagged 'winner'
#   - no cast row flagged 'nominated'
movies_with_oscar_winner = movies.loc[movies['winner'].gt(0)]
movies_with_oscar_nomination = movies.loc[
    movies['nominated'].gt(0) & movies['winner'].eq(0)
]
movies_wo_oscar = movies.loc[movies['nominated'].eq(0)]