In [239]:
import pandas as pd
import os
import numpy as np
import re

## Merging characters and movies

In [240]:
movie_df = pd.read_csv('data/movie.metadata.tsv', sep='\t', names=['wiki_movie_id', 'freebase_movie_id', 'title', 'release_date', 'box_office_revenue', 'runtime', 'languages', 'countries', 'genres'], index_col='freebase_movie_id')
movie_df = movie_df.reset_index()

In [241]:
def parse_string(s):
    try:
        s = s.lower().strip()
        s  = re.sub('[^a-zA-Z0-9 -]', '', s)
        if len(s) == 0:
            return None
        return s
    except:
        return None 

In [242]:
movie_df['parsed_movie_name'] = movie_df['title'].apply(parse_string)

In [243]:
col_names = ["wiki_movie_id", "freebase_movie_id", "movie_release_date", "character_name", "actor_date_of_birth", "Actor gender", "Actor height (in meters)", "Actor ethnicity (Freebase ID)", "Actor name", "Actor age at movie release", "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID"]
character_df = pd.read_csv('data/character.metadata.tsv', sep='\t', names=col_names, index_col="Freebase character ID")

# Remove characters without actors
character_df = character_df[character_df['Freebase actor ID'].notna()]

# Allow only one character per actor per movie.
# Motivation: When we look at if the actor won a Oscar we don't care about what role it won it as. 
character_df = character_df.groupby(['freebase_movie_id', 'Freebase actor ID']).first().reset_index()

character_df['parsed_actor_name'] = character_df['Actor name'].apply(parse_string)

# How many cases where the actors of the same name stars in the same movie or
temp = character_df.groupby(['freebase_movie_id', 'parsed_actor_name']).agg(count = ('parsed_actor_name', 'size')).reset_index()
movies_with_actors_of_same_name_df = temp[temp['count'] > 1]
print('Number of movies with actors of the same name:', movies_with_actors_of_same_name_df.shape[0])

# Only 313 we decide to drop the cases.
# TODO: Check if any of the 311 actors won Oscars
character_df = character_df.merge(temp, on=['freebase_movie_id', 'parsed_actor_name'], how='inner')
dropped_character_df = character_df[character_df['count'] == 1]

temp = dropped_character_df.groupby(['freebase_movie_id', 'parsed_actor_name']).agg(count = ('parsed_actor_name', 'size')).reset_index()

assert temp[temp['count'] > 1].empty

character_df = dropped_character_df

Number of movies with actors of the same name: 314


In [244]:
movie_character_df = movie_df.merge(character_df, on='freebase_movie_id', how='inner')
movie_character_df = movie_character_df[[
    'freebase_movie_id',
    'title',
    'release_date',
    'box_office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
    'parsed_movie_name',
    'Freebase actor ID',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'parsed_actor_name'
]]


In [245]:
# To join with Oscar Movie name and Actor name need to uniquely identify one row in movie_character_df 
temp = movie_character_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_movie_name', 'size')).reset_index()
print('Number of combination of actors of the same names starring in movies with the same name:', temp[temp['count'] > 1].shape[0])

# 350 combination. We decide to drop the combinations 
# TODO: Check if any movie won an Oscar
dropped_movie_character_df = movie_character_df.merge(temp, on=['parsed_movie_name', 'parsed_actor_name'], how='inner')
dropped_movie_character_df = dropped_movie_character_df[dropped_movie_character_df['count'] == 1]

temp = dropped_movie_character_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_movie_name', 'size'))

assert temp['count'].max() == 1

movie_character_df = dropped_movie_character_df
movie_character_df = movie_character_df.drop('count', axis=1)

Number of combination of actors of the same names starring in movies with the same name: 353


## Merging with oscar dataset

In [246]:
oscar_df = pd.read_csv('data/the_oscar_award.csv')

# Removes weird Oscar nomination
oscar_df = oscar_df[oscar_df['film'].notna() & oscar_df['name'].notna()]

In [247]:
# Filtering relevant Oscar catagories
# TODO: explain why we drop certain categories.
oscar_df = oscar_df[oscar_df['category'].str.contains('ACTOR') | oscar_df['category'].str.contains('ACTRESS')]

This join needs to be checked. 
Joining with ignore case? 
Do we want to do a right join? (so also keep movies that did not win oscars)


In [248]:
# Avoid case-sensitivity and extra spaces
oscar_df['parsed_movie_name'] = oscar_df['film'].apply(parse_string)
oscar_df['parsed_actor_name'] = oscar_df['name'].apply(parse_string)

In [249]:
# One actor won two Oscars for the same movie (Barry Fitzgerald)
oscar_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_actor_name', 'count')).sort_values('count', ascending=False)

# If actor has mutiple nomination for the same movie we choose the first
oscar_df = oscar_df.groupby(['parsed_movie_name', 'parsed_actor_name']).first().reset_index()

# For joining rows in oscar_df needs to be uniquely identified by name_parsed and film_title_parsed
assert oscar_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_movie_name', 'size')).max()['count'] == 1

In [250]:
movie_character_oscar_df = movie_character_df.merge(oscar_df, how='outer', on=['parsed_movie_name', 'parsed_actor_name'])

In [251]:
# Create column to determine if actor was nominated for Oscar 
movie_character_oscar_df['oscar_nominated'] = movie_character_oscar_df['winner'].notna()

In [252]:
oscar_nominated_actors_df = movie_character_oscar_df[movie_character_oscar_df['oscar_nominated']]
print('Number of different Oscar nominated films in dataset:', 
      oscar_nominated_actors_df['parsed_movie_name'].unique().shape[0])

print('Number of different Oscar nominated actors in dataset:', 
      oscar_nominated_actors_df['parsed_actor_name'].unique().shape[0])

print('Number of characters with Oscar nominated characters:', oscar_nominated_actors_df.shape[0])
print('Number of Oscar nominated movies in dataset:', oscar_nominated_actors_df['parsed_movie_name'].unique().shape[0])
print('Number of Oscar nominated actors in dataset:', oscar_nominated_actors_df['parsed_actor_name'].unique().shape[0])

Number of different Oscar nominated films in dataset: 1201
Number of different Oscar nominated actors in dataset: 982
Number of characters with Oscar nominated characters: 1827
Number of Oscar nominated movies in dataset: 1201
Number of Oscar nominated actors in dataset: 982


## Merging with IMDB ratings

In [253]:
#Read data
titlebasics_df = pd.read_csv('data/title.basics.tsv', sep='\t', quoting=3)
titleratings_df = pd.read_csv('data/title.ratings.tsv', sep='\t')

In [254]:
#Merge dataframes containing ratings and title names
name_rating_df = titlebasics_df.merge(titleratings_df, how='inner', on=['tconst', 'tconst'])
#Consider only movies
name_rating_df = name_rating_df[(name_rating_df["titleType"] == 'movie')]
#Get relevant columns
name_rating_df = name_rating_df[["primaryTitle", "startYear", "averageRating", "numVotes"]]

In [255]:
#Change the year data in IMDB dataset to float for merging to work
#\\N is a special value used for missing, replace with NaN so it can't be used for merging
name_rating_df["startYear"] = name_rating_df["startYear"].replace('\\N', np.nan)
name_rating_df["startYear"] = name_rating_df["startYear"].astype(float)
#Rename columns for merge
name_rating_df.rename(columns={'primaryTitle': 'title', 'startYear' : 'year', 'titleType': 'type'}, inplace=True)

In [256]:
#Have to take care of duplicate entries for movies in the same year
#ASSUME they are the same movie and aggregate the scores: sum up the numVotes and calculate the
#new average rating taking into account the number of votes

#The ratings need to be weighted to account for the number of votes
name_rating_df["RatingWeight"] = name_rating_df['averageRating'] * name_rating_df['numVotes']
# Group by title and year for duplicates, sum the number of votes and the weighted ratings
name_rating_agg_df = name_rating_df.groupby(['title','year']).agg(
    numVotes=('numVotes', 'sum'),
    RatingWeight = ('RatingWeight','sum'), 
).reset_index()

#Undo the previous weighing, dividing by number of all votes
name_rating_agg_df['averageRating'] = name_rating_agg_df['RatingWeight'] / name_rating_agg_df['numVotes']
#All ratings have 1 space after comma
name_rating_agg_df['averageRating'] = name_rating_agg_df['averageRating'].round(1)

#Drop the temporary weighted ratings
name_rating_agg_df.drop(columns='RatingWeight', inplace=True)

In [257]:
#Get the year of release of the movies from the mixed formatting
dates_as_year = pd.to_datetime(movie_character_oscar_df["release_date"],format='mixed',errors='coerce').dt.year

#Copy of the dataset with the added year column
movie_character_oscar_df['year'] = dates_as_year 

rows_before_ratings = movie_character_oscar_df.shape[0]

In [258]:
#Merge ratings with everything previous (left merge because we only want to add the ratings where we can, they
#are not the be-all-end-all
name_rating_agg_df['parsed_movie_name'] = name_rating_agg_df['title'].apply(parse_string)
movie_character_oscar_rating_df = movie_character_oscar_df.merge(
    name_rating_agg_df, on=['parsed_movie_name', 'year'], how='left')


We have three columns for date - `release_date`, `year_ceremony` and `year_film`. `year_film` and `year_ceremony` are from the oscar dataset and are only available for 1827 rows. Additionally, `year_ceremony` is always exactly 1 more than `year_film`. This allows us to conclude that the `year_film` and `year_ceremony` columns are redundant and can be dropped.

For similar reasons, the `name` and `film` columns from the oscar dataset are also redundant.

The `Freebase character/actor map ID` column is also not useful for us as we only care about actors and not the characters played by them in movies. 

From the pairs of columns (`title`,`parsed_movie_name`) and (`Actor_name`, `parsed_actor_name`), we choose to keep the parsed versions. At first glance, it might appear that the parsed columns have no missing values while the original ones do not, but this is simply because the missing values have been converted to 'nan' strings. This is something to be mindful of. TODO we might even want to drop rows with `nan` actors?



In [259]:
movie_character_oscar_rating_df = movie_character_oscar_rating_df[[
    'title_x',
    'release_date',
    'box_office_revenue',
    'runtime',
    'year',
    'languages',
    'countries',
    'genres',
    'parsed_movie_name',
    'Actor gender',
    'Actor height (in meters)',
    'Actor name',
    'Actor age at movie release',
    'parsed_actor_name',
    'oscar_nominated',
    'numVotes',
    'averageRating'
]]

In [260]:
movie_character_oscar_rating_df.rename(columns={
    'title_x': 'title', 
    'year': 'release_year', 
    'numVotes': 'number_of_votes',
    'averageRating': 'average_rating'
}, inplace=True)

In [261]:
# Creates an identifier
# Film identifier assumes no movies of the same name come out in the same year
movie_character_oscar_rating_df['identifier'] = movie_character_oscar_rating_df.apply(lambda x: x['parsed_actor_name'] + '_' + x['parsed_movie_name'] + '_' + str(x['release_year']), axis=1) 
movie_character_oscar_rating_df['film_identifier'] = movie_character_oscar_rating_df.apply(lambda x: x['parsed_movie_name'] + '_' + str(x['release_year']), axis=1)
movie_character_oscar_rating_df = movie_character_oscar_rating_df.drop(columns=['parsed_actor_name', 'parsed_movie_name'], axis=1)

In [262]:
movie_character_oscar_rating_df

Unnamed: 0,title,release_date,box_office_revenue,runtime,release_year,languages,countries,genres,Actor gender,Actor height (in meters),Actor name,Actor age at movie release,oscar_nominated,number_of_votes,average_rating,identifier,film_identifier
0,È arrivato il cavaliere!,1950,,92.0,1950.0,"{""/m/02bjrlw"": ""Italian Language""}","{""/m/03rjj"": ""Italy""}","{""/m/01z4y"": ""Comedy""}",F,,Alda Mangini,35.0,False,61.0,5.6,alda mangini_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
1,È arrivato il cavaliere!,1950,,92.0,1950.0,"{""/m/02bjrlw"": ""Italian Language""}","{""/m/03rjj"": ""Italy""}","{""/m/01z4y"": ""Comedy""}",M,,Arturo Bragaglia,,False,61.0,5.6,arturo bragaglia_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
2,È arrivato il cavaliere!,1950,,92.0,1950.0,"{""/m/02bjrlw"": ""Italian Language""}","{""/m/03rjj"": ""Italy""}","{""/m/01z4y"": ""Comedy""}",M,,Carlo Mazzarella,30.0,False,61.0,5.6,carlo mazzarella_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
3,È arrivato il cavaliere!,1950,,92.0,1950.0,"{""/m/02bjrlw"": ""Italian Language""}","{""/m/03rjj"": ""Italy""}","{""/m/01z4y"": ""Comedy""}",M,,Enrico Viarisio,,False,61.0,5.6,enrico viarisio_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
4,È arrivato il cavaliere!,1950,,92.0,1950.0,"{""/m/02bjrlw"": ""Italian Language""}","{""/m/03rjj"": ""Italy""}","{""/m/01z4y"": ""Comedy""}",M,,Enzo Biliotti,,False,61.0,5.6,enzo biliotti_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445051,Zyzzyx Road,2006-02-25,,90.0,2006.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0219x_"": ""Indie""...",M,1.80,Tom Sizemore,44.0,False,,,tom sizemore_zyzzyx road_2006.0,zyzzyx road_2006.0
445052,Zyzzyx Road,2006-02-25,,90.0,2006.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0219x_"": ""Indie""...",M,,Yorlin Madera,30.0,False,,,yorlin madera_zyzzyx road_2006.0,zyzzyx road_2006.0
445053,Zzyzx,2007,,81.0,2007.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/02wtdps"": ""Crime...",,,Kayo Zepeda,,False,,,kayo zepeda_zzyzx_2007.0,zzyzx_2007.0
445054,Zzyzx,2007,,81.0,2007.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/02wtdps"": ""Crime...",M,1.85,Kenny Johnson,43.0,False,,,kenny johnson_zzyzx_2007.0,zzyzx_2007.0


In [263]:
# We also note that the `winner` column has only 1827 values, while the `oscar_nominated` column has 445017. It is safe to say that if a movie is not nominated for an oscar, then it also does not win an oscar. Thus we impute the missing values for `winner` with `False`
# movie_character_oscar_rating_df_dropped.loc[movie_character_oscar_rating_df_dropped['winner'].isna(), 'winner'] = False

# This now also allows us to cast the `winner` column to boolean type
# movie_character_oscar_rating_df_dropped['winner'] = movie_character_oscar_rating_df_dropped['winner'].astype(bool)

In [264]:
#TODO: Fix this print
# print('Total number of movies with ratings: ',
#       movie_character_oscar_rating_df[movie_character_oscar_rating_df['average_rating'].notna()]['title'].unique().shape[0])

# #Get oscar nominated movies
# oscar_nominated_actors_with_ratings_df = movie_character_oscar_rating_df[movie_character_oscar_rating_df['oscar_nominated']]
# print('Total number of oscar-nominated movies with ratings: ',
#       oscar_nominated_actors_with_ratings_df[oscar_nominated_actors_with_ratings_df['average'].notna()]['title'].unique().shape[0])

In [265]:
#TODO: Fix the assert
# Assert that adding ratings did not add any rows
# assert len(movie_character_oscar_df) == len(movie_character_oscar_rating_df_dropped)

# Extracting column data

In [266]:
def extract_column(s):
    s = str(s)
    return re.findall('\"([^\/:][\w\s]+)"', s)

  return re.findall('\"([^\/:][\w\s]+)"', s)


In [267]:
movie_character_oscar_rating_df['countries'] = movie_character_oscar_rating_df['countries'].apply(extract_column)
movie_character_oscar_rating_df['languages'] = movie_character_oscar_rating_df['languages'].apply(extract_column)
movie_character_oscar_rating_df['genres'] = movie_character_oscar_rating_df['genres'].apply(extract_column)

In [269]:
movie_character_oscar_rating_df

Unnamed: 0,title,release_date,box_office_revenue,runtime,release_year,languages,countries,genres,Actor gender,Actor height (in meters),Actor name,Actor age at movie release,oscar_nominated,number_of_votes,average_rating,identifier,film_identifier
0,È arrivato il cavaliere!,1950,,92.0,1950.0,[Italian Language],[Italy],[Comedy],F,,Alda Mangini,35.0,False,61.0,5.6,alda mangini_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
1,È arrivato il cavaliere!,1950,,92.0,1950.0,[Italian Language],[Italy],[Comedy],M,,Arturo Bragaglia,,False,61.0,5.6,arturo bragaglia_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
2,È arrivato il cavaliere!,1950,,92.0,1950.0,[Italian Language],[Italy],[Comedy],M,,Carlo Mazzarella,30.0,False,61.0,5.6,carlo mazzarella_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
3,È arrivato il cavaliere!,1950,,92.0,1950.0,[Italian Language],[Italy],[Comedy],M,,Enrico Viarisio,,False,61.0,5.6,enrico viarisio_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
4,È arrivato il cavaliere!,1950,,92.0,1950.0,[Italian Language],[Italy],[Comedy],M,,Enzo Biliotti,,False,61.0,5.6,enzo biliotti_ arrivato il cavaliere_1950.0,arrivato il cavaliere_1950.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445051,Zyzzyx Road,2006-02-25,,90.0,2006.0,[English Language],[United States of America],"[Thriller, Indie, Psychological thriller, Dram...",M,1.80,Tom Sizemore,44.0,False,,,tom sizemore_zyzzyx road_2006.0,zyzzyx road_2006.0
445052,Zyzzyx Road,2006-02-25,,90.0,2006.0,[English Language],[United States of America],"[Thriller, Indie, Psychological thriller, Dram...",M,,Yorlin Madera,30.0,False,,,yorlin madera_zyzzyx road_2006.0,zyzzyx road_2006.0
445053,Zzyzx,2007,,81.0,2007.0,[English Language],[United States of America],"[Thriller, Crime Thriller, Road movie, Psychol...",,,Kayo Zepeda,,False,,,kayo zepeda_zzyzx_2007.0,zzyzx_2007.0
445054,Zzyzx,2007,,81.0,2007.0,[English Language],[United States of America],"[Thriller, Crime Thriller, Road movie, Psychol...",M,1.85,Kenny Johnson,43.0,False,,,kenny johnson_zzyzx_2007.0,zzyzx_2007.0


## Output data

In [268]:
# Finish processed data
path = 'cache/data.csv'
os.makedirs('cache', exist_ok=True)
movie_character_oscar_rating_df.to_csv(path)
print('Processing done')

Processing done
