In [2]:
import pandas as pd
import os
import numpy as np

## Merging characters and movies

In [3]:
movie_df = pd.read_csv('data/movie.metadata.tsv', sep='\t', names=['wiki_movie_id', 'freebase_movie_id', 'title', 'release_date', 'box_office_revenue', 'runtime', 'languages', 'countries', 'genres'], index_col='freebase_movie_id')
movie_df = movie_df.reset_index()

In [4]:
movie_df['parsed_movie_name'] = movie_df['title'].str.lower().str.strip()

In [6]:
col_names = ["wiki_movie_id", "freebase_movie_id", "movie_release_date", "character_name", "actor_date_of_birth", "Actor gender", "Actor height (in meters)", "Actor ethnicity (Freebase ID)", "Actor name", "Actor age at movie release", "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID"]
character_df = pd.read_csv('data/character.metadata.tsv', sep='\t', names=col_names, index_col="Freebase character ID")

# Remove characters without actors
character_df = character_df[character_df['Freebase actor ID'].notna()]

# Allow only one character per actor per movie.
# Motivation: When we look at if the actor won a Oscar we don't care about what role it won it as. 
character_df = character_df.groupby(['freebase_movie_id', 'Freebase actor ID']).first().reset_index()

character_df['parsed_actor_name'] = character_df['Actor name'].str.lower().str.strip()

# How many cases where the actors of the same name stars in the same movie or
temp = character_df.groupby(['freebase_movie_id', 'parsed_actor_name']).agg(count = ('parsed_actor_name', 'size')).reset_index()
movies_with_actors_of_same_name_df = temp[temp['count'] > 1]
print('Number of movies with actors of the same name:', movies_with_actors_of_same_name_df.shape[0])

# Only 313 we decide to drop the cases.
# TODO: Check if any of the 311 actors won Oscars
character_df = character_df.merge(temp, on=['freebase_movie_id', 'parsed_actor_name'], how='inner')
dropped_character_df = character_df[character_df['count'] == 1]

temp = dropped_character_df.groupby(['freebase_movie_id', 'parsed_actor_name']).agg(count = ('parsed_actor_name', 'size')).reset_index()

assert temp[temp['count'] > 1].empty

character_df = dropped_character_df

Number of movies with actors of the same name: 313


In [7]:
movie_character_df = movie_df.merge(character_df, on='freebase_movie_id', how='inner')
movie_character_df = movie_character_df[[
    'freebase_movie_id',
    'title',
    'release_date',
    'box_office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
    'parsed_movie_name',
    'Freebase actor ID',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'parsed_actor_name'
]]


In [8]:
# To join with Oscar Movie name and Actor name need to uniquely identify one row in movie_character_df 
temp = movie_character_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_movie_name', 'size')).reset_index()
print('Number of combination of actors of the same names starring in movies with the same name:', temp[temp['count'] > 1].shape[0])

# 350 combination. We decide to drop the combinations 
# TODO: Check if any movie won an Oscar
dropped_movie_character_df = movie_character_df.merge(temp, on=['parsed_movie_name', 'parsed_actor_name'], how='inner')
dropped_movie_character_df = dropped_movie_character_df[dropped_movie_character_df['count'] == 1]

temp = dropped_movie_character_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_movie_name', 'size'))

assert temp['count'].max() == 1

movie_character_df = dropped_movie_character_df
movie_character_df = movie_character_df.drop('count', axis=1)

Number of combination of actors of the same names starring in movies with the same name: 350


## Merging with oscar dataset

In [9]:
oscar_df = pd.read_csv('data/the_oscar_award.csv')

# Removes weird Oscar nomination
oscar_df = oscar_df[oscar_df['film'].notna() & oscar_df['name'].notna()]

In [10]:
# Filtering relevant Oscar catagories
# TODO: explain why we drop certain categories.
oscar_df = oscar_df[oscar_df['category'].str.contains('ACTOR') | oscar_df['category'].str.contains('ACTRESS')]

This join needs to be checked. 
Joining with ignore case? 
Do we want to do a right join? (so also keep movies that did not win oscars)


In [11]:
# Avoid case-sensitivity and extra spaces
oscar_df['parsed_movie_name'] = oscar_df['film'].str.lower().str.strip()
oscar_df['parsed_actor_name'] = oscar_df['name'].str.lower().str.strip()

In [12]:
# One actor won two Oscars for the same movie (Barry Fitzgerald)
oscar_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_actor_name', 'count')).sort_values('count', ascending=False)

# If actor has mutiple nomination for the same movie we choose the first
oscar_df = oscar_df.groupby(['parsed_movie_name', 'parsed_actor_name']).first().reset_index()

# For joining rows in oscar_df needs to be uniquely identified by name_parsed and film_title_parsed
assert oscar_df.groupby(['parsed_movie_name', 'parsed_actor_name']).agg(count = ('parsed_movie_name', 'size')).max()['count'] == 1

In [13]:
movie_character_oscar_df = movie_character_df.merge(oscar_df, how='outer', on=['parsed_movie_name', 'parsed_actor_name'])

In [14]:
# Create column to determine if actor was nominated for Oscar 
movie_character_oscar_df['oscar_nominated'] = movie_character_oscar_df['winner'].notna()

In [15]:
oscar_nominated_actors_df = movie_character_oscar_df[movie_character_oscar_df['oscar_nominated']]
print('Number of different Oscar nominated films in dataset:', 
      oscar_nominated_actors_df['parsed_movie_name'].unique().shape[0])

print('Number of different Oscar nominated actors in dataset:', 
      oscar_nominated_actors_df['parsed_actor_name'].unique().shape[0])

print('Number of characters in dataset:', oscar_nominated_actors_df.shape[0])
print('Number of movies in dataset:', oscar_nominated_actors_df['parsed_movie_name'].unique().shape[0])
print('Number of actors in dataset:', oscar_nominated_actors_df['parsed_actor_name'].unique().shape[0])

Number of different Oscar nominated films in dataset: 1201
Number of different Oscar nominated actors in dataset: 982
Number of characters in dataset: 1827
Number of movies in dataset: 1201
Number of actors in dataset: 982


## Merging with IMDB ratings

In [None]:
#Read data
titlebasics_df = pd.read_csv('data/title.basics.tsv', sep='\t', quoting=3)
titleratings_df = pd.read_csv('data/title.ratings.tsv', sep='\t')

  titlebasics_df = pd.read_csv('data/title.basics.tsv', sep='\t')


In [18]:
#Merge dataframes containing ratings and title names
name_rating_df = titlebasics_df.merge(titleratings_df, how='inner', on=['tconst', 'tconst'])
#Consider only movies
name_rating_df = name_rating_df[(name_rating_df["titleType"] == 'movie')]
#Get relevant columns
name_rating_df = name_rating_df[["primaryTitle", "startYear", "averageRating", "numVotes"]]

In [19]:
#Change the year data in IMDB dataset to float for merging to work
#\\N is a special value used for missing, replace with NaN so it can't be used for merging
name_rating_df["startYear"] = name_rating_df["startYear"].replace('\\N', np.nan)
name_rating_df["startYear"] = name_rating_df["startYear"].astype(float)
#Rename columns for merge
name_rating_df.rename(columns={'primaryTitle': 'title', 'startYear' : 'year', 'titleType': 'type'}, inplace=True)

In [20]:
name_rating_df

Unnamed: 0,title,year,averageRating,numVotes
8,Miss Jerry,1894.0,5.4,215
144,The Corbett-Fitzsimmons Fight,1897.0,5.2,539
339,Bohemios,1905.0,4.4,18
374,The Story of the Kelly Gang,1906.0,6.0,938
384,The Prodigal Son,1907.0,5.7,28
...,...,...,...,...
1497890,Coven,2020.0,6.4,5937
1497895,The Secret of China,2019.0,3.6,19
1497901,Kuambil Lagi Hatiku,2019.0,8.3,9
1497914,Dankyavar Danka,2013.0,8.4,8


In [20]:
#Have to take care of duplicate entries for movies in the same year
#ASSUME they are the same movie and aggregate the scores: sum up the numVotes and calculate the
#new average rating taking into account the number of votes

#The ratings need to be weighted to account for the number of votes
name_rating_df["RatingWeight"] = name_rating_df['averageRating'] * name_rating_df['numVotes']
# Group by title and year for duplicates, sum the number of votes and the weighted ratings
name_rating_agg_df = name_rating_df.groupby(['title','year']).agg(
    numVotes=('numVotes', 'sum'),
    RatingWeight = ('RatingWeight','sum'), 
).reset_index()

#Undo the previous weighing, dividing by number of all votes
name_rating_agg_df['averageRating'] = name_rating_agg_df['RatingWeight'] / name_rating_agg_df['numVotes']
#All ratings have 1 space after comma
name_rating_agg_df['averageRating'] = name_rating_agg_df['averageRating'].round(1)

#Drop the temporary weighted ratings
name_rating_agg_df.drop(columns='RatingWeight', inplace=True)

In [21]:
#Get the year of release of the movies from the mixed formatting
dates_as_year = pd.to_datetime(movie_character_oscar_df["release_date"],format='mixed',errors='coerce').dt.year

#Copy of the dataset with the added year column
movie_character_oscar_fixedDate_df = movie_character_oscar_df.copy(deep=True)
movie_character_oscar_fixedDate_df["year"] = dates_as_year

In [22]:
#Merge ratings with everything previous (left merge because we only want to add the ratings where we can, they
#are not the be-all-end-all
movie_character_oscar_rating_df = movie_character_oscar_fixedDate_df.merge(
    name_rating_agg_df, on=['title', 'year'], how='left')


In [23]:
print(movie_character_oscar_rating_df.columns)
print(name_rating_agg_df.columns)

Index(['freebase_movie_id', 'title', 'release_date', 'box_office_revenue',
       'runtime', 'languages', 'countries', 'genres', 'parsed_movie_name',
       'Freebase actor ID', 'Actor gender', 'Actor height (in meters)',
       'Actor ethnicity (Freebase ID)', 'Actor name',
       'Actor age at movie release', 'Freebase character/actor map ID',
       'parsed_actor_name', 'year_film', 'year_ceremony', 'ceremony',
       'category', 'name', 'film', 'winner', 'oscar_nominated', 'year',
       'numVotes', 'averageRating'],
      dtype='object')
Index(['title', 'year', 'numVotes', 'averageRating'], dtype='object')


In [25]:
movie_character_oscar_rating_df.notna().sum()

freebase_movie_id                  444700
title                              444700
release_date                       434854
box_office_revenue                  99727
runtime                            395191
languages                          444700
countries                          444700
genres                             444700
parsed_movie_name                  445017
Freebase actor ID                  444700
Actor gender                       400434
Actor height (in meters)           152680
Actor ethnicity (Freebase ID)      104228
Actor name                         444700
Actor age at movie release         288956
Freebase character/actor map ID    444700
parsed_actor_name                  445017
year_film                            1827
year_ceremony                        1827
ceremony                             1827
category                             1827
name                                 1827
film                                 1827
winner                            

We have three columns for date - `release_date`, `year_ceremony` and `year_film`. `year_film` and `year_ceremony` are from the oscar dataset and are only available for 1827 rows. Additionally, `year_ceremony` is always exactly 1 more than `year_film`. This allows us to conclude that the `year_film` and `year_ceremony` columns are redundant and can be dropped.

For similar reasons, the `name` and `film` columns from the oscar dataset are also redundant.

The `Freebase character/actor map ID` column is also not useful for us as we only care about actors and not the characters played by them in movies. 

From the pairs of columns (`title`,`parsed_movie_name`) and (`Actor_name`, `parsed_actor_name`), we choose to keep the parsed versions. At first glance, it might appear that the parsed columns have no missing values while the original ones do not, but this is simply because the missing values have been converted to 'nan' strings. This is something to be mindful of. TODO we might even want to drop rows with `nan` actors?



In [28]:
movie_character_oscar_rating_df_dropped = movie_character_oscar_rating_df.drop(columns=['Freebase character/actor map ID', 'name', 'film', 'year_film', 'year_ceremony', 'title', 'Actor name'])
movie_character_oscar_rating_df_dropped.columns

Index(['freebase_movie_id', 'release_date', 'box_office_revenue', 'runtime',
       'languages', 'countries', 'genres', 'parsed_movie_name',
       'Freebase actor ID', 'Actor gender', 'Actor height (in meters)',
       'Actor ethnicity (Freebase ID)', 'Actor age at movie release',
       'parsed_actor_name', 'ceremony', 'category', 'winner',
       'oscar_nominated', 'year', 'numVotes', 'averageRating'],
      dtype='object')

In [38]:
movie_character_oscar_rating_df_dropped.rename(columns={'parsed_movie_name': 'title', 'parsed_actor_name': 'actor_name'}, inplace=True)
movie_character_oscar_rating_df_dropped.columns

Index(['freebase_movie_id', 'release_date', 'box_office_revenue', 'runtime',
       'languages', 'countries', 'genres', 'title', 'Freebase actor ID',
       'Actor gender', 'Actor height (in meters)',
       'Actor ethnicity (Freebase ID)', 'Actor age at movie release',
       'actor_name', 'ceremony', 'category', 'winner', 'oscar_nominated',
       'year', 'numVotes', 'averageRating'],
      dtype='object')

In [39]:
# We also note that the `winner` column has only 1827 values, while the `oscar_nominated` column has 445017. It is safe to say that if a movie is not nominated for an oscar, then it also does not win an oscar. Thus we impute the missing values for `winner` with `False`
movie_character_oscar_rating_df_dropped.loc[movie_character_oscar_rating_df_dropped['winner'].isna(), 'winner'] = False

# This now also allows us to cast the `winner` column to boolean type
movie_character_oscar_rating_df_dropped['winner'] = movie_character_oscar_rating_df_dropped['winner'].astype(bool)
print(movie_character_oscar_rating_df_dropped.dtypes)

freebase_movie_id                 object
release_date                      object
box_office_revenue               float64
runtime                          float64
languages                         object
countries                         object
genres                            object
title                             object
Freebase actor ID                 object
Actor gender                      object
Actor height (in meters)         float64
Actor ethnicity (Freebase ID)     object
Actor age at movie release       float64
actor_name                        object
ceremony                         float64
category                          object
winner                              bool
oscar_nominated                     bool
year                             float64
numVotes                         float64
averageRating                    float64
dtype: object


In [41]:
print('Total number of movies with ratings: ',
      movie_character_oscar_rating_df_dropped[movie_character_oscar_rating_df_dropped['averageRating'].notna()]['title'].unique().shape[0])

#Get oscar nominated movies
oscar_nominated_actors_with_ratings_df = movie_character_oscar_rating_df_dropped[movie_character_oscar_rating_df_dropped['oscar_nominated']]
print('Total number of oscar-nominated movies with ratings: ',
      oscar_nominated_actors_with_ratings_df[oscar_nominated_actors_with_ratings_df['averageRating'].notna()]['title'].unique().shape[0])

Total number of movies with ratings:  32935
Total number of oscar-nominated movies with ratings:  969


In [42]:
# Assert that adding ratings did not add any rows
assert len(movie_character_oscar_df) == len(movie_character_oscar_rating_df_dropped)

## Output data

In [43]:
movie_character_oscar_rating_df_dropped[:1]

Unnamed: 0,freebase_movie_id,release_date,box_office_revenue,runtime,languages,countries,genres,title,Freebase actor ID,Actor gender,...,Actor ethnicity (Freebase ID),Actor age at movie release,actor_name,ceremony,category,winner,oscar_nominated,year,numVotes,averageRating
0,/m/03vyhn,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",ghosts of mars,/m/01vw26l,M,...,/m/0x67,32.0,ice cube,,,False,False,2001.0,58876.0,4.9


In [None]:
# Finish processed data
path = 'cache/data.csv'
os.makedirs('cache', exist_ok=True)
movie_character_oscar_rating_df_dropped.to_csv(path)