In [None]:
import pandas as pd
import os
import numpy as np

## Merging characters and movies

In [None]:
# Column names for the movie metadata TSV; they are supplied explicitly to read_csv.
movie_columns = [
    'wiki_movie_id',
    'freebase_movie_id',
    'title',
    'release_date',
    'box_office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
]

# Read with the Freebase movie id as index, then turn it straight back into a
# regular column (which places 'freebase_movie_id' first in the column order).
movie_df = pd.read_csv(
    'data/movie.metadata.tsv',
    sep='\t',
    names=movie_columns,
    index_col='freebase_movie_id',
).reset_index()

In [None]:
# Normalised movie title: lower-cased with surrounding whitespace removed.
normalised_titles = movie_df['title'].str.lower().str.strip()
movie_df = movie_df.assign(parsed_movie_name=normalised_titles)

In [None]:
# Column names for the character metadata TSV; they are supplied explicitly to read_csv.
col_names = [
    "wiki_movie_id",
    "freebase_movie_id",
    "movie_release_date",
    "character_name",
    "actor_date_of_birth",
    "Actor gender",
    "Actor height (in meters)",
    "Actor ethnicity (Freebase ID)",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]
character_df = pd.read_csv(
    'data/character.metadata.tsv',
    sep='\t',
    names=col_names,
    index_col="Freebase character ID",
)

# Keep only the characters that have an actor attached
has_actor = character_df['Freebase actor ID'].notna()
character_df = character_df[has_actor]

# An actor can play several characters in the same movie. #TODO: Provide example
# We keep a single row per (movie, actor).
# Motivation: we only care about whether an actor was Oscar nominated, not which character(s) the nomination was for.
# Note: groupby(...).first() takes the first non-null value of every column within each group.
character_df = (
    character_df
    .groupby(['freebase_movie_id', 'Freebase actor ID'])
    .first()
    .reset_index()
)

# Normalised actor name: lower-cased with surrounding whitespace removed
character_df = character_df.assign(
    parsed_actor_name=character_df['Actor name'].str.lower().str.strip()
)

# Count how often each (movie, actor name) pair occurs; a count above 1 means
# different actors with the same name appear in the same movie
identical_actor_names_per_movie = (
    character_df
    .groupby(['freebase_movie_id', 'parsed_actor_name'])
    .size()
    .reset_index(name='count')
)
is_name_double = identical_actor_names_per_movie['count'] > 1
movies_with_actors_of_same_name_df = identical_actor_names_per_movie[is_name_double]
print('Number of movies with actors of the same name:', len(movies_with_actors_of_same_name_df))


In [None]:

# Only about 313 occurrences of name doubles in the dataset, so we drop these cases.
# The effort to handle them would not be worth the benefit.
# TODO: Check if any of the affected actors won Oscars

# Attach the per-(movie, actor name) occurrence count to every character row.
# The result goes into a separate variable so that `character_df` never carries the
# helper column; this keeps the cell safe to re-run (a second merge would otherwise
# produce `count_x`/`count_y` and break the filter below).
character_with_counts_df = character_df.merge(
    identical_actor_names_per_movie,
    on=['freebase_movie_id', 'parsed_actor_name'],
    how='inner',
)

# Keep only the rows whose actor name is unique within its movie, then remove the helper column
dropped_character_df = (
    character_with_counts_df[character_with_counts_df['count'] == 1]
    .drop(columns='count')
)

# Sanity check: every (movie, actor name) pair now identifies exactly one row
assert not dropped_character_df.duplicated(subset=['freebase_movie_id', 'parsed_actor_name']).any()

character_df = dropped_character_df

In [None]:
# Columns carried forward from the combined movie/character table
movie_character_columns = [
    'freebase_movie_id',
    'title',
    'release_date',
    'box_office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
    'parsed_movie_name',
    'Freebase actor ID',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'parsed_actor_name',
]

# Inner join: only movies that have at least one character row (and vice versa) survive
movie_character_df = (
    movie_df
    .merge(character_df, how='inner', on='freebase_movie_id')
    .loc[:, movie_character_columns]
)


In [None]:
# To join with the Oscar data, (movie name, actor name) must uniquely identify one row in movie_character_df
name_pair_keys = ['parsed_movie_name', 'parsed_actor_name']
name_pair_counts = (
    movie_character_df
    .groupby(name_pair_keys)
    .size()
    .reset_index(name='count')
)
ambiguous_pairs = name_pair_counts[name_pair_counts['count'] > 1]
print('Number of combination of actors of the same names starring in movies with the same name:', len(ambiguous_pairs))

# 350 combinations. We decide to drop them.
# TODO: Check if any movie won an Oscar
dropped_movie_character_df = movie_character_df.merge(name_pair_counts, how='inner', on=name_pair_keys)
is_unique_pair = dropped_movie_character_df['count'] == 1
dropped_movie_character_df = dropped_movie_character_df[is_unique_pair]

# Sanity check: no (movie name, actor name) pair occurs more than once any more
remaining_pair_sizes = dropped_movie_character_df.groupby(name_pair_keys).size()
assert remaining_pair_sizes.max() == 1

# The helper column is no longer needed downstream
movie_character_df = dropped_movie_character_df.drop(columns='count')

## Merging with oscar dataset

In [None]:
# Load the Oscar data and discard entries that lack a film title or a nominee name
oscar_df = (
    pd.read_csv('data/the_oscar_award.csv')
    .dropna(subset=['film', 'name'])
)

In [None]:
# Keep only the relevant Oscar categories: those whose name contains 'ACTOR' or 'ACTRESS'
# TODO: explain why we drop certain categories.
oscar_category = oscar_df['category']
is_actor_category = oscar_category.str.contains('ACTOR')
is_actress_category = oscar_category.str.contains('ACTRESS')
oscar_df = oscar_df[is_actor_category | is_actress_category]

**TODO:** The join below still needs to be checked.
- Should the join ignore case?
- Which join type do we want? It should also keep movies that did not receive any Oscar nomination.


In [None]:
# Normalise film and nominee names (lower-case, no surrounding whitespace) so that
# differences in case or extra spaces do not matter
oscar_df = oscar_df.assign(
    parsed_movie_name=oscar_df['film'].str.lower().str.strip(),
    parsed_actor_name=oscar_df['name'].str.lower().str.strip(),
)

In [None]:
oscar_key_columns = ['parsed_movie_name', 'parsed_actor_name']

# An actor can be nominated more than once for the same movie (e.g. Barry Fitzgerald).
# Print the pairs with the most nominations: a bare expression in the middle of a
# cell is evaluated but never displayed, so the result is shown explicitly.
nominations_per_pair = (
    oscar_df
    .groupby(oscar_key_columns)
    .agg(count=('parsed_actor_name', 'count'))
    .sort_values('count', ascending=False)
)
print(nominations_per_pair.head())

# If an actor has multiple nominations for the same movie we keep the first one.
# NOTE(review): groupby(...).first() takes the first non-null value per column, so the
# kept `winner` value comes from the first nomination and may not reflect a later
# winning one - confirm this is acceptable for downstream use.
oscar_df = oscar_df.groupby(oscar_key_columns).first().reset_index()

# For joining, rows in oscar_df must be uniquely identified by parsed_movie_name and parsed_actor_name
assert not oscar_df.duplicated(subset=oscar_key_columns).any()

In [None]:
# Combine the movie/character rows with the Oscar rows on the normalised movie and actor names.
# An outer join keeps unmatched rows from both sides, so Oscar entries without a
# matching movie/character row are retained with missing movie attributes.
# NOTE(review): confirm that keeping unmatched Oscar entries is intended (vs. how='left').
oscar_join_keys = ['parsed_movie_name', 'parsed_actor_name']
movie_character_oscar_df = pd.merge(
    movie_character_df,
    oscar_df,
    on=oscar_join_keys,
    how='outer',
)

In [None]:
# Flag whether the row has an Oscar nomination: a row counts as nominated when its
# 'winner' value is present (not missing)
winner_missing = movie_character_oscar_df['winner'].isna()
movie_character_oscar_df['oscar_nominated'] = ~winner_missing

In [None]:
# Statistics restricted to the Oscar-nominated rows
oscar_nominated_actors_df = movie_character_oscar_df[movie_character_oscar_df['oscar_nominated']]
print('Number of different Oscar nominated films in dataset:',
      oscar_nominated_actors_df['parsed_movie_name'].unique().shape[0])

print('Number of different Oscar nominated actors in dataset:',
      oscar_nominated_actors_df['parsed_actor_name'].unique().shape[0])

# Statistics over the whole merged dataset. These were previously computed on the
# Oscar-nominated subset, which merely repeated the two numbers printed above.
print('Number of characters in dataset:', movie_character_oscar_df.shape[0])
print('Number of movies in dataset:', movie_character_oscar_df['parsed_movie_name'].unique().shape[0])
print('Number of actors in dataset:', movie_character_oscar_df['parsed_actor_name'].unique().shape[0])

## Merging with IMDB ratings

In [None]:
# Read the two IMDB tables; both files are tab-separated
imdb_read_options = {'sep': '\t'}
titlebasics_df = pd.read_csv('data/title.basics.tsv', **imdb_read_options)
titleratings_df = pd.read_csv('data/title.ratings.tsv', **imdb_read_options)

In [None]:
# Merge the dataframes containing title names and ratings on the shared `tconst` key
# (the key is given once; listing it twice is redundant)
name_rating_df = titlebasics_df.merge(titleratings_df, how='inner', on='tconst')

# Consider only movies and keep only the relevant columns.
# The explicit copy makes the result an independent frame, so assigning columns to
# it afterwards does not raise SettingWithCopyWarning.
name_rating_df = name_rating_df.loc[
    name_rating_df["titleType"] == 'movie',
    ["primaryTitle", "startYear", "averageRating", "numVotes"],
].copy()

In [None]:
# Convert the IMDB year to float so that it can be used as a merge key.
# '\\N' is the marker used for missing values; it is replaced with NaN so it can't be used for merging.
start_year_as_float = name_rating_df["startYear"].replace('\\N', np.nan).astype(float)

# Store the converted year and rename the columns to the names used for the merge
name_rating_df = (
    name_rating_df
    .assign(startYear=start_year_as_float)
    .rename(columns={'primaryTitle': 'title', 'startYear': 'year'})
)

In [None]:
# Several entries can share the same title and year.
# ASSUMPTION: such entries are the same movie, so their scores are combined into a
# vote-weighted average: sum(averageRating * numVotes) / sum(numVotes).

# Weight every rating by its number of votes
name_rating_df["RatingWeight"] = name_rating_df['averageRating'].mul(name_rating_df['numVotes'])

# Per (title, year): total number of votes and total weighted rating
vote_totals_df = (
    name_rating_df
    .groupby(['title', 'year'])[['numVotes', 'RatingWeight']]
    .sum()
    .reset_index()
)

# Divide by the total number of votes to get back to a rating, rounded to one decimal place
weighted_average = vote_totals_df['RatingWeight'] / vote_totals_df['numVotes']
vote_totals_df['averageRating'] = weighted_average.round(1)

# The temporary weighted ratings are not part of the result
name_rating_agg_df = vote_totals_df.drop(columns='RatingWeight')

In [None]:
# Parse the release dates (stored in mixed formats); values that cannot be parsed become NaT
release_timestamps = pd.to_datetime(
    movie_character_oscar_df["release_date"],
    format='mixed',
    errors='coerce',
)
dates_as_year = release_timestamps.dt.year

# New frame: the merged dataset plus the extracted release year (the input frame is left untouched)
movie_character_oscar_fixedDate_df = movie_character_oscar_df.assign(year=dates_as_year)

In [None]:
# Add the ratings to everything merged so far. This is a left merge: rows without a
# matching (title, year) rating are kept and simply get no rating, since ratings are
# only a nice-to-have.
rating_join_keys = ['title', 'year']
movie_character_oscar_rating_df = pd.merge(
    movie_character_oscar_fixedDate_df,
    name_rating_agg_df,
    how='left',
    on=rating_join_keys,
)


In [None]:
# Distinct movie names among the rows that received a rating
has_rating = movie_character_oscar_rating_df['averageRating'].notna()
rated_movie_names = movie_character_oscar_rating_df.loc[has_rating, 'parsed_movie_name']
print('Total number of movies with ratings: ', len(rated_movie_names.unique()))

# Same count, restricted to the Oscar-nominated rows
is_nominated = movie_character_oscar_rating_df['oscar_nominated']
oscar_nominated_actors_with_ratings_df = movie_character_oscar_rating_df[is_nominated]
nominated_has_rating = oscar_nominated_actors_with_ratings_df['averageRating'].notna()
rated_nominated_movie_names = oscar_nominated_actors_with_ratings_df.loc[nominated_has_rating, 'parsed_movie_name']
print('Total number of oscar-nominated movies with ratings: ', len(rated_nominated_movie_names.unique()))

In [None]:
# The ratings merge must leave the number of rows unchanged
rows_before_ratings = movie_character_oscar_df.shape[0]
rows_after_ratings = movie_character_oscar_rating_df.shape[0]
assert rows_before_ratings == rows_after_ratings

## Output data

In [None]:
# Preview the first row of the final dataset
movie_character_oscar_rating_df.head(1)

In [None]:
# Write the fully processed dataset to the cache directory (created if it does not exist yet)
cache_dir = 'cache'
os.makedirs(cache_dir, exist_ok=True)

path = 'cache/processed data.csv'
movie_character_oscar_rating_df.to_csv(path_or_buf=path)