In [1]:
import pandas as pd
import os
import numpy as np
import re

# How datasets are joined

### Movie dataset on Character dataset
The two datasets are joined on freebase_movie_id, which both of them contain.

### Character dataset on Oscar dataset
The Oscar dataset has neither freebase_movie_id nor freebase_actor_id, so we join on parsed_actor_name and movie_identifier instead. parsed_actor_name is unique within each movie because we drop actors who share a parsed_actor_name with another actor in the same movie. movie_identifier is the combination of parsed_movie_name and release_year; it is unique because we drop movies that share a movie_identifier.

# Preparing Movie dataset

In [7]:
# Load the movie metadata TSV. Column names are supplied explicitly through
# `names=`, so the first line of the file is read as data.
movie_columns = [
    'wiki_movie_id', 'freebase_movie_id', 'title', 'release_date',
    'box_office_revenue', 'runtime', 'languages', 'countries', 'genres',
]
movie_df = pd.read_csv(
    'data/movie.metadata.tsv',
    sep='\t',
    names=movie_columns,
    index_col='freebase_movie_id',
).reset_index()  # index + reset moves freebase_movie_id to the first column

In [8]:
def parse_string(s):
    """Normalise a title or name into a lowercase ASCII key used for joining.

    The value is lower-cased, stripped of surrounding whitespace, and every
    character other than a-z, 0-9, space and hyphen is removed.

    Parameters
    ----------
    s : object
        Raw value. Anything that is not a ``str`` (e.g. NaN or None from a
        missing cell) is treated as missing.

    Returns
    -------
    str or None
        The normalised string, or None when the input is not a string or
        nothing is left after normalisation.
    """
    # Explicit type guard instead of a bare `except:`, which would also hide
    # unrelated errors (and KeyboardInterrupt) behind a silent None.
    if not isinstance(s, str):
        return None
    s = s.lower().strip()
    # NOTE(review): stripping happens before punctuation is removed, so an input
    # such as "abc !" keeps a trailing space — confirm this is intended.
    s = re.sub('[^a-zA-Z0-9 -]', '', s)
    if len(s) == 0:
        return None
    return s

In [9]:
def _release_year(raw_date):
    """Return the year of one raw release-date value; unparseable values are coerced to NaT first."""
    return pd.to_datetime(raw_date, format='mixed', errors='coerce').year


# Normalised title (None when nothing usable remains) and release year per movie.
movie_df['parsed_movie_name'] = movie_df['title'].apply(parse_string)
movie_df['release_year'] = movie_df['release_date'].apply(_release_year)

In [10]:
# Missing years become the sentinel 0 so the column can be stored as integers.
release_year_filled = movie_df['release_year'].fillna(0)
movie_df['release_year'] = release_year_filled.astype(int)

In [11]:
# Drop movies whose title did not yield a parsed_movie_name.
# .copy() makes the filtered frame independent, so the column assignment below
# does not operate on a slice of the previous frame (avoids SettingWithCopyWarning).
movie_df = movie_df[movie_df['parsed_movie_name'].notna()].copy()

# Create identifier: "<parsed_movie_name>_<release_year>", built column-wise
# rather than with a row-by-row apply.
movie_df['movie_identifier'] = (
    movie_df['parsed_movie_name'] + '_' + movie_df['release_year'].astype(str)
)

# The parsed name is only needed to build the identifier.
movie_df = movie_df.drop(['parsed_movie_name'], axis=1)

In [12]:
def extract_column(s):
    """Return the double-quoted values found in the string form of ``s``.

    A quoted token is returned only when its first character is neither ``/``
    nor ``:`` and the rest consists of word characters and whitespace; quoted
    tokens starting with ``/`` (such as "/m/02h40lc") are therefore skipped.

    Parameters
    ----------
    s : object
        Raw cell value; it is converted with ``str()`` before matching, so a
        missing value simply yields an empty list.

    Returns
    -------
    list of str
        The matched values in order of appearance.
    """
    s = str(s)
    # Raw string: the same pattern as before, without the invalid escape
    # sequences (`\/`, `\w`, `\s` in a normal literal) that raise SyntaxWarning.
    # NOTE(review): values containing other characters (e.g. hyphens) do not
    # match `[\w\s]+` and are dropped — confirm that is acceptable.
    return re.findall(r'"([^/:][\w\s]+)"', s)

  return re.findall('\"([^\/:][\w\s]+)"', s)


In [13]:
# Turn the raw countries / languages / genres cells into lists of values.
for list_column in ('countries', 'languages', 'genres'):
    movie_df[list_column] = movie_df[list_column].apply(extract_column)

# If a movie has no language it is assumed to be in English.
movie_df['languages'] = movie_df['languages'].apply(
    lambda langs: langs if len(langs) else ['English Language']
)

In [14]:
# Count how many movies share each movie_identifier (indexed by the identifier).
identifier_counts = movie_df.groupby('movie_identifier').size().to_frame('count')

shared = identifier_counts[identifier_counts['count'] > 1]
print('Number of movies with duplicate movie identifier:', shared['count'].sum())

# Keep only movies whose identifier occurs exactly once; the inner merge acts
# as the filter and the helper column is removed right after.
unique_identifiers = identifier_counts[identifier_counts['count'] == 1]
movie_df = (
    movie_df
    .merge(unique_identifiers, on='movie_identifier', how='inner')
    .drop('count', axis=1)
)

Number of movies with duplicate movie identifier: 379


# Preparing Character dataset

In [15]:
# Load the character metadata TSV; column names are supplied explicitly.
col_names = [
    "wiki_movie_id",
    "freebase_movie_id",
    "movie_release_date",
    "character_name",
    "actor_date_of_birth",
    "Actor gender",
    "Actor height (in meters)",
    "Actor ethnicity (Freebase ID)",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]
character_df = pd.read_csv(
    'data/character.metadata.tsv',
    sep='\t',
    names=col_names,
    index_col="Freebase character ID",
)

In [16]:
# Remove characters without actors
has_actor = character_df['Freebase actor ID'].notna()
character_df = character_df[has_actor]

# Allow only one character per actor per movie.
# Motivation: when we look at whether the actor won an Oscar we don't care
# which role it was won for.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so
# the kept fields may originate from different rows of a group — confirm.
character_df = (
    character_df
    .groupby(['freebase_movie_id', 'Freebase actor ID'])
    .first()
    .reset_index()
)

character_df['parsed_actor_name'] = character_df['Actor name'].apply(parse_string)

# Count (movie, parsed actor name) pairs to find movies in which two different
# actors end up with the same parsed name.
actor_key = ['freebase_movie_id', 'parsed_actor_name']
name_counts = (
    character_df
    .groupby(actor_key)
    .agg(count=('parsed_actor_name', 'size'))
    .reset_index()
)
movies_with_actors_of_same_name_df = name_counts[name_counts['count'] > 1]
print('Number of movies with actors of the same name:', movies_with_actors_of_same_name_df.shape[0])

# Only a few hundred such cases (see the printed count), so we drop them.
# TODO: Check if any of the dropped actors won Oscars
character_df = character_df.merge(name_counts, on=actor_key, how='inner')
dropped_character_df = character_df[character_df['count'] == 1]

# Sanity check: after dropping, no (movie, parsed actor name) pair repeats.
recount = (
    dropped_character_df
    .groupby(actor_key)
    .agg(count=('parsed_actor_name', 'size'))
    .reset_index()
)
assert recount[recount['count'] > 1].empty

# dropped_character_df keeps the helper `count` column; character_df does not.
character_df = dropped_character_df.drop('count', axis=1)

Number of movies with actors of the same name: 314


In [38]:
# Inspect the character table after actors sharing a parsed name within a movie
# were removed (this frame still carries the helper `count` column).
dropped_character_df

Unnamed: 0,freebase_movie_id,Freebase actor ID,wiki_movie_id,movie_release_date,character_name,actor_date_of_birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,parsed_actor_name,count
0,/m/011_mj,/m/01lc5,142780,1928-01-06,,1889-04-16,M,1.65,/m/06j2v,Charlie Chaplin,,/m/0k4qrd,charlie chaplin,1
1,/m/011_mj,/m/03cn95y,142780,1928-01-06,,1889-11-07,M,1.80,,George Davis,,/m/0gc5r6n,george davis,1
2,/m/011_mj,/m/03d36rf,142780,1928-01-06,,1871-11-19,M,,,John Rand,,/m/0gdn7dk,john rand,1
3,/m/011_mj,/m/043j_fq,142780,1928-01-06,,1908,F,,,Betty Morrissey,20.0,/m/0gcmkh6,betty morrissey,1
4,/m/011_mj,/m/07mmg5,142780,1928-01-06,,1868-02-23,M,,,Henry Bergman,,/m/0k4qrw,henry bergman,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445971,/m/0yzvw,/m/0b2rvr,129542,1989-09-13,Mr. Brown,1926-03-30,M,,/m/02ctzb,Ray McAnally,63.0,/m/02tbc70,ray mcanally,1
445972,/m/0yzvw,/m/0c0n663,129542,1989-09-13,Benny,,M,,,Eanna MacLiam,,/m/0cgnnql,eanna macliam,1
445973,/m/0yzvw,/m/0c71z4l,129542,1989-09-13,Mary,,F,,,Ruth McCabe,,/m/0dgl9f9,ruth mccabe,1
445974,/m/0yzvw,/m/0gc19fb,129542,1989-09-13,Sadie,,,,,Marie Conmee,,/m/0gc8n2t,marie conmee,1


# Merge Movie dataset on character dataset

In [17]:
# Columns carried forward from the movie/character join.
movie_character_columns = [
    'title',
    'release_date',
    'box_office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
    'movie_identifier',
    'Freebase actor ID',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'parsed_actor_name',
]

# One row per (movie, actor): join on the Freebase movie id shared by both frames.
joined_df = movie_df.merge(character_df, on='freebase_movie_id', how='inner')
movie_character_df = joined_df[movie_character_columns]

# Expose the Freebase actor id as `actor_identifier` (appended as the last column).
movie_character_df = (
    movie_character_df
    .assign(actor_identifier=lambda frame: frame['Freebase actor ID'])
    .drop('Freebase actor ID', axis=1)
)

In [18]:
def _movie_actor_key(row):
    """Build "<movie_identifier>_<parsed_actor_name>" for one row."""
    return '_'.join((row['movie_identifier'], row['parsed_actor_name']))


# Key used for joining on the Oscar dataset; it must identify a row uniquely.
movie_character_df['identifier'] = movie_character_df.apply(_movie_actor_key, axis=1)
assert movie_character_df['identifier'].is_unique

# Preparing Oscar dataset

In [19]:
oscar_df = pd.read_csv('data/the_oscar_award.csv')

# Keep only nominations that name both a film and a nominee.
has_film_and_name = oscar_df['film'].notna() & oscar_df['name'].notna()
oscar_df = oscar_df[has_film_and_name]

In [20]:
# Keep only the acting categories (category text contains ACTOR or ACTRESS).
# TODO: explain why we drop certain categories.
category = oscar_df['category']
is_acting_category = category.str.contains('ACTOR') | category.str.contains('ACTRESS')
oscar_df = oscar_df[is_acting_category]

In [21]:
def _oscar_key(row):
    """Build "<parsed_movie_name>_<year_film>_<parsed_actor_name>" for one nomination."""
    return '_'.join((row['parsed_movie_name'], str(row['year_film']), row['parsed_actor_name']))


# Same normalisation as on the movie/character side so the keys line up.
oscar_df['parsed_movie_name'] = oscar_df['film'].apply(parse_string)
oscar_df['parsed_actor_name'] = oscar_df['name'].apply(parse_string)
oscar_df['identifier'] = oscar_df.apply(_oscar_key, axis=1)

In [22]:
# Rows sharing an identifier are taken to be the same actor nominated for the
# same film in several categories; only one row per identifier is kept.
nominations_by_identifier = oscar_df.groupby(['identifier'])
oscar_df = nominations_by_identifier.first().reset_index()
assert oscar_df['identifier'].is_unique

In [23]:
# Keep only the join key and the nomination details.
oscar_columns = ['identifier', 'category', 'winner']
oscar_df = oscar_df.loc[:, oscar_columns]

# Merge movie_character dataset to Oscar
The join key is identifier, which consists of parsed_movie_name, release_year and parsed_actor_name.

In [24]:
# Left join: every movie/actor row is kept; Oscar columns stay empty where no
# nomination matches the identifier.
movie_character_oscar_df = pd.merge(
    movie_character_df,
    oscar_df,
    how='left',
    on='identifier',
)

In [25]:
# A row counts as Oscar-nominated when the left join filled in a category.
category_missing = movie_character_oscar_df['category'].isna()
movie_character_oscar_df['oscar_nominated'] = ~category_missing

## Stats about dataset

In [26]:
def _distinct_count(frame, column):
    """Number of distinct values in ``frame[column]``."""
    return len(frame[column].unique())


# Rows flagged as nominated (the flag is a boolean column).
nominated_df = movie_character_oscar_df[movie_character_oscar_df['oscar_nominated']]

print('Number of different Oscar nominated movies in dataset:',
      _distinct_count(nominated_df, 'movie_identifier'))
print('Number of different movies in dataset:',
      _distinct_count(movie_character_oscar_df, 'movie_identifier'))
print('Number of different Oscar nominated actors in dataset:',
      _distinct_count(nominated_df, 'actor_identifier'))
print('Number of different actors in dataset:',
      _distinct_count(movie_character_oscar_df, 'actor_identifier'))
print('Number of Oscar nominated rows:', len(nominated_df))

Number of different Oscar nominated movies in dataset: 952
Number of different movies in dataset: 63968
Number of different Oscar nominated actors in dataset: 801
Number of different actors in dataset: 134907
Number of Oscar nominated rows: 1443


# Prepare IMDB rating dataset 

In [27]:
# Read the IMDB title and rating tables (tab-separated).
# quoting=3 is passed for the titles file so quote characters are read literally.
titlebasics_df = pd.read_csv(
    'data/title.basics.tsv',
    sep='\t',
    quoting=3,
)
titleratings_df = pd.read_csv(
    'data/title.ratings.tsv',
    sep='\t',
)

In [28]:
# Merge the dataframes containing ratings and title names. `tconst` is the key
# shared by both tables and only needs to be named once.
name_rating_df = titlebasics_df.merge(titleratings_df, how='inner', on='tconst')
# Consider only movies
name_rating_df = name_rating_df[(name_rating_df["titleType"] == 'movie')]
# Get relevant columns. .copy() yields an independent frame, so later column
# assignments do not act on a slice (avoids SettingWithCopyWarning).
name_rating_df = name_rating_df[["primaryTitle", "startYear", "averageRating", "numVotes"]].copy()

In [29]:
# The year must be a float for the later merge to work.
# '\\N' is the marker used for missing values; it becomes NaN so it cannot be
# used as a merge key.
imdb_years = name_rating_df["startYear"].replace('\\N', np.nan)
name_rating_df["startYear"] = imdb_years.astype(float)

# Rename columns for the merge.
merge_names = {'primaryTitle': 'title', 'startYear': 'year', 'titleType': 'type'}
name_rating_df = name_rating_df.rename(columns=merge_names)

In [30]:
# Several entries can share a title within the same year. We ASSUME they are
# the same movie and combine them: votes are summed and the rating becomes the
# vote-weighted mean of the individual ratings.

# rating * votes, so that summing per group yields the weighted numerator
name_rating_df["RatingWeight"] = name_rating_df['averageRating'].mul(name_rating_df['numVotes'])

# Per (title, year): total number of votes and total weighted rating
per_title_year = name_rating_df.groupby(['title', 'year'])
name_rating_agg_df = per_title_year[['numVotes', 'RatingWeight']].sum().reset_index()

# Divide by the total votes to get back to a rating, rounded to one decimal place
weighted_mean = name_rating_agg_df['RatingWeight'] / name_rating_agg_df['numVotes']
name_rating_agg_df['averageRating'] = weighted_mean.round(1)

# The weighted sum was only a temporary helper
name_rating_agg_df = name_rating_agg_df.drop(columns='RatingWeight')

In [31]:
# Release year parsed from the mixed-format release_date column
# (values that cannot be parsed are coerced to missing).
parsed_release_dates = pd.to_datetime(
    movie_character_oscar_df["release_date"],
    format='mixed',
    errors='coerce',
)
dates_as_year = parsed_release_dates.dt.year

# Store the year alongside the other columns
movie_character_oscar_df['year'] = dates_as_year

# Row count before the ratings are joined
rows_before_ratings = len(movie_character_oscar_df)

# Join movie_character_oscar on IMDB dataset

In [32]:
# Create identifier: "<parsed_movie_name>_<release_year>", matching the format
# of movie_identifier on the movie side.
imdb_df = name_rating_agg_df.copy(deep=True)

# Column-wise apply, as for the movie titles, instead of a row-wise DataFrame.apply.
imdb_df['parsed_movie_name'] = imdb_df['title'].apply(parse_string)
imdb_df['release_year'] = imdb_df['year'].fillna(0).astype(int)

# Drop titles that have no parsed name. .copy() keeps the assignment below from
# acting on a slice of the previous frame (avoids SettingWithCopyWarning).
imdb_df = imdb_df[imdb_df['parsed_movie_name'].notna()].copy()
imdb_df['movie_identifier'] = (
    imdb_df['parsed_movie_name'] + '_' + imdb_df['release_year'].astype(str)
)

In [33]:
# Drop ratings of movies that share a movie identifier: count rows per
# identifier and keep only identifiers that occur exactly once.
imdb_identifier_counts = imdb_df.groupby('movie_identifier').size().to_frame('count')
single_use_identifiers = imdb_identifier_counts[imdb_identifier_counts['count'] == 1]

# The inner merge acts as the filter; the helper column is removed afterwards.
imdb_df = (
    imdb_df
    .merge(single_use_identifiers, on='movie_identifier', how='inner')
    .drop('count', axis=1)
)
assert imdb_df['movie_identifier'].is_unique

In [34]:
# Keep only the rating information and the join key.
imdb_columns = ['averageRating', 'numVotes', 'movie_identifier']
imdb_df = imdb_df.loc[:, imdb_columns]

In [35]:
# Left join: rows without an IMDB match are kept and get empty rating columns.
movie_character_oscar_rating_df = pd.merge(
    movie_character_oscar_df,
    imdb_df,
    how='left',
    on='movie_identifier',
)

# A row has a rating when the join filled in a vote count.
votes_missing = movie_character_oscar_rating_df['numVotes'].isna()
movie_character_oscar_rating_df['has_rating'] = ~votes_missing

In [36]:
# Re-expose the IMDB columns under snake_case names (appended as the last
# columns), then remove the originals.
movie_character_oscar_rating_df = (
    movie_character_oscar_rating_df
    .assign(
        average_rating=lambda frame: frame['averageRating'],
        number_of_votes=lambda frame: frame['numVotes'],
    )
    .drop(['numVotes', 'averageRating'], axis=1)
)

In [37]:
## Stats of dataset
# Rows that received a rating (has_rating is a boolean column).
rating_df = movie_character_oscar_rating_df[movie_character_oscar_rating_df['has_rating']]
rated_nominated_df = rating_df[rating_df['oscar_nominated']]

print('Number of movies with ratings:', len(rating_df['movie_identifier'].unique()))
print('Oscar nominated movies with rating:', len(rated_nominated_df['movie_identifier'].unique()))

# Almost all Oscar nominated movies have a rating

Number of movies with ratings: 36760
Oscar nominated movies with rating: 939


# Rename columns

In [37]:
# Rename the actor columns to a more standardized snake_case format.
standardized_names = {
    "Actor gender": "actor_gender",
    "Actor height (in meters)": "actor_height",
    "Actor ethnicity (Freebase ID)": "actor_ethnicity",
    "Actor name": "actor_name",
    "Actor age at movie release": "actor_age",
}
movie_character_oscar_rating_df = movie_character_oscar_rating_df.rename(columns=standardized_names)

# Write data

In [36]:
# Write the processed dataset; the output directory is created if it is missing.
output_dir = 'cache'
path = 'cache/data.csv'
os.makedirs(output_dir, exist_ok=True)
movie_character_oscar_rating_df.to_csv(path)
print('Processing done')

Processing done
