This notebook follows the plan:
- Import the modules
- Import the "basic" data (movies and characters datasets from CMU), clean it and save it
- Extraction of the lemmatized version of the plot summaries from the corenlp processed data
- Processing of the summaries according to the gender
- Loading, cleaning of IMDb dataset
- Matching CMU and IMDb datasets

# Import the modules

In [177]:
import pandas as pd
import numpy as np
import pickle
import nltk

In [2]:
# Download useful packages for nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /home/pierre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pierre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pierre/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pierre/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/pierre/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierre/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Import the data

In [178]:
# File and folder names
DATA_FOLDER = 'Data/'
CHARACTER_DATASET = DATA_FOLDER + 'character.metadata.tsv'
MOVIE_DATASET = DATA_FOLDER + 'movie.metadata.tsv'

SUMMARIES_DATASET = DATA_FOLDER + 'plot_summaries.txt'
NLP_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries/'
DEFAULT_COMPRESSION = 'gzip'

In [179]:
# Function to load data
def load_metadata(path, column_names, header=None, low_memory=False):
    return pd.read_table(path, header=header, names=column_names)

In [180]:
# Name columns
columns_character = ['Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_release_date', 'Character_name', 'Actor_date_of_birth', 'Actor_gender', 'Actor_height_meters', 'Actor_ethnicity_Freebase_ID', 'Actor_name', 'Actor_age_at_movie_release', 'Freebase_character_actor_map_ID', 'Freebase_character_ID', 'Freebase_actor_ID']
columns_movie = ['Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name','Movie_release_date','Movie_box_office_revenue', 'Movie_runtime','Movie_languages','Movie_countries','Movie_genres' ]

# Load data with correct column names
characters = load_metadata(CHARACTER_DATASET,column_names=columns_character)
movies = load_metadata(MOVIE_DATASET,column_names=columns_movie)

In [181]:
# Load summaries
with open(SUMMARIES_DATASET,'r', encoding='utf-8') as file:
    summaries = file.readlines()

## First glimpse at the data

First we observe the movies dataframe:

In [182]:
print(len(movies))
movies.head(2)

81741


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


Then we observe the characters dataframe:

In [183]:
print(len(characters))
characters.head(2)

450669


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_meters,Actor_ethnicity_Freebase_ID,Actor_name,Actor_age_at_movie_release,Freebase_character_actor_map_ID,Freebase_character_ID,Freebase_actor_ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4


We also check the summaries:

In [184]:
print('Number of plots:', len(summaries))
summaries[0]

Number of plots: 42306


"23890098\tShlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.\n"

# Cleaning

## Problem of dates

We fix typos and absurd dates

In [185]:
movies.loc[movies.Movie_release_date == '1010-12-02','Movie_release_date'] = '2010-12-02'
characters.loc[characters.Movie_release_date == '1010-12-02','Movie_release_date'] = '2010-12-02'
characters[characters.Actor_date_of_birth == '2050'] = '1971'
characters = characters.drop(characters[characters.Actor_date_of_birth < '1500'].index)
characters = characters.drop(characters[characters.Actor_date_of_birth > '2030'].index)

## Format of movie languages, genres and country

Convert the format of languages, genres, country columns to a simpler format (in terms of utilisation).

In [186]:
def format_multiple(chain,deb,step):
    '''Split the chain of characters at each " encountered, and keep only the element in deb +i*step'''
    res = chain.split('"')[deb::step]
    return res

In [187]:
movies.loc[:,'Movie_genres'] = movies.Movie_genres.apply(format_multiple,deb=3,step=4)
movies.loc[:,'Movie_countries'] = movies.Movie_countries.apply(format_multiple,deb=3,step=4)
movies.loc[:,'Movie_languages'] = movies.Movie_languages.apply(format_multiple,deb=3,step=4)

In [188]:
keys = ['Movie_languages','Movie_countries','Movie_genres']
for key in keys:
    nb = len(movies[movies[key].apply(len) == 0])
    print('{nb} movies without {key} ({percentage:.2f}% of the dataset)'.format(nb=nb,key=key, percentage=nb*100/len(movies)))

13866 movies without Movie_languages (16.96% of the dataset)
8154 movies without Movie_countries (9.98% of the dataset)
2294 movies without Movie_genres (2.81% of the dataset)


## Format for dates

For our study, we only keep the years from the dates.

In [189]:
movies.Movie_release_date = pd.to_datetime(movies.Movie_release_date,format='%Y-%m-%d').dt.year
characters.Movie_release_date = pd.to_datetime(characters.Movie_release_date,format='%Y-%m-%d').dt.year
characters.Actor_date_of_birth = pd.to_datetime(characters.Actor_date_of_birth,format='%Y-%m-%d',utc=True,errors='coerce').dt.year

## Saving the new dataset

We pickle our data in order to reuse directly the cleaned data (and load it faster).

In [190]:
DESTINATION = './Data/'
EXT = '.pkl'
to_pickle_data = [characters,movies]
to_pickle_name = ['characters','movies']
for i in range(len(to_pickle_data)):
    to_pickle_data[i].to_pickle(DESTINATION+to_pickle_name[i]+EXT)

# # To unpickle:
# characters = pd.read_pickle("./Data/characters.pkl") 
# movies = pd.read_pickle("./Data/movies.pkl")

# Lemmatizing the summaries

We lemmatize data (for examples *'is'* becomes *'be'*) to be able to count words better. To do so, we used the `corenlp_plot_summaries` files, and exctracted from it the lemmatized versions of the movies summaries.

In [191]:
# Set to True to save the data
LEMMATIZE_SUMMARIES = False # Takes ~7 mins to run (on i7-10875H CPU)

if LEMMATIZE_SUMMARIES:
    # Imports
    from time import time
    import os
    import gzip
    import re

    # Count the number of files in the directory
    nb_files = 0
    for filename in os.listdir(NLP_FOLDER):
        path = os.path.join(NLP_FOLDER, filename)
        nb_files += 1
    print('Number of summaries:',nb_files)

    ext = '.xml.gz' # Extension name
    dico_processed_summmaries = {} # Dictionary to store the processed summaries
    regex = r'<lemma>.*?</lemma>' # Expression to detect in the corenlp data <lemma>(word)</lemma>

    deb = time() # Start timer
    count = 0 # Counter

    # Iteration over the files
    for filename in os.listdir(NLP_FOLDER):
        path = os.path.join(NLP_FOLDER, filename) # Path to the file
        id_summary = path[len(NLP_FOLDER):-len(ext)] # id of the summary = filename without extension
        summary = '' # String to store the summary

        if os.path.isfile(path): # Checking if it is a file
            with gzip.open(path, 'rb') as f: # Opening the .gz file
                for line in f:
                    txt = line.decode().strip() # Extract the line as txt
                    for elt in re.finditer(regex,txt): # Find all the elements like regex
                        summary += re.split('[><]',elt.group(0))[2].lower() + ' ' # Adding only the lemmatized word
        
        # Set the summary in the dictionary and increment the counter
        dico_processed_summmaries[id_summary] = summary
        count += 1

        # Evolution of the process
        if count%1000 == 0:
            print('{processed}/{tot} files processed --> {perc:.1f}% ({t:.1f} seconds since deb)'.format(processed=count,tot=nb_files,perc=count/nb_files*100,t=time()-deb))
    
    # Pickle the file
    with open(DATA_FOLDER + 'nlp_summaries.pkl', 'wb') as file:
        pickle.dump(dico_processed_summmaries, file, protocol=pickle.HIGHEST_PROTOCOL)

Let us try to extract the data:

In [192]:
# Read the pickle file
nlp_summaries = pd.read_pickle(DATA_FOLDER+'nlp_summaries.pkl')

# Observe the first lemmatized summary
for key,value in nlp_summaries.items():
    print('Key:',key)
    print('Summary:\n',value[:200]+'...')
    break

Key: 10000053
Summary:
 Fur trapper Jean La B te paddle he canoe through wild water towards the settlement in order to sell a load of fur . at the settlement a steamboat be landing and the trader and he foster-child Eve , ar...


# Separating sentences between sexes

The aim of this part is to separate sentences between sexes to do a sentimental analysis later. To do so, we check if a feminine actor or the *'she'* pronoun is present in a sentence and add them to a new file. We do the same for a male actor and the *'he'* pronoun. Note that for example the sentence *'She hates him'* will become *'she hate he'* once lemmatized, which will be put in the feminine and maculine files

This approach is not perfect, since for example in the sentences 'She likes butter. Indeed, the actress loves food.', only the first one will be added. It is not perfect, but the best solution we could think of.

In [193]:
# Create a dataframe with the characters
characters_per_film = characters.copy()
# Put the column in their correct type and lower chars
characters_per_film['Wikipedia_movie_ID'] = characters_per_film['Wikipedia_movie_ID'].astype(int)
characters_per_film['Character_name'] = characters_per_film['Character_name'].astype(str).apply(lambda x: x.lower())
# Sort the dataframe by movie ID
characters_per_film = characters_per_film.sort_values(by=['Wikipedia_movie_ID'])
# Drio rows where the character name or the gender is empty
characters_per_film = characters_per_film.dropna(subset=['Character_name', 'Actor_gender'])
# Group the dataframe by movie ID
characters_per_film = characters_per_film.groupby('Wikipedia_movie_ID')[['Wikipedia_movie_ID', 'Character_name', 'Actor_gender']]

In [194]:
# Import dataframe from lemmatized summaries
df = pd.DataFrame(list(nlp_summaries.items()), columns = ['id','plot_lemmatized'])
# Put column in their correct type
df['id'] = df['id'].astype(int)
# Sort the dataframe by movie ID
df = df.sort_values(by=['id'])
# Show the first 5 rows
df.head()

Unnamed: 0,id,plot_lemmatized
27884,330,in order to prepare the role of a important ol...
26866,3217,"after be pull through a time portal , Ash Will..."
28281,3333,the film follow two juxtapose family : the Nor...
31566,3746,-lcb- -lcb- Hatnote -rcb- -rcb- in Los Angeles...
31793,3837,"in the American Old West of 1874 , constructio..."


In [195]:
# Set to True to save the data
SEPARATE_SENTENCES = False # Takes ~20 mins to run (on i7-10875H CPU)

if SEPARATE_SENTENCES:
    # Imports
    count = 0
    dico_male = {}
    dico_female = {}
    regexp = nltk.tokenize.RegexpTokenizer('\w+')

    # Loop on subgroups
    for _, group in characters_per_film:
        # Get the movie id
        movie_id = group['Wikipedia_movie_ID'].iloc[0]
        female_sentences = []
        male_sentences = []

        # Check if wikipedia movie id is in the nlp summaries
        if movie_id in df['id'].values:
            index = df[df['id'] == movie_id].index[0] # Take the correct index
            plot = df['plot_lemmatized'][index] # Take the correct plot
            sentences = plot.split('.') # Split into sentences
            # Loop on sentences
            for sentence in sentences:
                tokens = regexp.tokenize(sentence)
                # Loop on characters
                for character in group['Character_name']:
                    # Find the sex of the character
                    gender = group[group['Character_name'] == character].Actor_gender.values[0]
                    # Find potential pronouns discriminative on gender
                    he_index = any('he' in sublist for sublist in tokens)
                    she_index = any('she' in sublist for sublist in tokens)
                    # Check if the pronoun or actor name is in the sentence
                    if ((character in sentence) or she_index or he_index):
                        # Store in dictionary depending on gender of sentence (can also be in both)
                        if ((gender == '1') or she_index):
                            female_sentences.append(sentence)
                        if ((gender == '0') or he_index):
                            male_sentences.append(sentence)

        # Store in dictionary and increment counter
        dico_male[movie_id] = male_sentences
        dico_female[movie_id] = female_sentences
        count += 1

        # Evolution of the process
        if count%1000 == 0:
            print('{processed} files processed'.format(processed=count))

    # Pickle the file
    with open(DATA_FOLDER + 'male_sentences.pkl', 'wb') as file:
        pickle.dump(dico_male, file, protocol=pickle.HIGHEST_PROTOCOL)    
    with open(DATA_FOLDER + 'female_sentences.pkl', 'wb') as file:
        pickle.dump(dico_female, file, protocol=pickle.HIGHEST_PROTOCOL)    

## Analyse sentiments for each group

We run it in the handling of data since it takes a long time to calculate

In [26]:
# Import male sentences
male_sentences_dict = pd.read_pickle(DATA_FOLDER + 'male_sentences.pkl')
# Form a dataframe
male_sentences = pd.DataFrame(list(male_sentences_dict.items()), columns = ['id','sentences'])
# Create a new column that reconstructs the summary from the lemmatized sentences
male_sentences['summary'] = male_sentences['sentences'].apply(lambda x: ' '.join(x))

# Import female sentences
female_sentences_dict = pd.read_pickle(DATA_FOLDER + 'female_sentences.pkl')
# Form a dataframe
female_sentences = pd.DataFrame(list(female_sentences_dict.items()), columns = ['id','sentences'])
# Create a new column that reconstructs the summary from the lemmatized sentences
female_sentences['summary'] = female_sentences['sentences'].apply(lambda x: ' '.join(x))

# Show the first 5 rows of male sentences
male_sentences.head()

Unnamed: 0,id,sentences,summary
0,330,[in order to prepare the role of a important o...,in order to prepare the role of a important ol...
1,1971,[],
2,3217,"[after be pull through a time portal , Ash Wil...","after be pull through a time portal , Ash Will..."
3,3333,[the film follow two juxtapose family : the No...,the film follow two juxtapose family : the Nor...
4,3746,[-lcb- -lcb- Hatnote -rcb- -rcb- in Los Angele...,-lcb- -lcb- Hatnote -rcb- -rcb- in Los Angeles...


In [196]:
SAVE_SENTIMENTS = False # Takes ~41 mins to run (on i7-10875H CPU)

if SAVE_SENTIMENTS:
    # Use nltk Vader to get the sentiment of the sentences
    analyzer =  nltk.sentiment.SentimentIntensityAnalyzer()

    # Apply sentiments to plots
    male_sentences['polarity'] = male_sentences['summary'].apply(lambda x: analyzer.polarity_scores(x))
    female_sentences['polarity'] = female_sentences['summary'].apply(lambda x: analyzer.polarity_scores(x))

    # Pickle the file
    with open(DATA_FOLDER + 'male_sentiments.pkl', 'wb') as file:
        pickle.dump(male_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)    
    with open(DATA_FOLDER + 'female_sentiments.pkl', 'wb') as file:
        pickle.dump(female_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)    

# Enriching the CMU dataset with IMDb dataset and movie-stats

## Loading the data and first glimpse

In [197]:
#Load the most useful datasets from IMDb
TITLE_BASICS_DATASET = DATA_FOLDER + 'title.basics.tsv.gz'
TITLE_RATINGS_DATASET = DATA_FOLDER + 'title.ratings.tsv.gz'
#Load movie-stats, a dataset generated from IMDb movies
MOVIE_STATS = DATA_FOLDER + 'movie-stats.csv'

columns_title_basics = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
columns_ratings = ['tconstIdentifier', 'averageRating', 'numVotes']

DESTINATION = './Data/'
MATCHING_TABLE = DESTINATION + 'matching_table.pkl'

CLEAN_DATA = False # True to clean again the data, False to use the already pickled data
MATCH_DATA = False # True to match on film names, False to use the matching table already computed

In [198]:
if CLEAN_DATA:
    #Load title_basics
    title_basics = load_metadata(TITLE_BASICS_DATASET, column_names=columns_title_basics)
    print("length of title_basics: ", len(title_basics))
    title_basics.head()

In [199]:
if CLEAN_DATA:
    #Load title_ratings
    ratings = load_metadata(TITLE_RATINGS_DATASET, column_names=columns_ratings)
    print("length of ratings: ", len(ratings))
    ratings.head(6)

In [200]:
if CLEAN_DATA:
    #Load movie-stats
    movie_stats = pd.read_csv(MOVIE_STATS, header = 8)
    print("length of movie_stats: ", len(movie_stats))
    movie_stats.head()

## Cleaning the datasets

In [201]:
if CLEAN_DATA:
    #Create a new table with only titleType=movies (get rid of videos, tvshows, tvepisodes and short)
    title_basics_movies = title_basics[title_basics["titleType"] == "movie"]
    #Remove the endYear column since movies are not concerned by thats
    title_basics_movies_cleaned = title_basics_movies.drop(columns='endYear')
    title_basics_movies_cleaned.replace('\\N',np.NaN,inplace=True) # replace \\N by NaN
    # datetime format for dates
    title_basics_movies_cleaned.startYear = pd.to_datetime(title_basics_movies_cleaned.startYear,format='%Y').dt.year 
    title_basics_movies_cleaned.head()

In [202]:
if CLEAN_DATA:
    #Drop the first row which represents the titles of the columns
    #Careful: execute only once, otherwise it will delete the first row each time!
    ratings_cleaned = ratings.drop(index=ratings.index[0], axis=0) 
    print("length of ratings_cleaned: ", len(ratings_cleaned))
    ratings_cleaned.replace('\\N',np.NaN,inplace=True) # replace \\N by NaN
    #Check if there are NaN values in the dataset
    print('Number of NaN in the ratings dataset: \n',ratings_cleaned.isnull().sum())
    ratings_cleaned.head()

In [203]:
if CLEAN_DATA:
    #Check if there are NaN values in the dataset
    print('Number of NaN in the movie-stats dataset: \n', movie_stats.isnull().sum())
    #Remove useless columns
    movie_stats_cleaned = movie_stats.drop(columns=['rating', 'released'])
    #Remove rows where budget is NaN because we use movie-stats dataset to get information on budget
    movie_stats_cleaned.dropna(subset=['budget'], inplace=True)
    movie_stats_cleaned.head()
    print("length of movie_stats_cleaned: ", len(movie_stats_cleaned))
    print('Number of NaN in the cleaned movie-stats dataset: \n', movie_stats_cleaned.isnull().sum())

## Saving the cleaned dataset

In [204]:
DESTINATION = './Data/'
EXT = '.pkl'

In [205]:
if CLEAN_DATA:
    #Pickle the data
    to_pickle_data = title_basics_movies_cleaned
    to_pickle_name = 'IMDb_title_movies'
    to_pickle_data.to_pickle(DESTINATION+to_pickle_name+EXT)

if not CLEAN_DATA: # for testing part
    # load already pickled data
    title_basics_movies_cleaned = pd.read_pickle("./Data/IMDb_title_movies.pkl")
    title_basics_movies_cleaned.startYear = pd.to_datetime(title_basics_movies_cleaned.startYear,format='%Y').dt.year

In [206]:
if CLEAN_DATA:
    #Pickle the data
    to_pickle_data = ratings_cleaned
    to_pickle_name = 'IMDb_ratings'
    to_pickle_data.to_pickle(DESTINATION+to_pickle_name+EXT)

if not CLEAN_DATA: # for testing part
    # load already pickled data
    ratings_cleaned = pd.read_pickle("./Data/IMDb_ratings.pkl")

In [207]:
if CLEAN_DATA:
    #Pickle the data
    to_pickle_data = movie_stats_cleaned
    to_pickle_name = 'movie-stats_budget'
    to_pickle_data.to_pickle(DESTINATION+to_pickle_name+EXT)

if not CLEAN_DATA: # for testing part
    # load already pickled data
    movie_stats_cleaned = pd.read_pickle("./Data/movie-stats_budget.pkl")

## Matching IMDb and CMU films

In [208]:
movie_stats_cleaned.head()

Unnamed: 0,name,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime\
0,The Shining,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0\
1,The Blue Lagoon,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0\
2,Star Wars: Episode V - The Empire Strikes Back,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0\
3,Airplane!,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0\
4,Caddyshack,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0\


In [142]:
ratings_cleaned.head()

Unnamed: 0,tconstIdentifier,averageRating,numVotes
1,tt0000001,5.7,1922
2,tt0000002,5.8,259
3,tt0000003,6.5,1734
4,tt0000004,5.6,174
5,tt0000005,6.2,2545


We match the movies from one dataset to the films on the other dataset on the movie name, as the ids are different.

In order to avoid mismatched pairs due to a little variation in the titles, we matched films of the same year, with almost identical titles (via Jaccard distance). We create a dictionnary that matches the index of matched films.

In [209]:
copy_IMDb = title_basics_movies_cleaned.copy()
copy_IMDb = copy_IMDb[copy_IMDb.startYear >= 1910]
copy_CMU = movies.copy()
copy_CMU.dropna(subset=['Movie_box_office_revenue', 'Movie_release_date'], inplace=True)
copy_IMDb.dropna(subset= ['startYear'], inplace=True)
print(len(copy_CMU)) # 8328
print(len(copy_IMDb)) # 541993

8328
541993


In [271]:
import re
common_words = {'a','an','and','the','of','at','in'}
punctuation = {'.',',','!',';','?',''}
def compare(df1,df2,col1_title,col2_title,col1_year,col2_year,threshold = 0.8, delta_year=1):
        matched = {}
        for idx1,row1 in df1.iterrows():
            title1 = set(re.split('[ :,]',row1[col1_title].lower()))
            title1 = title1.difference(punctuation)
            y1 = row1[col1_year]
            print(title1,y1)
            #for idx2,row2 in df2[df2[col2_year].isin([y1-delta_year+i for i in range(delta_year*2)])].iterrows():
            for idx2,row2 in df2[df2[col2_year]==y1].iterrows():
                title2 = set(re.split('[ :,]',row2[col2_title].lower()))
                title2 = title2.difference(punctuation)
                if len(title1 & title2)/(len(title1 | title2)) > threshold:
                    print(title2)
                    print('ok')
                    try:
                        matched[idx1].append(idx2)
                    except KeyError:
                        matched[idx1] = [idx2]
        return matched

In [211]:
if MATCH_DATA:
    from time import time
    deb = time()
    matched = compare(copy_CMU,copy_IMDb, 'Movie_name', 'primaryTitle', 'Movie_release_date', 'startYear')
    end = time()
    print('Time of execution:', end-deb) # 2360s
    matched

In [212]:
if MATCH_DATA:
    # save the matching table
    with open(MATCHING_TABLE, 'wb') as file:
        pickle.dump(matched, file, protocol=pickle.HIGHEST_PROTOCOL)

else:
    matched = pd.read_pickle(MATCHING_TABLE)

In [213]:
doublons = {}
for match in matched:
    if len(matched[match]) > 1:
        doublons[match] = matched[match]
print('{nb} duplicates ({per:.2f}% of all matchings)'.format(nb=len(doublons), per=len(doublons)/len(matched)*100))

86 duplicates (1.16% of all matchings)


In [214]:
for cmu,imdbs in doublons.items():
    print(copy_CMU.loc[cmu,'Movie_name'] + '  VS  ' + copy_IMDb.loc[imdbs,'primaryTitle'])

3391684    The Way  VS  The Way
4535866    The Way  VS  The Way
Name: primaryTitle, dtype: object
58112    Harlow  VS  Harlow
58113    Harlow  VS  Harlow
Name: primaryTitle, dtype: object
88939     Down and Out in Beverly Hills  VS  Down and Ou...
176644    Down and Out in Beverly Hills  VS  In and Out ...
Name: primaryTitle, dtype: object
368686     The Dying Gaul  VS  The Dying Gaul
4613260    The Dying Gaul  VS  The Dying Gaul
Name: primaryTitle, dtype: object
88104      Sweet Dreams  VS  Sweet Dreams
7993798    Sweet Dreams  VS  Sweet Dreams
Name: primaryTitle, dtype: object
97951    My Blue Heaven  VS  My Blue Heaven
97952    My Blue Heaven  VS  My Blue Heaven
Name: primaryTitle, dtype: object
86067      Runaway  VS  Runaway
7811004    Runaway  VS  Runaway
Name: primaryTitle, dtype: object
3772516    Super  VS  Super
4745142    Super  VS  Super
Name: primaryTitle, dtype: object
3264408    White Night  VS  White Night
3886728    White Night  VS  White Night
Name: primaryTitle, dtyp

Many duplicated matched are juste films with the same name (and same year), so probably duplicated films in the database.

Some are similar titles but the order of words is changed (e.g "Black and White" corresponding to "Black and White" and "White and Black").

## Merging the datasets

As only $1.16\%$ of the matchings are duplicated, we will simply drop them.

In [215]:
for cmu,imdbs in matched.items():
    if cmu not in doublons:
        copy_CMU.loc[cmu,'IMDb_index'] = imdbs[0]
copy_IMDb['IMDb_index'] = copy_IMDb.index
copy_CMU.dropna(subset=['IMDb_index'],inplace=True)
copy_CMU['IMDb_index'] = copy_CMU['IMDb_index'].astype('int64')
merge_df = pd.merge(copy_CMU, copy_IMDb, on = 'IMDb_index', how = "inner")
print(len(merge_df))
merge_df.head()

7356


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,IMDb_index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",218707,tt0228333,movie,Ghosts of Mars,Ghosts of Mars,0,2001.0,98,"Action,Horror,Sci-Fi"
1,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938.0,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",29325,tt0029852,movie,Alexander's Ragtime Band,Alexander's Ragtime Band,0,1938.0,106,"Drama,Music,Musical"
2,171005,/m/016ywb,Henry V,1989.0,10161099.0,137.0,[English Language],[United Kingdom],"[Costume drama, War film, Epic, Period piece, ...",95308,tt0097499,movie,Henry V,Henry V,0,1989.0,137,"Biography,Drama,History"
3,77856,/m/0kcn7,Mary Poppins,1964.0,102272727.0,139.0,[English Language],[United States of America],"[Children's/Family, Musical, Fantasy, Comedy, ...",57208,tt0058331,movie,Mary Poppins,Mary Poppins,0,1964.0,139,"Comedy,Family,Fantasy"
4,156558,/m/014k4y,Baby Boy,2001.0,29381649.0,123.0,[English Language],[United States of America],"[Crime Fiction, Drama, Coming of age]",244954,tt0255819,movie,Baby Boy,Baby Boy,0,2001.0,130,"Crime,Drama,Romance"


In [216]:
merge_df.Movie_runtime = merge_df.Movie_runtime.apply(lambda x:int(x) if pd.notna(x) else -1)
len(merge_df[merge_df.runtimeMinutes == merge_df.Movie_runtime])
# All the matched films have the same runtime

0

Now, we add the ratings to the corresponding films: we lose 5 films which did not have ratings, which is negligeable.

In [217]:
merge_df_ratings = pd.merge(merge_df, ratings_cleaned, left_on = 'tconst', right_on ='tconstIdentifier', how = "inner")
print(len(merge_df_ratings))
merge_df_ratings.head()

7351


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,IMDb_index,...,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,tconstIdentifier,averageRating,numVotes
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",218707,...,movie,Ghosts of Mars,Ghosts of Mars,0,2001.0,98,"Action,Horror,Sci-Fi",tt0228333,4.9,55237
1,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938.0,3600000.0,106,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",29325,...,movie,Alexander's Ragtime Band,Alexander's Ragtime Band,0,1938.0,106,"Drama,Music,Musical",tt0029852,6.9,2159
2,171005,/m/016ywb,Henry V,1989.0,10161099.0,137,[English Language],[United Kingdom],"[Costume drama, War film, Epic, Period piece, ...",95308,...,movie,Henry V,Henry V,0,1989.0,137,"Biography,Drama,History",tt0097499,7.5,30168
3,77856,/m/0kcn7,Mary Poppins,1964.0,102272727.0,139,[English Language],[United States of America],"[Children's/Family, Musical, Fantasy, Comedy, ...",57208,...,movie,Mary Poppins,Mary Poppins,0,1964.0,139,"Comedy,Family,Fantasy",tt0058331,7.8,173216
4,156558,/m/014k4y,Baby Boy,2001.0,29381649.0,123,[English Language],[United States of America],"[Crime Fiction, Drama, Coming of age]",244954,...,movie,Baby Boy,Baby Boy,0,2001.0,130,"Crime,Drama,Romance",tt0255819,6.4,14988


In [269]:
# Select the columns
merge_df_ratings.drop(['titleType', 'startYear','originalTitle'], axis=1, inplace=True)

# Save the dataset
with open(DESTINATION + 'merge_CMU_IMDb.pkl', 'wb') as file:
        pickle.dump(merge_df_ratings, file, protocol=pickle.HIGHEST_PROTOCOL)

KeyError: "['titleType', 'startYear', 'originalTitle'] not found in axis"

## Merge this dataframe to the movie_stats one

In [272]:
 if MERGE_AGAIN:
    matched = compare(merge_df_ratings,movie_stats_cleaned, 'Movie_name', 'name', 'Movie_release_date', 'year')
    # save the matching table
    with open(DESTINATION+'matching_table_bis.pkl', 'wb') as file:
        pickle.dump(matched, file, protocol=pickle.HIGHEST_PROTOCOL)

else:
    matched = pd.read_pickle(DESTINATION+'matching_table_bis.pkl')

{'of', 'mars', 'ghosts'} 2001
{'of', 'mars', 'ghosts'}
ok
{"alexander's", 'band', 'ragtime'} 1938
{'v', 'henry'} 1989
{'v', 'henry'}
ok
{'poppins', 'mary'} 1964
{'boy', 'baby'} 2001
{'boy', 'baby'}
ok
{'what', 'knew', 'wanted', 'they'} 1940
{'be', 'the', 'must', 'crazy', 'gods'} 1980
{'be', 'the', 'must', 'crazy', 'gods'}
ok
{'rudo', 'cursi', 'y'} 2008
{'kinjite', 'subjects', 'forbidden'} 1989
{'kinjite', 'subjects', 'forbidden'}
ok
{'loverboy'} 1989
{'the', 'wagon', 'covered'} 1923
{'the', 'hut', 'little'} 1957
{'boston', 'the', 'strangler'} 1968
{'die', 'dae-ro', "can't", 'lee'} 2005
{'naked', 'the', 'kitchen'} 2009
{'star!'} 1968
{'great', 'the', 'santini'} 1979
{'beachhead'} 1954
{'to', 'when', 'do', 'in', 'denver', 'things', "you're", 'dead'} 1995
{'to', 'when', 'do', 'in', 'denver', 'things', "you're", 'dead'}
ok
{'convoy'} 1978
{'c.h.u.d.'} 1984
{'c.h.u.d.'}
ok
{'cat', 'black', 'white'} 1998
{'the', 'astronaut', 'farmer'} 2006
{'the', 'astronaut', 'farmer'}
ok
{'dogs', 'straw'} 

{'omen', 'the'} 1976
{'horizons', 'the', 'far'} 1955
{'giant'} 1956
{'great', 'lover', 'the'} 1949
{'before', 'mast', 'the', 'years', 'two'} 1946
{'terri'} 2011
{'30', 'going', 'on', '13'} 2004
{'30', 'going', 'on', '13'}
ok
{'the', 'perfect', 'game'} 2009
{'algiers'} 1938
{'to', 'goes', 'ernest', 'camp'} 1987
{'to', 'goes', 'ernest', 'camp'}
ok
{'your', 'only', 'for', 'eyes'} 1981
{'your', 'only', 'for', 'eyes'}
ok
{'county', 'line', 'macon'} 1974
{'birth', 'of', 'nation', 'a', 'the'} 1915
{'six', 'days', 'seven', 'nights'} 1998
{'six', 'days', 'seven', 'nights'}
ok
{'dreamz', 'american'} 2006
{'king', 'of', 'kings'} 1961
{'colombiana'} 2011
{'colombiana'}
ok
{'of', 'the', 'year', 'comet'} 1992
{'connection', 'french', 'ii'} 1975
{'next', 'friday', 'after'} 2002
{'next', 'friday', 'after'}
ok
{'prejudice', 'extreme'} 1987
{'prejudice', 'extreme'}
ok
{'parts', 'speaking'} 1989
{'sky', 'high'} 2005
{'sky', 'high'}
ok
{'of', 'dreams', 'winter', 'our'} 1981
{"henry's", 'crime'} 2010
{"hen

KeyboardInterrupt: 

In [264]:
print(len(matched))
print(len(movie_stats_cleaned))
print(len(merge_df_ratings))

3514
5497
7351
