# Import the modules

In [38]:
import pandas as pd
import numpy as np
import pickle

# Import the data

In [39]:
DATA_FOLDER = 'Data/'
CHARACTER_DATASET = DATA_FOLDER + 'character.metadata.tsv'
MOVIE_DATASET = DATA_FOLDER + 'Movie.metadata.tsv'
SUMMARIES_DATASET = DATA_FOLDER + 'plot_summaries.txt'
NLP_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries/'
DEFAULT_COMPRESSION = 'gzip'

In [40]:
def load_metadata(path, column_names, header=None, low_memory=False):
    return pd.read_table(path, header=header, names=column_names)

In [41]:
columns_character = ['Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_release_date', 'Character_name', 'Actor_date_of_birth', 'Actor_gender', 'Actor_height_meters', 'Actor_ethnicity_Freebase_ID', 'Actor_name', 'Actor_age_at_movie_release', 'Freebase_character_actor_map_ID', 'Freebase_character_ID', 'Freebase_actor_ID']
columns_movie = ['Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name','Movie_release_date','Movie_box_office_revenue', 'Movie_runtime','Movie_languages','Movie_countries','Movie_genres' ]

characters = load_metadata(CHARACTER_DATASET,column_names=columns_character)
movies = load_metadata(MOVIE_DATASET,column_names=columns_movie)

In [42]:
with open(SUMMARIES_DATASET,'r', encoding='utf-8') as file:
    summaries = file.readlines()

## First glimpse

In [43]:
print(len(movies))
movies.head(2)

81741


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [44]:
print(len(characters))
characters.head(2)

450669


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_meters,Actor_ethnicity_Freebase_ID,Actor_name,Actor_age_at_movie_release,Freebase_character_actor_map_ID,Freebase_character_ID,Freebase_actor_ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4


In [45]:
print(len(summaries))
summaries[0]

42306


"23890098\tShlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.\n"

# Cleaning

## Problem of date

Typo in the release date of the movie, but this movie is not in characters

In [46]:
movies.loc[movies.Movie_release_date == '1010-12-02','Movie_release_date'] = '2010-12-02'
characters.loc[characters.Movie_release_date == '1010-12-02','Movie_release_date'] = '2010-12-02'
characters[characters.Actor_date_of_birth == '2050'] = '1971'
characters = characters.drop(characters[characters.Actor_date_of_birth < '1500'].index)
characters = characters.drop(characters[characters.Actor_date_of_birth > '2030'].index)

## Format of movie languages, genres and country

Convert the format of languages, genres, country columns to a simpler format (in term of utilisation).

In [47]:
def format_multiple(chain,deb,step):
    '''Split the chain of characters at each " encountered, and keep only the element in deb +i*step'''
    res = chain.split('"')[deb::step]
    return res

In [48]:
movies.loc[:,'Movie_genres'] = movies.Movie_genres.apply(format_multiple,deb=3,step=4)
movies.loc[:,'Movie_countries'] = movies.Movie_countries.apply(format_multiple,deb=3,step=4)
movies.loc[:,'Movie_languages'] = movies.Movie_languages.apply(format_multiple,deb=3,step=4)

In [49]:
keys = ['Movie_languages','Movie_countries','Movie_genres']
for key in keys:
    nb = len(movies[movies[key].apply(len) == 0])
    print('{nb} movies without {key} ({percentage:.2f}% of the dataset)'.format(nb=nb,key=key, percentage=nb*100/len(movies)))

13866 movies without Movie_languages (16.96% of the dataset)
8154 movies without Movie_countries (9.98% of the dataset)
2294 movies without Movie_genres (2.81% of the dataset)


## Format for dates

For our study, we only keep the years from the dates.

In [50]:
movies.Movie_release_date = pd.to_datetime(movies.Movie_release_date,format='%Y-%m-%d').dt.year
characters.Movie_release_date = pd.to_datetime(characters.Movie_release_date,format='%Y-%m-%d').dt.year
characters.Actor_date_of_birth = pd.to_datetime(characters.Actor_date_of_birth,format='%Y-%m-%d',utc=True,errors='coerce').dt.year

## Lemmatizing the summaries

We used the `corenlp_plit_summaries` files, and exctract from it the lemmatized versions of the movies summaries.

In [51]:
LEMMATIZE_SUMMARIES = False # take approximately 13min to run
if LEMMATIZE_SUMMARIES:
    from time import time
    import os
    import gzip

    # count the number of files in the directory
    nb_files = 0
    for filename in os.listdir(NLP_FOLDER):
        path = os.path.join(NLP_FOLDER, filename)
        nb_files += 1
    print('Number of summaries:',nb_files)

    ext = '.xml.gz'
    dico_processed_summmaries = {}
    regex = r'<lemma>.*?</lemma>' # expression to detect in the corenlp data <lemma>(word)</lemma>

    deb = time()
    count = 0

    # iteration over the files
    for filename in os.listdir(NLP_FOLDER):
        path = os.path.join(NLP_FOLDER, filename)
        id_summary = path[len(NLP_FOLDER):-len(ext)] # id of the summary = filename without extension
        summary = ''

        if os.path.isfile(path): # checking if it is a file
            with gzip.open(path, 'rb') as f: # opening the .gz file
                for line in f:
                    txt = line.decode().strip() # extracte the line as txt
                    for elt in re.finditer(regex,txt): # find all the elements like regex
                        summary += re.split('[><]',elt.group(0))[2] + ' ' # adding only the lemmatized word
        dico_processed_summmaries[id_summary] = summary
        count += 1
        # evolution of the process
        if count%1000 == 0:
            print('{processed}/{tot} files processed --> {perc:.1f}% ({t:.1f} seconds since deb)'.format(processed=count,tot=nb_files,perc=count/nb_files*100,t=time()-deb))
    
    # Pickle the file
    with open(DATA_FOLDER + 'nlp_summaries.pkl', 'wb') as file:
        pickle.dump(dico_processed_summmaries, file, protocol=pickle.HIGHEST_PROTOCOL)

A little extract of the data

In [52]:
nlp_summaries = pd.read_pickle(DATA_FOLDER+'nlp_summaries.pkl')
for key,value in nlp_summaries.items():
    print('Key:',key)
    print('Summary:\n',value[:200]+'...')
    break

Key: 10000053
Summary:
 Fur trapper Jean La B te paddle he canoe through wild water towards the settlement in order to sell a load of fur . at the settlement a steamboat be landing and the trader and he foster-child Eve , ar...


# Saving the new dataset

We pickle our data in order to reuse directly the cleaned data (and load it faster).

In [53]:
DESTINATION = './Data/'
EXT = '.pkl'
to_pickle_data = [characters,movies]
to_pickle_name = ['characters','movies']
for i in range(len(to_pickle_data)):
    to_pickle_data[i].to_pickle(DESTINATION+to_pickle_name[i]+EXT)

#To unpickle:
# characters = pd.read_pickle("./Data/characters.pkl") 
# movies = pd.read_pickle("./Data/movies.pkl")

# Enriching the CMU dataset with IMDb dataset

## Loading the data and first glimpse

In [54]:
#Load the most useful datasets for the moment
TITLE_BASICS_DATASET = DATA_FOLDER + 'title.basics.tsv.gz'
TITLE_RATINGS_DATASET = DATA_FOLDER + 'title.ratings.tsv.gz'

columns_title_basics = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
columns_ratings = ['tconstIdentifier', 'averageRating', 'numVotes']

In [55]:
#Load title_basics
title_basics = load_metadata(TITLE_BASICS_DATASET, column_names=columns_title_basics)
print("length of title_basics: ", len(title_basics))
title_basics.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


length of title_basics:  9363391


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
2,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
3,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
4,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"


In [56]:
#Load title_ratings
ratings = load_metadata(TITLE_RATINGS_DATASET, column_names=columns_ratings)
print("length of ratings: ", len(ratings))
ratings.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


length of ratings:  1246149


Unnamed: 0,tconstIdentifier,averageRating,numVotes
0,tconst,averageRating,numVotes
1,tt0000001,5.7,1922
2,tt0000002,5.8,259
3,tt0000003,6.5,1734
4,tt0000004,5.6,174


## Cleaning the dataset

In [57]:
#Create a new table with only titleType=movies (get rid of videos, tvshows, tvepisodes and short)
title_basics_movies = title_basics[title_basics["titleType"] == "movie"]
        
print("length of title_basics_movies: ", len(title_basics_movies))
title_basics_movies.head()

length of title_basics_movies:  626772


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
9,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
588,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
611,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama


In [58]:
#Remove the endYear column since movies are not concerned by thats
title_basics_movies_cleaned = title_basics_movies.drop(columns='endYear')
title_basics_movies_cleaned.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
9,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,45,Romance
499,tt0000502,movie,Bohemios,Bohemios,0,1905,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
588,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama
611,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,Drama


In [113]:
title_basics_movies_cleaned.replace('\\N',np.NaN,inplace=True)

In [114]:
title_basics_movies_cleaned.startYear = pd.to_datetime(title_basics_movies_cleaned.startYear,format='%Y-%m-%d').dt.year

In [115]:
title_basics_movies_cleaned.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
9,tt0000009,movie,Miss Jerry,Miss Jerry,0,1970.0,45.0,Romance
499,tt0000502,movie,Bohemios,Bohemios,0,1970.0,100.0,
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1970.0,70.0,"Action,Adventure,Biography"
588,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1970.0,90.0,Drama
611,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1970.0,,Drama


## Saving the cleaned dataset

In [116]:
#Pickle the data
to_pickle_data = title_basics_movies_cleaned
to_pickle_name = 'IMDb_title_movies'
to_pickle_data.to_pickle(DESTINATION+to_pickle_name+EXT)

#To unpickle:
# IMDb_title_movies = pd.read_pickle("./Data/IMDb_title_movies.pkl") 

## Merging IMDb and CMU datasets

In [109]:
copy_IMDb = title_basics_movies_cleaned.copy()
copy_IMDb = copy_IMDb[copy_IMDb.startYear >= 1910]
copy_CMU = movies.copy()
copy_CMU.dropna(subset=['Movie_box_office_revenue', 'Movie_release_date'], inplace=True)
copy_IMDb.dropna(subset= ['startYear'], inplace=True)

In [110]:
print(len(copy_CMU))
print(len(copy_IMDb))

8328
541993


In [111]:
import re
common_words = {'a','an','and','the','of','at','in'}
punctuation = {'.',',','!',';','?',''}
def compare(df1,df2,col1_title,col2_title,col1_year,col2_year,threshold = 0.8, delta_year=1):
    matched = {}
    count = 0
    for idx1,row1 in df1.iterrows():
        title1 = set(re.split('[ :,]',row1[col1_title].lower()))
        title1 = title1.difference(punctuation)
        y1 = row1[col1_year]
        #for idx2,row2 in df2[df2[col2_year].isin([y1-delta_year+i for i in range(delta_year*2)])].iterrows():
        for idx2,row2 in df2[df2[col2_year]==y1].iterrows():
            title2 = set(re.split('[ :,]',row2[col2_title].lower()))
            title2 = title2.difference(punctuation)
            if len(title1 & title2)/(len(title1 | title2)) > threshold:
                try:
                    matched[idx1].append(idx2)
                except KeyError:
                    matched[idx1] = [idx2]
        count += 1
        if count == 83: # remove for the whole computation
            break
    return matched

In [112]:
from time import time
deb = time()
matched = compare(movies,title_basics_movies_cleaned, 'Movie_name', 'primaryTitle', 'Movie_release_date', 'startYear')
end = time()
print(end-deb)
matched

70.14672899246216


{0: [218707],
 2: [92684],
 3: [92207],
 4: [82107],
 7: [29325],
 8: [192689],
 9: [52695],
 10: [20997],
 11: [70654],
 12: [116711],
 13: [95308],
 17: [57208],
 18: [60426],
 20: [20502],
 22: [864742],
 24: [21925],
 27: [2979077],
 28: [77845],
 29: [244954],
 30: [352160],
 31: [244807],
 32: [50632],
 33: [86197],
 34: [224342],
 35: [350852],
 36: [32552],
 38: [171794],
 40: [172108],
 42: [3109573],
 43: [82521],
 47: [160588],
 49: [79055],
 53: [388561],
 54: [95478],
 56: [816685],
 57: [124841],
 58: [113782],
 59: [291218],
 61: [95595],
 64: [104126],
 65: [30950],
 69: [461031],
 73: [362367],
 75: [73073],
 76: [52389],
 77: [72155],
 78: [157914],
 79: [307057],
 80: [1329316]}