# Import the modules

In [209]:
import pandas as pd
import numpy as np
import pickle

# Import the data

In [12]:
# Root folder of the raw data; every other path below is derived from it.
DATA_FOLDER = 'Data/'

# Tab-separated metadata files.
CHARACTER_DATASET = f'{DATA_FOLDER}character.metadata.tsv'
# NOTE(review): 'Movie' is capitalised, unlike the other file names here;
# confirm it matches the file on disk (matters on case-sensitive filesystems).
MOVIE_DATASET = f'{DATA_FOLDER}Movie.metadata.tsv'

# Plot summaries: one text file, plus a folder used by the lemmatization step.
SUMMARIES_DATASET = f'{DATA_FOLDER}plot_summaries.txt'
NLP_FOLDER = f'{DATA_FOLDER}corenlp_plot_summaries/'

DEFAULT_COMPRESSION = 'gzip'

In [211]:
def load_metadata(path, column_names, header=None, low_memory=False):
    """Load a tab-separated metadata file into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Location of the tab-separated file (anything `pd.read_table` accepts).
    column_names : list of str
        Names assigned to the columns, in file order.
    header : int or None, default None
        Row number to use as header; None means the file has no header row.
    low_memory : bool, default False
        Forwarded to `pd.read_table`. False makes pandas infer each column's
        dtype from the whole file instead of chunk by chunk.

    Returns
    -------
    pandas.DataFrame
    """
    # `low_memory` used to be accepted but silently ignored; forward it.
    return pd.read_table(
        path,
        header=header,
        names=column_names,
        low_memory=low_memory,
    )

In [212]:
# Column names for the character metadata file, in file order.
columns_character = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_release_date',
    'Character_name',
    'Actor_date_of_birth',
    'Actor_gender',
    'Actor_height_meters',
    'Actor_ethnicity_Freebase_ID',
    'Actor_name',
    'Actor_age_at_movie_release',
    'Freebase_character_actor_map_ID',
    'Freebase_character_ID',
    'Freebase_actor_ID',
]

# Column names for the movie metadata file, in file order.
columns_movie = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]

# Load both tables with the names above.
characters = load_metadata(CHARACTER_DATASET, column_names=columns_character)
movies = load_metadata(MOVIE_DATASET, column_names=columns_movie)

In [213]:
# Read the plot summaries as a list of raw lines (trailing newlines are kept).
with open(SUMMARIES_DATASET, mode='r', encoding='utf-8') as summaries_file:
    summaries = list(summaries_file)

## First glimpse

In [214]:
# Number of rows, then a two-row preview of the movie table.
print(movies.shape[0])
movies.iloc[:2]

81741


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [215]:
# Number of rows, then a two-row preview of the character table.
print(characters.shape[0])
characters.iloc[:2]

450669


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_meters,Actor_ethnicity_Freebase_ID,Actor_name,Actor_age_at_movie_release,Freebase_character_actor_map_ID,Freebase_character_ID,Freebase_actor_ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4


In [216]:
# Number of raw summary lines read from the file, then the first line as a sample.
# Each line looks like '<numeric id>\t<summary text>\n' (presumably the id is the
# Wikipedia movie ID; confirm against the dataset documentation).
print(len(summaries))
summaries[0]

42306


"23890098\tShlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.\n"

# Cleaning

## Problem of date

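One movie has a typo in its release date (`1010-12-02` instead of `2010-12-02`). This movie does not appear in `characters`, but the same replacement is applied there as well. Implausible actor birth dates are also corrected or dropped.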

In [217]:
# Fix the release-date typo (year 1010 instead of 2010) in both tables.
movies.loc[movies.Movie_release_date == '1010-12-02', 'Movie_release_date'] = '2010-12-02'
characters.loc[characters.Movie_release_date == '1010-12-02', 'Movie_release_date'] = '2010-12-02'

# Correct the birth date recorded as '2050'. Only the birth-date cell is touched:
# assigning through `characters[mask] = '1971'` would overwrite *every* column of
# the matching rows (IDs, names, release date, ...) with the string '1971'.
characters.loc[characters.Actor_date_of_birth == '2050', 'Actor_date_of_birth'] = '1971'

# Drop rows whose birth date is implausible. The dates are still strings here, so
# the comparison is lexicographic on the leading year; missing values compare as
# False and are therefore kept.
implausible_birth = (
    (characters.Actor_date_of_birth < '1500')
    | (characters.Actor_date_of_birth > '2030')
)
characters = characters.drop(index=characters.index[implausible_birth])

## Format of movie languages, genres and country

Convert the languages, countries and genres columns into a format that is simpler to use (a list of names for each movie).

In [218]:
def format_multiple(chain, deb, step):
    '''Cut `chain` on every double quote and return every `step`-th piece, starting at index `deb`.

    For a string such as '{"id": "Name", "id2": "Name2"}', deb=3 and step=4
    select the quoted values: ['Name', 'Name2'].
    '''
    pieces = chain.split('"')
    selection = slice(deb, None, step)
    return pieces[selection]

In [219]:
# Replace each dictionary-like string by the list of its quoted values
# (every 4th quote-delimited piece, starting from the 4th one).
for column in ('Movie_genres', 'Movie_countries', 'Movie_languages'):
    movies.loc[:, column] = movies[column].apply(format_multiple, deb=3, step=4)

In [220]:
# Report how many movies ended up with an empty list in each reformatted column.
keys = ['Movie_languages','Movie_countries','Movie_genres']
total_movies = len(movies)
for key in keys:
    is_empty = movies[key].apply(len) == 0
    nb = int(is_empty.sum())
    percentage = nb*100/total_movies
    print('{nb} movies without {key} ({percentage:.2f}% of the dataset)'.format(nb=nb,key=key, percentage=percentage))

13866 movies without Movie_languages (16.96% of the dataset)
8154 movies without Movie_countries (9.98% of the dataset)
2294 movies without Movie_genres (2.81% of the dataset)


## Format for dates

For our study, we only keep the years from the dates.

In [221]:
# Release dates: parse the same way in both tables and keep only the year.
for frame in (movies, characters):
    release_dates = pd.to_datetime(frame['Movie_release_date'], format='%Y-%m-%d')
    frame['Movie_release_date'] = release_dates.dt.year

# Birth dates: values that cannot be parsed become missing (errors='coerce').
birth_dates = pd.to_datetime(
    characters['Actor_date_of_birth'],
    format='%Y-%m-%d',
    utc=True,
    errors='coerce',
)
characters['Actor_date_of_birth'] = birth_dates.dt.year

## Lemmatizing the summaries

We use the `corenlp_plot_summaries` files and extract from them the lemmatized versions of the movie summaries.

In [10]:
import gzip
import os
import re  # was used below without ever being imported (NameError when the flag is True)
from time import time

LEMMATIZE_SUMMARIES = False # takes approximately 13 min to run

# Matches one <lemma>(word)</lemma> element of the CoreNLP XML output.
LEMMA_REGEX = re.compile(r'<lemma>.*?</lemma>')


def extract_lemmas(xml_lines):
    """Return the lemmas found in CoreNLP XML lines as one space-separated string.

    Parameters
    ----------
    xml_lines : iterable of bytes
        Lines of one XML file, e.g. a file opened with `gzip.open(path, 'rb')`.

    Returns
    -------
    str
        Every lemma followed by a single space, in order of appearance
        ('' when no lemma is found).
    """
    lemmas = []
    for raw_line in xml_lines:
        text = raw_line.decode().strip()
        for match in LEMMA_REGEX.finditer(text):
            # '<lemma>word</lemma>' split on '<' / '>' gives ['', 'lemma', 'word', '/lemma', '']
            lemmas.append(re.split('[><]', match.group(0))[2])
    # Build the string once instead of concatenating inside the loop.
    return ''.join(lemma + ' ' for lemma in lemmas)


if LEMMATIZE_SUMMARIES:
    ext = '.xml.gz'

    # Keep only real files with the expected extension, so that sub-directories
    # or stray files neither crash gzip nor end up as empty summaries.
    # Sorting makes the processing order (and the resulting dict order) reproducible.
    filenames = sorted(
        name for name in os.listdir(NLP_FOLDER)
        if name.endswith(ext) and os.path.isfile(os.path.join(NLP_FOLDER, name))
    )
    nb_files = len(filenames)
    print('Number of summaries:',nb_files)

    dico_processed_summmaries = {}
    deb = time()

    # iteration over the files
    for count, filename in enumerate(filenames, start=1):
        id_summary = filename[:-len(ext)] # id of the summary = filename without extension
        with gzip.open(os.path.join(NLP_FOLDER, filename), 'rb') as f: # opening the .gz file
            dico_processed_summmaries[id_summary] = extract_lemmas(f)
        # progress report
        if count%1000 == 0:
            print('{processed}/{tot} files processed --> {perc:.1f}% ({t:.1f} seconds since deb)'.format(processed=count,tot=nb_files,perc=count/nb_files*100,t=time()-deb))

    # Pickle the result so the expensive step does not need to be re-run
    with open(DATA_FOLDER + 'nlp_summaries.pkl', 'wb') as file:
        pickle.dump(dico_processed_summmaries, file, protocol=pickle.HIGHEST_PROTOCOL)

A short extract of the lemmatized data:

In [16]:
# Load the lemmatized summaries and preview the first entry only.
nlp_summaries = pd.read_pickle(DATA_FOLDER + 'nlp_summaries.pkl')
first_entry = next(iter(nlp_summaries.items()), None)
if first_entry is not None:
    key, value = first_entry
    print('Key:', key)
    # Only the first 200 characters of the summary are shown.
    print('Summary:\n', value[:200] + '...')

Key: 10000053
Summary:
 Fur trapper Jean La B te paddle he canoe through wild water towards the settlement in order to sell a load of fur . at the settlement a steamboat be landing and the trader and he foster-child Eve , ar...


# Saving the new dataset

We pickle the cleaned data so that it can be reused directly (and loaded faster).

In [224]:
# Write each cleaned table to '<DESTINATION><name><EXT>'.
DESTINATION = './Data/'
EXT = '.pkl'
to_pickle_data = [characters,movies]
to_pickle_name = ['characters','movies']
for frame, name in zip(to_pickle_data, to_pickle_name):
    target_path = DESTINATION + name + EXT
    frame.to_pickle(target_path)

#To unpickle:
# characters = pd.read_pickle("./Data/characters.pkl") 
# movies = pd.read_pickle("./Data/movies.pkl")