# Import the modules

In [86]:
import tarfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
%matplotlib inline

# Import the data

In [87]:
# Paths of the input files; every dataset path is DATA_FOLDER plus a file name.
DATA_FOLDER = 'Data/'
CHARACTER_DATASET = f'{DATA_FOLDER}character.metadata.tsv'
NLP_DATASET = f'{DATA_FOLDER}corenlp_plot_summaries.tar'
MOVIE_DATASET = f'{DATA_FOLDER}Movie.metadata.tsv'
SUMMARIES_DATASET = f'{DATA_FOLDER}plot_summaries.txt.gz'
# NOTE(review): presumably the compression of the .gz summaries file -- confirm at the point of use.
DEFAULT_COMPRESSION = 'gzip'

In [88]:
def load_metadata(path, column_names, header=None, low_memory=False):
    """Read a tab-separated metadata file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the file, passed straight to ``pd.read_table``.
    column_names : list of str
        Labels assigned to the columns (``names=`` of ``pd.read_table``).
    header : int or None, default None
        Row to use as header; ``None`` means the file has no header row.
    low_memory : bool, default False
        Forwarded to ``pd.read_table``. ``False`` makes pandas infer each
        column's dtype from the whole file instead of chunk by chunk.

    Returns
    -------
    pandas.DataFrame
        The parsed table with ``column_names`` as its columns.
    """
    # `low_memory` used to be accepted but silently ignored; forward it so the
    # argument actually takes effect.
    return pd.read_table(
        path,
        header=header,
        names=column_names,
        low_memory=low_memory,
    )

In [89]:
# Column labels for the character metadata file, in file order.
columns_character = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_release_date',
    'Character_name',
    'Actor_date_of_birth',
    'Actor_gender',
    'Actor_height_meters',
    'Actor_ethnicity_Freebase_ID',
    'Actor_name',
    'Actor_age_at_movie_release',
    'Freebase_character_actor_map_ID',
    'Freebase_character_ID',
    'Freebase_actor_ID',
]

# Column labels for the movie metadata file, in file order.
columns_movie = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]

# Load both tables, labelling their columns with the lists above.
characters = load_metadata(CHARACTER_DATASET, columns_character)
movies = load_metadata(MOVIE_DATASET, columns_movie)

In [90]:
# Preview the first rows of the movie table to check that it loaded as expected.
movies.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [91]:
# Preview the first rows of the character table to check that it loaded as expected.
characters.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_date_of_birth,Actor_gender,Actor_height_meters,Actor_ethnicity_Freebase_ID,Actor_name,Actor_age_at_movie_release,Freebase_character_actor_map_ID,Freebase_character_ID,Freebase_actor_ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


# Cleaning

## Drop NaN values

In [92]:
# Keep only complete rows: a row is discarded as soon as one of its values is missing.
characters = characters.dropna(how='any')
movies = movies.dropna(how='any')

## Problem of date

One movie has a typo in its release date (`1010-12-02` instead of `2010-12-02`). This movie does not appear in `characters`, so only `movies` needs to be corrected.

In [93]:
# Rows whose release date is exactly '1010-12-02' get the corrected year.
typo_mask = movies['Movie_release_date'].eq('1010-12-02')
movies.loc[typo_mask, 'Movie_release_date'] = '2010-12-02'

## Format of movie languages, countries and genres

In [94]:
def format_multiple(chain, deb, step):
    """Split ``chain`` on double quotes and keep a regular selection of the pieces.

    Parameters
    ----------
    chain : str
        Text to split on the ``"`` character.
    deb : int
        Index of the first piece to keep.
    step : int
        Distance between two kept pieces.

    Returns
    -------
    list of str
        The pieces at positions ``deb``, ``deb + step``, ``deb + 2 * step``, ...
        For a string shaped like ``{"id": "name", "id": "name"}``, ``deb=3`` and
        ``step=4`` select the names and ``deb=1`` and ``step=4`` select the ids.
    """
    pieces = chain.split('"')
    selection = slice(deb, None, step)
    return pieces[selection]

In [95]:
# Replace each raw string by the list of pieces that format_multiple selects
# with deb=3 and step=4 (the quoted values that follow each quoted id).
for column in ('Movie_genres', 'Movie_countries', 'Movie_languages'):
    movies[column] = movies[column].apply(format_multiple, deb=3, step=4)

In [96]:
# For every list-valued column, report how many movies ended up with an empty list.
keys = ['Movie_languages','Movie_countries','Movie_genres']
total = len(movies)
message = '{nb} movies without {key} ({percentage:.2f}% of the dataset)'
for key in keys:
    is_empty = movies[key].map(len) == 0
    nb = int(is_empty.sum())
    print(message.format(nb=nb, key=key, percentage=nb*100/total))

243 movies without Movie_languages (2.95% of the dataset)
75 movies without Movie_countries (0.91% of the dataset)
3 movies without Movie_genres (0.04% of the dataset)


## Format for dates

In [97]:
def _year_of(dates):
    """Return the leading 4-digit year of every value in ``dates`` as integers.

    Only the year is kept downstream, so it is read directly from the text
    instead of parsing a full timestamp. Parsing with ``format='%Y-%m-%d'``
    rejects partial dates such as ``'1988'`` on recent pandas versions, and
    fails if the cell is run a second time (the column then already holds
    integer years). This version accepts ``'YYYY'``, ``'YYYY-MM'``,
    ``'YYYY-MM-DD'`` and already-converted years alike.

    Parameters
    ----------
    dates : pandas.Series
        Dates as strings starting with a 4-digit year, or integer years.

    Returns
    -------
    pandas.Series
        Integer years, same index as ``dates``.

    Raises
    ------
    ValueError
        If a value does not start with four digits (the missing year cannot
        be cast to an integer).
    """
    years = dates.astype(str).str.extract(r'^(\d{4})', expand=False)
    return years.astype(int)


# Reduce every date column to its year.
movies.Movie_release_date = _year_of(movies.Movie_release_date)
characters.Movie_release_date = _year_of(characters.Movie_release_date)
characters.Actor_date_of_birth = _year_of(characters.Actor_date_of_birth)

## Merging IMDb and CMU

In [98]:
import re
# NOTE(review): common_words is not used by get_titles below -- confirm whether
# these words were meant to be removed from the titles as well.
common_words = {'a','an','and','the','of','at','in'}
# Tokens dropped from every title. The empty string is what remains between two
# consecutive separators (e.g. ': ').
punctuation = {'.',',','!',';','?',''}


def get_titles(df, col):
    """Turn each title of ``df[col]`` into a set of lower-case tokens.

    A title is lower-cased, split on spaces, colons and commas, and the tokens
    listed in ``punctuation`` are removed.

    NOTE(review): only tokens that are *exactly* a punctuation mark are removed;
    a mark attached to a word (``'again!'``) stays part of that token.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the titles.
    col : str
        Name of the column that contains the title strings.

    Returns
    -------
    pandas.Series
        One set of tokens per row, with the same index as ``df``.
    """
    def tokenize(title):
        tokens = set(re.split('[ :,]', title.lower()))
        return tokens.difference(punctuation)

    return df[col].apply(tokenize)
# Token sets of the movie titles, one set per row of `movies`.
CMU_titles = get_titles(movies,'Movie_name')
# Show a truncated overview of the resulting Series.
print(CMU_titles)

0                  {ghosts, of, mars}
7        {alexander's, ragtime, band}
13                         {henry, v}
17                    {poppins, mary}
21                 {hotel, rose, new}
                     ...             
81694                         {wilde}
81695           {america, coming, to}
81720              {invaders, spaced}
81725              {state, and, main}
81726               {as, guilty, sin}
Name: Movie_name, Length: 8243, dtype: object


In [100]:
def compare(titles1, titles2, threshold=0.8, max_rows=10):
    """Match titles whose token sets are similar enough.

    Two titles match when the Jaccard similarity of their token sets,
    ``len(a & b) / len(a | b)``, is strictly greater than ``threshold``.

    Parameters
    ----------
    titles1 : iterable of set
        Token sets to look up.
    titles2 : iterable of set
        Token sets to search in. It is traversed once per element of
        ``titles1``, so it must be re-iterable (list, Series, ...).
    threshold : float, default 0.8
        Similarity that a pair must exceed to count as a match.
    max_rows : int or None, default 10
        Number of leading elements of ``titles1`` to process. The default keeps
        the computation short while experimenting; pass ``None`` to compare
        every title (cost grows with ``len(titles1) * len(titles2)``).

    Returns
    -------
    dict
        Maps the position of a title in ``titles1`` to the list of positions of
        its matches in ``titles2``. Positions come from ``enumerate``; they are
        not DataFrame/Series index labels. Titles without a match are absent.
    """
    matched = {}
    for idx1, elt1 in enumerate(titles1):
        if max_rows is not None and idx1 >= max_rows:
            break
        for idx2, elt2 in enumerate(titles2):
            union_size = len(elt1 | elt2)
            # Two empty token sets have no defined similarity; skip the pair
            # instead of dividing by zero.
            if union_size == 0:
                continue
            if len(elt1 & elt2) / union_size > threshold:
                matched.setdefault(idx1, []).append(idx2)
    return matched
# Sanity check: compare the title sets with themselves.
compare(CMU_titles,CMU_titles)

{0: [0],
 1: [1],
 2: [2],
 3: [3],
 4: [4],
 5: [5],
 6: [6],
 7: [7],
 8: [8],
 9: [9]}