# Data wrangling
This notebook is used to import and clean the datasets.

In [1]:
# Input data folders. Every *_FOLDER constant ends with '/' so that file paths
# can be built by plain string concatenation, as done for the *_FILE constants below.
DATA_FOLDER = './data/'
MOVIES_FOLDER = DATA_FOLDER + 'movies_summaries/'
PLOT_SUMMARY_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries/'
IMDB_FOLDER = DATA_FOLDER + 'imdb/'

# Locations under ./gen/: target folder for the profiling reports and the
# ethnicity lookup table read by load_ethnicities.
REPORT_FOLDER = './gen/reports/'
ETHNICITY_FILE = './gen/ethnicities.tsv'

# Individual input files.
CHARACTERS_FILE = MOVIES_FOLDER + 'character.metadata.tsv'
MOVIES_FILE = MOVIES_FOLDER + 'movie.metadata.tsv'
PLOT_SUMMARIES_FILE = MOVIES_FOLDER + 'plot_summaries.txt'
TROPES_FILE = MOVIES_FOLDER + 'tvtropes.clusters.txt'
IMDB_FILE = IMDB_FOLDER + 'movies_metadata.csv'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from pandas_profiling import ProfileReport
%matplotlib inline

from data_wrangling_tools import *

# Characters

In [3]:
# Load the character metadata from CHARACTERS_FILE with the project helper
# load_characters (presumably provided by the data_wrangling_tools star-import).
characters = load_characters(CHARACTERS_FILE)
# Print the frame's (rows, columns) as a quick sanity check, then preview the first rows.
print(characters.shape)
characters.head()

(450669, 13)


Unnamed: 0,wiki_movie_id,freebase_movie_id,m_release_date,name,a_dob,a_gender,a_height,a_ethnicity_freebase_id,a_name,a_age_at_release,freebase_char/a_map,freebase_char_id,freebase_a_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [4]:
# generate profiling report
# profile_characters = ProfileReport(characters, title='Characters report')
# profile_characters.to_file(REPORT_FOLDER + "character_report_0.html")

# Ethnicities

In [5]:
# Load the ethnicity lookup table from ETHNICITY_FILE with the project helper
# load_ethnicities and preview it. In the recorded output it has the columns
# freebase_ethnicity_id and ethnicity_name.
ethnicities = load_ethnicities(ETHNICITY_FILE)
ethnicities.head()

Unnamed: 0,freebase_ethnicity_id,ethnicity_name
0,/m/0x67,African Americans
1,/m/064b9n,Omaha people
2,/m/041rx,Jewish people
3,/m/033tf_,Irish Americans
4,/m/04gfy7,Indian Americans


In [6]:
# Attach ethnicity information to the characters using the lookup table; the
# recorded output shows a new ethnicity_name column.
# NOTE(review): `characters` is rebound to the augmented frame, so re-running
# this cell feeds the already-augmented frame back into the helper -- confirm
# add_characters_ethnicities tolerates that, or re-run from the load cell.
characters = add_characters_ethnicities(characters, ethnicities)
characters.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,m_release_date,name,a_dob,a_gender,a_height,a_ethnicity_freebase_id,a_name,a_age_at_release,freebase_char/a_map,freebase_char_id,freebase_a_id,ethnicity_name
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,African Americans
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,


# Movies

In [7]:
# Load the movie metadata from MOVIES_FILE with the project helper load_movies.
movies = load_movies(MOVIES_FILE)
# Print the frame's (rows, columns) as a quick sanity check, then preview the first rows.
print(movies.shape)
movies.head()

(81741, 9)


Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [8]:
# generate profiling report
# profile_movies = ProfileReport(movies, title='Movies report')
# profile_movies.to_file(REPORT_FOLDER + "movies_report_0.html")

In [9]:
# Clean the movie features with the project helpers clean_unknowns and
# clean_jsons, applied in that order.
# NOTE(review): judging by the later previews, clean_jsons turns the JSON-like
# languages/countries/genres strings into lists of names -- confirm in the helper.
# `movies` is rebound in place, so this cell is not guaranteed to be safe to re-run.
movies = clean_unknowns(movies)
movies = clean_jsons(movies)

# IMDB movies

In [10]:
# import imdb dataset

def load_imdb(imdb_file, columns=['original_title', 'revenue', 'budget', 'vote_average', 'vote_count']):
    """Load the IMDB movie metadata and clean its monetary columns.

    Parameters
    ----------
    imdb_file : str or file-like
        Path (or buffer) handed to ``pd.read_csv``.
    columns : list of str
        Columns to read from the file. Must include 'budget' and 'revenue'.
        The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        The requested columns, without the rows whose 'budget' holds an image
        file name (a literal '.jpg'), and with 'revenue' and 'budget' as
        floats where a value of 0 is replaced by NaN (treated as missing).

    Raises
    ------
    ValueError
        If a remaining 'revenue' or 'budget' value cannot be converted to float.
    """
    imdb = pd.read_csv(imdb_file, usecols=columns)

    # Remove wrongly formatted rows: their budget column holds a '.jpg' file name.
    # - astype(str) keeps the check working when the column was parsed as
    #   numeric (no malformed row in the file) or contains NaN;
    # - regex=False matches the literal '.jpg' instead of "any character + jpg".
    malformed = imdb['budget'].astype(str).str.contains('.jpg', regex=False)
    imdb = imdb.drop(imdb[malformed].index)

    # Convert the monetary columns to float; a value of 0 means "unknown" here.
    for money_col in ('revenue', 'budget'):
        values = imdb[money_col].astype(float)
        imdb[money_col] = values.mask(values == 0.0)

    return imdb

# Load the IMDB metadata with the default column selection of load_imdb.
imdb = load_imdb(IMDB_FILE)
# Print the frame's (rows, columns) as a quick sanity check, then preview the first rows.
print(imdb.shape)
imdb.head()

(45463, 5)


Unnamed: 0,budget,original_title,revenue,vote_average,vote_count
0,30000000.0,Toy Story,373554033.0,7.7,5415.0
1,65000000.0,Jumanji,262797249.0,6.9,2413.0
2,,Grumpier Old Men,,6.5,92.0
3,16000000.0,Waiting to Exhale,81452156.0,6.1,34.0
4,,Father of the Bride Part II,76578911.0,5.7,173.0


# Merge imdb movies and movies

In [11]:
# merge movies and imdb movies

def merge_movies_imdb(movies, imdb, dedup_subset=('name', 'vote_count', 'vote_average')):
    """Enrich the movies frame with IMDB budget, votes and revenue.

    The two frames are left-joined on the movie title (``movies['name']``
    against ``imdb['original_title']``), so a title that appears several times
    in ``imdb`` yields several rows for the same movie. Those are reduced with
    ``drop_duplicates`` on ``dedup_subset``.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must contain 'name' and 'box_office_revenue'.
    imdb : pandas.DataFrame
        Must contain 'original_title' and 'revenue', plus any column named in
        ``dedup_subset`` that is not in ``movies``.
    dedup_subset : iterable of str
        Columns that identify a duplicated row after the merge. The default
        keeps the historical behaviour.

    Returns
    -------
    pandas.DataFrame
        The merged frame without 'revenue' and 'original_title'; missing
        'box_office_revenue' values are filled from the IMDB 'revenue'.

    Notes
    -----
    ``drop_duplicates`` treats NaN as equal to NaN. With the default subset,
    distinct movies that share a title and have no IMDB match (NaN votes) are
    therefore collapsed into one row, so the result can have fewer rows than
    ``movies``. Pass a unique movie key instead, e.g.
    ``dedup_subset=['wiki_movie_id']``, to keep exactly one row per input movie.
    """
    merged = pd.merge(movies, imdb, left_on='name', right_on='original_title', how='left')

    # list(...) so that a tuple is never mistaken for a single (tuple) column label
    merged = merged.drop_duplicates(subset=list(dedup_subset))

    # fill the box office revenue with the imdb revenue if it's missing
    # (fillna aligns on the index, no defensive copy of the other column is needed)
    merged['box_office_revenue'] = merged['box_office_revenue'].fillna(merged['revenue'])

    return merged.drop(columns=['revenue', 'original_title'])

# Enrich the cleaned movies with the IMDB columns.
# NOTE(review): in the recorded run the shape printed below has 77380 rows,
# fewer than the 81741 movies loaded earlier, although the merge is a left join:
# the drop_duplicates step inside merge_movies_imdb removes rows. `movies` is
# also rebound here, so re-running this cell merges the IMDB columns a second time.
movies = merge_movies_imdb(movies, imdb)

# Print the frame's (rows, columns) as a quick sanity check, then preview the first rows.
print(movies.shape)
movies.head()

(77380, 12)


Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres,budget,vote_average,vote_count
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",28000000.0,4.8,299.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",,,
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",,,
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",,,
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,[German Language],[Germany],[Drama],,,


# Merge characters and movies

In [12]:
# Merge characters and movies into a single frame with the project helper
# merge_characters_movies, then preview it.
# NOTE(review): the preview has one row per character together with the movie
# columns, and no budget/vote columns -- presumably the helper selects and
# renames columns; confirm in data_wrangling_tools.
df = merge_characters_movies(characters, movies)
df.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,release_date,box_office_revenue,runtime,genres,languages,countries,char_name,a_name,a_gender,a_ethnicity,a_dob,a_age_at_release,a_height,freebase_char/a_map,freebase_char_id,freebase_a_id,a_ethnicity_freebase_id
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Akooshay,Wanda De Jesus,F,,1958-08-26,42.0,1.62,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,
1,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Lieutenant Melanie Ballard,Natasha Henstridge,F,,1974-08-15,27.0,1.78,/m/0jys3m,/m/0bgchn4,/m/0346l4,/m/044038p
2,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Desolation Williams,Ice Cube,M,African Americans,1969-06-15,32.0,1.727,/m/0jys3g,/m/0bgchn_,/m/01vw26l,/m/0x67
3,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Sgt Jericho Butler,Jason Statham,M,,1967-09-12,33.0,1.75,/m/02vchl6,/m/0bgchnq,/m/034hyc,
4,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Bashira Kincaid,Clea DuVall,F,,1977-09-25,23.0,1.65,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,


In [14]:
# Rebuild the clean frame in one call with the project helper
# generate_clean_df_with_imdb and check that it matches the frame built step by
# step in the cells above.
df1 = generate_clean_df_with_imdb(CHARACTERS_FILE, ETHNICITY_FILE, MOVIES_FILE, IMDB_FILE)

# assert is a statement, not a function: no call-style parentheses, and a message
# so that a failure explains which two results diverged.
assert df.equals(df1), 'step-by-step pipeline and generate_clean_df_with_imdb produced different frames'
df1.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,release_date,box_office_revenue,runtime,genres,languages,countries,char_name,a_name,a_gender,a_ethnicity,a_dob,a_age_at_release,a_height,freebase_char/a_map,freebase_char_id,freebase_a_id,a_ethnicity_freebase_id
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Akooshay,Wanda De Jesus,F,,1958-08-26,42.0,1.62,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,
1,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Lieutenant Melanie Ballard,Natasha Henstridge,F,,1974-08-15,27.0,1.78,/m/0jys3m,/m/0bgchn4,/m/0346l4,/m/044038p
2,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Desolation Williams,Ice Cube,M,African Americans,1969-06-15,32.0,1.727,/m/0jys3g,/m/0bgchn_,/m/01vw26l,/m/0x67
3,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Sgt Jericho Butler,Jason Statham,M,,1967-09-12,33.0,1.75,/m/02vchl6,/m/0bgchnq,/m/034hyc,
4,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],Bashira Kincaid,Clea DuVall,F,,1977-09-25,23.0,1.65,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,
