# Imports and loading

In [1]:
# Standard library
import json
import os

# Third-party
# NOTE: `plt` must alias the pyplot submodule; the top-level `matplotlib`
# package does not expose the plotting functions (plot, subplots, ...).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Project-local loaders (load_character_metadata, load_movie_metadata, ...).
# Kept as a star import, and kept last, so name resolution is unchanged.
from loads import *

# Dataset directory; not referenced in the cells below —
# presumably consumed by the loaders in `loads` (TODO confirm).
PATH_FOLDER = "MovieSummaries/"

In [2]:
# Load every table through the helpers star-imported from `loads`.
# The right-hand tuple is evaluated left to right, so the loaders run in the
# order: character metadata, movie metadata, plot summaries, TVTropes,
# name clusters.
(
    character_metadata_df,
    movie_metadata_df,
    plot_summaries_df,
    tvtropes_df,
    name_clusters_df,
) = (
    load_character_metadata(),
    load_movie_metadata(),
    load_plot_summaries(),
    load_tvtropes(),
    load_name_clusters(),
)

# Preprocessing

## Cleaning

In [3]:
# Preview the first rows of the movie metadata exactly as loaded, before any
# cleaning: languages/countries/genres are still JSON-encoded strings.
movie_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [4]:
def _freebase_names(raw_json):
    """Return the names stored in a JSON-encoded ``{Freebase ID: name}`` mapping.

    Parameters
    ----------
    raw_json : str
        JSON object literal, e.g. ``'{"/m/02h40lc": "English Language"}'``.

    Returns
    -------
    list
        The mapping's values (the human-readable names) in their original
        order. A missing cell (NaN/None) yields an empty list.
    """
    if pd.isna(raw_json):
        # json.loads raises TypeError on NaN/None; treat a missing cell as
        # "no entries" rather than aborting the whole column.
        return []
    return list(json.loads(raw_json).values())


# Raw "(Freebase ID:name tuples)" column -> cleaned list-of-names column.
# Insertion order fixes the order in which the new columns are appended.
_FREEBASE_TUPLE_COLUMNS = {
    'Movie languages (Freebase ID:name tuples)': 'Movie Languages',
    'Movie countries (Freebase ID:name tuples)': 'Movie Countries',
    'Movie genres (Freebase ID:name tuples)': 'Movie Genres',
}

for raw_col, clean_col in _FREEBASE_TUPLE_COLUMNS.items():
    movie_metadata_df[clean_col] = movie_metadata_df[raw_col].apply(_freebase_names)

In [5]:
# Check the three parsed list columns ('Movie Languages', 'Movie Countries',
# 'Movie Genres') next to the raw JSON columns they were derived from.
movie_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples),Movie Languages,Movie Countries,Movie Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",[German Language],[Germany],[Drama]


In [6]:
# Remove the raw JSON columns (superseded by the parsed 'Movie Languages',
# 'Movie Countries' and 'Movie Genres' columns) and the Freebase movie ID.
columns_to_drop = [
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
    'Freebase movie ID',
]
# errors='ignore' keeps the cell re-runnable: because the result is assigned
# back to the same name, a second execution would otherwise raise KeyError
# for the already-removed columns.
movie_metadata_df = movie_metadata_df.drop(columns=columns_to_drop, errors='ignore')
movie_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie Languages,Movie Countries,Movie Genres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
3,9363483,White Of The Eye,1987,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
4,261236,A Woman in Flames,1983,,106.0,[German Language],[Germany],[Drama]


In [7]:
# Keep only movies with a known box office revenue.
# Reassigning the result (instead of inplace=True) makes the data lineage
# explicit and keeps the step chainable; the surviving rows keep their
# original index labels.
movie_metadata_df = movie_metadata_df.dropna(subset=['Movie box office revenue'])
movie_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie Languages,Movie Countries,Movie Genres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
7,10408933,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]"
13,171005,Henry V,1989-11-08,10161099.0,137.0,[English Language],[United Kingdom],"[Costume drama, War film, Epic, Period piece, ..."
17,77856,Mary Poppins,1964-08-27,102272727.0,139.0,[English Language],[United States of America],"[Children's/Family, Musical, Fantasy, Comedy, ..."
21,612710,New Rose Hotel,1999-10-01,21521.0,92.0,[English Language],[United States of America],"[Thriller, Science Fiction, Future noir, Indie..."


In [8]:
# Preview the character/actor metadata as loaded, before dropping columns.
character_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [9]:
# Remove the Freebase identifier columns, the ethnicity ID and the actor
# height. 'Freebase character/actor map ID' is kept.
columns_to_drop = [
    'Freebase movie ID',
    'Actor ethnicity (Freebase ID)',
    'Freebase character ID',
    'Freebase actor ID',
    'Actor height (in meters)',
]
# errors='ignore' keeps the cell re-runnable: because the result is assigned
# back to the same name, a second execution would otherwise raise KeyError
# for the already-removed columns.
character_metadata_df = character_metadata_df.drop(columns=columns_to_drop, errors='ignore')
character_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor name,Actor age at movie release,Freebase character/actor map ID
0,975900,2001-08-24,Akooshay,1958-08-26,F,Wanda De Jesus,42.0,/m/0bgchxw
1,975900,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,Natasha Henstridge,27.0,/m/0jys3m
2,975900,2001-08-24,Desolation Williams,1969-06-15,M,Ice Cube,32.0,/m/0jys3g
3,975900,2001-08-24,Sgt Jericho Butler,1967-09-12,M,Jason Statham,33.0,/m/02vchl6
4,975900,2001-08-24,Bashira Kincaid,1977-09-25,F,Clea DuVall,23.0,/m/02vbb3r


In [10]:
# Preview the TVTropes table (not modified since loading).
tvtropes_df.head()

Unnamed: 0,Character role,Character name,Movie name,Freebase character/actor map ID,Actor name
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader


In [11]:
# Preview the name clusters table (not modified since loading).
name_clusters_df.head()

Unnamed: 0,Character name,Freebase character/actor map ID
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn


## Splitting

In [12]:
# Re-display the cleaned movie metadata (rows without revenue already
# removed) as the starting point for the per-feature splits.
movie_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie Languages,Movie Countries,Movie Genres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
7,10408933,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]"
13,171005,Henry V,1989-11-08,10161099.0,137.0,[English Language],[United Kingdom],"[Costume drama, War film, Epic, Period piece, ..."
17,77856,Mary Poppins,1964-08-27,102272727.0,139.0,[English Language],[United States of America],"[Children's/Family, Musical, Fantasy, Comedy, ..."
21,612710,New Rose Hotel,1999-10-01,21521.0,92.0,[English Language],[United States of America],"[Thriller, Science Fiction, Future noir, Indie..."


In [13]:
# Pair each candidate feature with the revenue column in its own independent
# copy, so that cleaning one feature frame never touches movie_metadata_df
# or the other feature frames.
movie_runtime_df = movie_metadata_df.loc[
    :, ['Movie runtime', 'Movie box office revenue']
].copy()
movie_languages_df = movie_metadata_df.loc[
    :, ['Movie Languages', 'Movie box office revenue']
].copy()
movie_countries_df = movie_metadata_df.loc[
    :, ['Movie Countries', 'Movie box office revenue']
].copy()
movie_genres_df = movie_metadata_df.loc[
    :, ['Movie Genres', 'Movie box office revenue']
].copy()

In [14]:
# Keep only rows with a known runtime, then report how many remain.
# Reassigning the result (instead of inplace=True) makes the data lineage
# explicit and keeps the step chainable.
movie_runtime_df = movie_runtime_df.dropna(subset=['Movie runtime'])
movie_runtime_df.shape

(8302, 2)

In [17]:
# Outer-join the character metadata with the name clusters on the shared
# Freebase character/actor map ID, keeping rows present in only one table.
# Both frames carry a 'Character name' column, so pandas' default suffixes
# produce 'Character name_x' (characters) and 'Character name_y' (clusters).
merged_df = character_metadata_df.merge(
    name_clusters_df,
    how='outer',
    on=['Freebase character/actor map ID'],
)


In [18]:
# Inspect the merged frame; 'Character name_y' is NaN for characters that
# have no matching entry in the name clusters table.
merged_df.head()

Unnamed: 0,Wikipedia movie ID,Movie release date,Character name_x,Actor date of birth,Actor gender,Actor name,Actor age at movie release,Freebase character/actor map ID,Character name_y
0,975900,2001-08-24,Akooshay,1958-08-26,F,Wanda De Jesus,42.0,/m/0bgchxw,
1,975900,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,Natasha Henstridge,27.0,/m/0jys3m,
2,975900,2001-08-24,Desolation Williams,1969-06-15,M,Ice Cube,32.0,/m/0jys3g,
3,975900,2001-08-24,Sgt Jericho Butler,1967-09-12,M,Jason Statham,33.0,/m/02vchl6,
4,975900,2001-08-24,Bashira Kincaid,1977-09-25,F,Clea DuVall,23.0,/m/02vbb3r,
