# Preliminary analysis of Movies Dataset

In [1]:
import os
while os.path.basename(os.getcwd()) != "ada-project-private":
    os.chdir('..')

import pandas as pd
import json

DATA_FOLDER = './MovieSummaries/'

# Load character.metadata.tsv
character_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID",
                           "Movie release date", "Character name",
                           "Actor date of birth", "Actor gender", "Actor height (in meters)", "Actor ethnicity (Freebase ID)",
                           "Actor name", "Actor age at movie release",
                           "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID"]
character_metadata_df = pd.read_csv(DATA_FOLDER + 'character.metadata.tsv', sep='\t', header=None, names=character_metadata_cols)

# Load movie.metadata.tsv
movie_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date", "Movie box office revenue", "Movie runtime",
                       "Movie languages (Freebase ID:name tuples)", "Movie countries (Freebase ID:name tuples)",
                       "Movie genres (Freebase ID:name tuples)"]
movie_metadata_df = pd.read_csv(DATA_FOLDER + 'movie.metadata.tsv', sep='\t', header=None, names=movie_metadata_cols)

# Load name.clusters.txt
name_clusters_cols = ["Character name", "Freebase movie ID"]
name_clusters_df = pd.read_csv(DATA_FOLDER + 'name.clusters.txt', sep='\t', header=None, names=name_clusters_cols)

# Load plot_summaries.txt
plot_summaries_cols = ["Wikipedia movie ID", "Wikipedia plot"]
plot_summaries_df = pd.read_csv(DATA_FOLDER + 'plot_summaries.txt', sep='\t', header=None, names=plot_summaries_cols)

# Load tvtropes.clusters.txt
tvtropes_clusters_cols = ["Character types", "details"]
tvtropes_clusters_df = pd.read_csv(DATA_FOLDER + 'tvtropes.clusters.txt', sep='\t', header=None, names=tvtropes_clusters_cols)

tvtropes_clusters_df["details_dict"] = tvtropes_clusters_df["details"].apply(json.loads)
tvtropes_clusters_df["Character name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('char'))
tvtropes_clusters_df["Movie name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('movie'))
tvtropes_clusters_df["Freebase character/actor map ID"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('id'))
tvtropes_clusters_df["Actor name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('actor'))

tvtropes_clusters_df.drop(columns=["details", "details_dict"], inplace=True)

In [2]:
print("total len:", len(character_metadata_df))
for col in character_metadata_df.columns:
    print("missing " + col + ":", len(character_metadata_df[character_metadata_df[col].isna()]))

character_metadata_df.head(10)

total len: 450669
missing Wikipedia movie ID: 0
missing Freebase movie ID: 0
missing Movie release date: 9995
missing Character name: 257875
missing Actor date of birth: 106145
missing Actor gender: 45609
missing Actor height (in meters): 295845
missing Actor ethnicity (Freebase ID): 344611
missing Actor name: 1228
missing Actor age at movie release: 158113
missing Freebase character/actor map ID: 0
missing Freebase character ID: 257865
missing Freebase actor ID: 815


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
5,975900,/m/03vyhn,2001-08-24,Commander Helena Braddock,1949-05-26,F,1.727,/m/0x67,Pam Grier,52.0,/m/02vdcfp,/m/0bgchnd,/m/0418ft
6,975900,/m/03vyhn,2001-08-24,Whitlock,1945-08-02,F,1.753,,Joanna Cassidy,56.0,/m/02vd6kw,/m/0bgchmx,/m/06lj1m
7,975900,/m/03vyhn,2001-08-24,Big Daddy Mars,,M,,,Richard Cetrone,,/m/0bgchsy,/m/0bgcht0,/m/0bgcht7
8,975900,/m/03vyhn,2001-08-24,Michael Descanso,1971-03-20,M,1.892,,Liam Waite,30.0,/m/03jqhb0,/m/0bgchs4,/m/0ks8b0
9,975900,/m/03vyhn,2001-08-24,Uno,,M,,,Duane Davis,,/m/0bgchtj,/m/0bgchtm,/m/03nrwdy


In [12]:
print("total len:", len(movie_metadata_df))
for col in movie_metadata_df.columns:
    print("missing " + col + ":", len(movie_metadata_df[movie_metadata_df[col].isna()]))

movie_metadata_df.head(10)

total len: 81741
missing Wikipedia movie ID: 0
missing Freebase movie ID: 0
missing Movie name: 0
missing Movie release date: 6902
missing Movie box office revenue: 73340
missing Movie runtime: 20450
missing Movie languages (Freebase ID:name tuples): 0
missing Movie countries (Freebase ID:name tuples): 0
missing Movie genres (Freebase ID:name tuples): 0


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
5,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."
6,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant..."
7,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ..."
8,9997961,/m/06_y2j7,Contigo y aquí,1974,,,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0jgd"": ""Argentina""}","{""/m/04t36"": ""Musical"", ""/m/07s9rl0"": ""Drama"",..."
9,2345652,/m/075f66,City of the Dead,1960,,76.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/03npn"": ""Horror"", ""/m/0fdjb"": ""Supernatur..."


In [15]:
print("total len:", len(name_clusters_df))

name_clusters_df.head(10)

total len: 2666


Unnamed: 0,Character name,Freebase movie ID
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn
5,Josh Framm,/m/0jt3p3
6,Josh Framm,/m/0jt3t3
7,Caspian X,/m/0646flc
8,Caspian X,/m/02vd12n
9,Apostle Peter,/m/02vd6_v


In [14]:
print("total len:", len(plot_summaries_df))

plot_summaries_df.head(10)

total len: 42303


Unnamed: 0,Wikipedia movie ID,Wikipedia plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
5,5272176,The president is on his way to give a speech. ...
6,1952976,"{{plot}} The film opens in 1974, as a young gi..."
7,24225279,"The story begins with Hannah, a young Jewish t..."
8,2462689,Infuriated at being told to write one final co...
9,20532852,A line of people drool at the window of the s...


In [13]:
print("total len:", len(tvtropes_clusters_df))
for col in tvtropes_clusters_df.columns:
    print("missing " + col + ":", len(tvtropes_clusters_df[tvtropes_clusters_df[col].isna()]))

tvtropes_clusters_df.head(10)

total len: 501
missing Character types: 0
missing Character name: 0
missing Movie name: 0
missing Freebase character/actor map ID: 0
missing Actor name: 0


Unnamed: 0,Character types,Character name,Movie name,Freebase character/actor map ID,Actor name
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
5,adventurer_archaeologist,Indiana Jones,Indiana Jones and the Kingdom of the Crystal S...,/m/0jzx78,Harrison Ford
6,adventurer_archaeologist,Indiana Jones,Indiana Jones and the Raiders of the Lost Ark,/m/0k294p,Harrison Ford
7,adventurer_archaeologist,Indiana Jones,Indiana Jones and the Temple of Doom,/m/0jzx9b,Harrison Ford
8,adventurer_archaeologist,Evelyn Carnahan-O'Connell,The Mummy,/m/0k5yzc,Rachel Weisz
9,arrogant_kungfu_guy,Han,Enter the Dragon,/m/02vd8hn,Shih Kien
