In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# File paths
# Movie Summaries
# All data files live in one folder; change DATA_DIR alone if the data is
# stored somewhere else.
DATA_DIR = 'MovieSummaries'
CHARACTER_META_FILE = f'{DATA_DIR}/character.metadata.tsv'
MOVIE_META_FILE = f'{DATA_DIR}/movie.metadata.tsv'
NAME_CLUSTERS = f'{DATA_DIR}/name.clusters.txt'
PLOT_SUM = f'{DATA_DIR}/plot_summaries.txt'
TVTROPES_CLUSTERS = f'{DATA_DIR}/tvtropes.clusters.txt'

In [40]:
# character metadata
# Column names are supplied explicitly, so the first line of the file is read as data.
char_cols = [
    'Wikipedia_Movie_ID',
    'Freebase_Movie_ID',
    'Movie_Release_Date',
    'Character_Name',
    'Actor_DOB',
    'Actor_Gender',
    'Actor_Height',
    'Actor_Ethnicity',
    'Actor_Name',
    'Actor_Age_at_Movie_Release',
    'Freebase_Char_Actor_Map_ID',
    'Freebase_Char_ID',
    'Freebase_Actor_ID',
]
char_meta = pd.read_csv(
    CHARACTER_META_FILE,
    sep='\t',
    header=None,
    names=char_cols,
)
char_meta.head()

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Release_Date,Character_Name,Actor_DOB,Actor_Gender,Actor_Height,Actor_Ethnicity,Actor_Name,Actor_Age_at_Movie_Release,Freebase_Char_Actor_Map_ID,Freebase_Char_ID,Freebase_Actor_ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [27]:
# movie metadata
# Languages, countries and genres are loaded as dictionary-like strings
# and are kept untouched in this raw frame.
mov_cols = [
    'Wikipedia_Movie_ID',
    'Freebase_Movie_ID',
    'Movie_Name',
    'Movie_Release_Date',
    'Revenue',
    'Movie_Runtime',
    'Movie_Languages',
    'Movie_Countries',
    'Movie_Genres',
]
raw_mov = pd.read_csv(
    MOVIE_META_FILE,
    sep='\t',
    header=None,
    names=mov_cols,
)
raw_mov.head()

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Name,Movie_Release_Date,Revenue,Movie_Runtime,Movie_Languages,Movie_Countries,Movie_Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [28]:
def extract_string_from_dict(str_dict):
    """
    Turn a dictionary stored as a string into a comma-separated string of its values.

    The data is represented as a dictionary in string format, e.g.
    '{"/m/02h40lc": "English Language"}'. It is parsed into a dictionary and the
    values are joined with ', ' in their original order.

    Parameters
    ----------
    str_dict : str
        JSON (or Python-literal) text of a dictionary whose values are strings.
        Non-string input (e.g. NaN from an empty cell) is returned unchanged.

    Returns
    -------
    str
        The dictionary values joined by ', '; an empty string for '{}'.

    Raises
    ------
    ValueError, SyntaxError
        If the text is neither valid JSON nor a valid Python literal.
    """
    # Missing cells arrive as float NaN when used with Series.apply; pass them through
    # instead of crashing inside the parser.
    if not isinstance(str_dict, str):
        return str_dict
    try:
        # JSON parsing combines \uD83D\uDE00-style surrogate pairs into one character;
        # ast.literal_eval would leave two lone surrogates that cannot be encoded.
        dic = json.loads(str_dict)
    except json.JSONDecodeError:
        # Keep accepting Python-literal dictionaries (e.g. single-quoted keys).
        dic = ast.literal_eval(str_dict)
    return ', '.join(dic.values())

In [30]:
# Build the cleaned movie frame from the raw one: each dictionary-string column
# is replaced by a plain comma-separated string of its values.
mov = raw_mov.assign(
    **{
        column: raw_mov[column].apply(extract_string_from_dict)
        for column in ('Movie_Languages', 'Movie_Countries', 'Movie_Genres')
    }
)
mov.head()

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Name,Movie_Release_Date,Revenue,Movie_Runtime,Movie_Languages,Movie_Countries,Movie_Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,English Language,United States of America,"Thriller, Science Fiction, Horror, Adventure, ..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,English Language,United States of America,"Mystery, Biographical film, Drama, Crime Drama"
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,Norwegian Language,Norway,"Crime Fiction, Drama"
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,English Language,United Kingdom,"Thriller, Erotic thriller, Psychological thriller"
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,German Language,Germany,Drama


In [42]:
# character names cluster
# Two tab-separated fields per line: the character name and a
# Freebase character/actor map ID.
char_name_cluster = pd.read_csv(
    NAME_CLUSTERS,
    sep='\t',
    header=None,
    names=['Character_Name', 'Freebase_Char_Actor_Map_ID'],
)
char_name_cluster.head()

Unnamed: 0,Character_Name,Freebase_Char_Actor_Map_ID
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn


In [43]:
# plot summaries
# The summaries are free text and may contain double quotes. With pandas' default
# CSV quoting a field that starts with '"' is read as a quoted field, which strips
# the quotes and can swallow tabs/newlines until the next '"' (merging rows).
# QUOTE_NONE disables quote processing so every line stays exactly one record.
plot_sum = pd.read_csv(
    PLOT_SUM,
    sep='\t',
    names=['Wikipedia_Movie_ID', 'Summary'],
    quoting=csv.QUOTE_NONE,
)
plot_sum.head()

Unnamed: 0,Wikipedia_Movie_ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [44]:
# character types from tvtropes.com
# Each line pairs a character type with a dictionary-like description string;
# the description is kept raw here and unpacked in a later cell.
raw_char_types = pd.read_csv(
    TVTROPES_CLUSTERS,
    sep='\t',
    header=None,
    names=['Character_Type', 'Character_Description'],
)
raw_char_types.head()

Unnamed: 0,Character_Type,Character_Description
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."


`Character_Description` is a dictionary stored as a string. It contains the character's name, the movie title, the Freebase character/actor map ID and the actor's name. Each of these fields is extracted into its own column below:

In [49]:
def extract_infos_from_description(string, feature):
    """
    Parse a dictionary stored as a string and return the value for one key.

    Parameters
    ----------
    string : str
        JSON (or Python-literal) text of a dictionary, e.g.
        '{"char": "Daniel Jackson", "movie": "Stargate", ...}'.
        Non-string input (e.g. NaN from an empty cell) is returned unchanged.
    feature : str
        Key whose value should be extracted.

    Returns
    -------
    The value stored under `feature`.

    Raises
    ------
    KeyError
        If `feature` is not a key of the parsed dictionary.
    ValueError, SyntaxError
        If the text is neither valid JSON nor a valid Python literal.
    """
    # Missing cells arrive as float NaN when used with Series.apply; pass them through
    # instead of crashing inside the parser.
    if not isinstance(string, str):
        return string
    try:
        # JSON parsing combines \uD83D\uDE00-style surrogate pairs into one character;
        # ast.literal_eval would leave two lone surrogates that cannot be encoded.
        des_dict = json.loads(string)
    except json.JSONDecodeError:
        # Keep accepting Python-literal dictionaries (e.g. single-quoted keys).
        des_dict = ast.literal_eval(string)
    return des_dict[feature]

In [60]:
# Unpack the description string into one column per field. The pairs below map
# each new column name to the key it is read from inside Character_Description;
# their order is the column order of the resulting frame.
char_types = raw_char_types[['Character_Type']].assign(
    **{
        column: raw_char_types.Character_Description.apply(
            extract_infos_from_description, feature=key
        )
        for column, key in (
            ('Character_Name', 'char'),
            ('Movie_Name', 'movie'),
            ('Freebase_Char_Actor_Map_ID', 'id'),
            ('Actor_Name', 'actor'),
        )
    }
)
char_types.head()

Unnamed: 0,Character_Type,Character_Name,Movie_Name,Freebase_Char_Actor_Map_ID,Actor_Name
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
