In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

In [2]:
data_path = Path('../data/')
cmu_path = data_path.joinpath('raw/MovieSummaries/')

### Expand character data

In [3]:
col_names = ['Character type', 'data']
tvtropes_clusters = pd.read_csv(cmu_path.joinpath('tvtropes.clusters.txt'), delimiter='\t', names=col_names)
tvtropes_clusters

Unnamed: 0,Character type,data
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."
...,...,...
496,young_gun,"{""char"": ""Morgan Earp"", ""movie"": ""Tombstone"", ..."
497,young_gun,"{""char"": ""Colorado Ryan"", ""movie"": ""Rio Bravo""..."
498,young_gun,"{""char"": ""Tom Sawyer"", ""movie"": ""The League of..."
499,young_gun,"{""char"": ""William H. 'Billy the Kid' Bonney"", ..."


In [4]:
def filter_func(row):
    data = eval(row['data'])
    keys, values = list(data.keys()), list(data.values())
    row[list(keys)] = list(values)
    return row

In [5]:
characters = tvtropes_clusters.copy()
characters = characters.assign(char=None, movie=None, id=None, actor=None)
characters = characters.transform(filter_func, axis=1).drop('data', axis=1)
characters

Unnamed: 0,Character type,char,movie,id,actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...,...
496,young_gun,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,young_gun,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,young_gun,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


In [6]:
characters.to_csv(data_path.joinpath('processed/cmu_characters.csv'))

### Expand Movie Languages

In [7]:
col_names = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)'
]
movie = pd.read_csv(cmu_path.joinpath('movie.metadata.tsv'), delimiter='\t', names=col_names)
movie.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [8]:
movie['Movie languages (Freebase ID:name tuples)'].isna().sum()

0

In [30]:
# Get all Freebase genre id and genre string pairs

langs_data = movie['Movie languages (Freebase ID:name tuples)'].transform(eval)

langs_id2literal = dict()
langs_data.apply(lambda x: langs_id2literal.update(x))
del langs_id2literal['/m/0gtg'] # This key appears rarely and does not mean anything

In [44]:
def filter_func(row):
    # Check available languages for a movie
    langs = row['lang_data']
    if '/m/0gtg' in langs:  del langs['/m/0gtg']
    langs = list(langs.values())
    row[langs] = 1
    return row

In [51]:
all_languages = list(langs_id2literal.values()) # All languages in the dataset

df_langs = pd.DataFrame(columns=all_languages)
df_langs['Freebase movie ID'] = movie['Freebase movie ID']
df_langs['lang_data'] = langs_data
df_langs[all_languages] = 0

df_langs = df_langs.apply(filter_func, axis=1).drop('lang_data', axis=1)

In [52]:
df_langs

Unnamed: 0,English Language,Norwegian Language,German Language,Silent film,Spanish Language,Japanese Language,Turkish Language,Russian Language,Italian Language,Tamil Language,...,Osetin Language,Deutsch,Nahuatl languages,Hainanese,Chewa language,Haryanvi Language,Assyrian language,Papiamento language,Kuna language,Freebase movie ID
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/03vyhn
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/08yl5d
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0crgdbh
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0285_cd
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/01mrr1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0j7hxnt
81737,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0g4pl34
81738,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/02pygw1
81739,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/03pcrp


In [53]:
df_langs.to_csv(data_path.joinpath('processed/cmu_languages.csv'))

### Expand movie release countries

In [55]:
movie['Movie countries (Freebase ID:name tuples)'].isna().sum()

0

In [57]:
# Get all Freebase country id and country string pairs

country_data = movie['Movie countries (Freebase ID:name tuples)'].transform(eval)

country_id2literal = dict()
country_data.apply(lambda x: country_id2literal.update(x))

0        None
1        None
2        None
3        None
4        None
         ... 
81736    None
81737    None
81738    None
81739    None
81740    None
Name: Movie countries (Freebase ID:name tuples), Length: 81741, dtype: object

In [60]:
def filter_func(row):
    # Check countries for a movie
    countries = row['country_data']
    countries = list(countries.values())
    row[countries] = 1
    return row

In [62]:
all_countries = list(country_id2literal.values()) # All countries in the dataset

df_countries = pd.DataFrame(columns=all_countries)
df_countries['Freebase movie ID'] = movie['Freebase movie ID']
df_countries['country_data'] = country_data
df_countries[all_countries] = 0

df_countries = df_countries.apply(filter_func, axis=1).drop('country_data', axis=1)

In [63]:
df_countries

Unnamed: 0,United States of America,Norway,United Kingdom,Germany,South Africa,Argentina,Japan,Turkey,German Democratic Republic,Soviet Union,...,Guinea-Bissau,Haiti,Republic of China,Malayalam Language,Macau,Cyprus,Palestinian Territories,German Language,Ukranian SSR,Freebase movie ID
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/03vyhn
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/08yl5d
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0crgdbh
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0285_cd
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/01mrr1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0j7hxnt
81737,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0g4pl34
81738,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/02pygw1
81739,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/03pcrp


In [65]:
df_countries.to_csv(data_path.joinpath('processed/cmu_countries.csv'))

### Expand movie genres

In [69]:
movie['Movie genres (Freebase ID:name tuples)'].isna().sum()

0

In [70]:
# Get all Freebase genre id and country string pairs

genre_data = movie['Movie genres (Freebase ID:name tuples)'].transform(eval)

genre_id2literal = dict()
genre_data.apply(lambda x: genre_id2literal.update(x))

0        {'/m/01jfsb': 'Thriller', '/m/06n90': 'Science...
1        {'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp...
2        {'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...
3        {'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic...
4                                  {'/m/07s9rl0': 'Drama'}
                               ...                        
81736                              {'/m/07s9rl0': 'Drama'}
81737    {'/m/03bxz7': 'Biographical film', '/m/07s9rl0...
81738         {'/m/06nbt': 'Satire', '/m/01z4y': 'Comedy'}
81739    {'/m/06n90': 'Science Fiction', '/m/0gw5n2f': ...
81740    {'/m/01jfsb': 'Thriller', '/m/03npn': 'Horror'...
Name: Movie genres (Freebase ID:name tuples), Length: 81741, dtype: object

In [72]:
def filter_func(row):
    # Check genres for a movie
    genres = row['genre_data']
    genres = list(genres.values())
    row[genres] = 1
    return row

In [75]:
all_genres = list(genre_id2literal.values()) # All countries in the dataset

df_genres = pd.DataFrame(columns=all_genres)
df_genres['Freebase movie ID'] = movie['Freebase movie ID']
df_genres['genre_data'] = genre_data
df_genres[all_genres] = 0

df_genres = df_genres.apply(filter_func, axis=1).drop('genre_data', axis=1)

In [77]:
df_genres

Unnamed: 0,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,Space western,Mystery,Biographical film,Drama,...,Chick flick,Ninja movie,Buddy Picture,Statutory rape,New Queer Cinema,Neorealism,The Netherlands in World War II,Revisionist Fairy Tale,Homoeroticism,Freebase movie ID
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/03vyhn
1,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,/m/08yl5d
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,/m/0crgdbh
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/0285_cd
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,/m/01mrr1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,/m/0j7hxnt
81737,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,/m/0g4pl34
81738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,/m/02pygw1
81739,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,/m/03pcrp


In [76]:
df_genres.to_csv(data_path.joinpath('processed/cmu_genres.csv'))