# Preprocess the datasets

In [1]:
import pandas as pd
import json
from utils import autoparse_year

## Summaries

In [2]:
df_summary = pd.read_csv(
	'../data/raw/MovieSummaries/plot_summaries.txt',
	sep='\t',
	# index_col='wiki_id',
	names=['wiki_id', 'summary'],
	dtype={'summary': pd.StringDtype()}
)
assert df_summary.index.is_unique
df_summary

Unnamed: 0,wiki_id,summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [3]:
df_summary.to_pickle('../data/generated/preprocessed/summary.pkl')

### Movies dataset

In [4]:
df_movies = pd.read_csv(
	'../data/raw/MovieSummaries/movie.metadata.tsv', sep='\t',
	# index_col='wiki_id',
	names=['wiki_id', 'fb_id', 'movie_name', 'movie_release', 'movie_revenue', 'movie_runtime', 'movie_languages', 'movie_countries', 'movie_genres']
)
# unwrap the mappings
df_movies.movie_languages = df_movies.movie_languages.map(lambda x: list(json.loads(x).values()))
df_movies.movie_countries = df_movies.movie_countries.map(lambda x: list(json.loads(x).values()))
df_movies.movie_genres = df_movies.movie_genres.map(lambda x: list(json.loads(x).values()))
df_movies['movie_release_year'] = df_movies.movie_release.apply(autoparse_year).astype('Int64')
# TODO : convert dates to months also, where applicable

assert df_movies.index.is_unique
df_movies

Unnamed: 0,wiki_id,fb_id,movie_name,movie_release,movie_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_release_year
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",2001
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2000
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",1988
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",1987
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,[German Language],[Germany],[Drama],1983
...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,[English Language],[United States of America],[Drama],2011
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,[English Language],"[Ireland, United Kingdom]","[Biographical film, Drama, Documentary]",2011
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,[English Language],[United States of America],"[Satire, Comedy]",1972
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,[Japanese Language],[Japan],"[Science Fiction, Japanese Movies, Adventure, ...",1992


#### Correcting some anomalies

In [5]:
df_movies[df_movies.movie_release_year == 1010]

Unnamed: 0,wiki_id,fb_id,movie_name,movie_release,movie_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_release_year
62836,29666067,/m/0fphzrf,Hunting Season,1010-12-02,12160978.0,140.0,"[Turkish Language, English Language]",[Turkey],"[Crime Fiction, Mystery, Drama, Thriller]",1010


In [6]:
df_movies.loc[df_movies.movie_release_year == 1010, 'movie_release'] = '2010-12-02'
df_movies.loc[df_movies.movie_release_year == 1010, 'movie_release_year'] = 2010

In [7]:
df_movies.to_pickle('../data/generated/preprocessed/movies.pkl')

### Character dataset

In [8]:
df_chars = pd.read_csv(
	'../data/raw/MovieSummaries/character.metadata.tsv', sep='\t',
	names=['wiki_id', 'fb_movie_id', 'release', 'character_name', 'actor_birth', 'actor_gender', 'actor_height', 'ethnicity', 'actor_name', 'actor_age', 'fb_char_id', 'useless_fb_char_id', 'fb_actor_id'],
)
df_chars = df_chars.drop("useless_fb_char_id", axis=1)
assert df_chars.index.is_unique

In [9]:
df_actor2nationnality_id = pd.read_csv("../data/raw/extra/actor_id2nationnality_id.csv", skiprows=1, names=["fb_actor_id", "nationnality_id"])
df_nationnality_id2nationnality = pd.read_csv("../data/raw/extra/nationnality_id2nationnality.csv", skiprows=1, names=["nationnality_id", "actor_nationnality"])
df_nationnality = pd.merge(df_actor2nationnality_id, df_nationnality_id2nationnality, on="nationnality_id")
df_nationnality = df_nationnality.drop("nationnality_id", axis=1)

In [19]:
df_nationnality = df_nationnality.drop_duplicates(subset=["fb_actor_id"])

In [11]:
df_chars = pd.merge(df_chars, df_nationnality, on="fb_actor_id", how="left")


In [12]:
df_chars.actor_nationnality.count()
with_nationnalities = df_chars.actor_nationnality.count()
ratio = 100.0 - 100.0*(with_nationnalities/len(df_chars))
print("There are %d rows with resolved nationnalities, this is %.01f%% of rows"%(with_nationnalities, ratio))

There are 272284 rows with resolved nationnalities, this is 39.6% of rows


In [13]:
# Ethnic groups mapping (queried from wikidata)
ethnic_groups = pd.read_csv('../data/raw/extra/ethnic_groups.csv', index_col="freebaseID")
found_ethnicities = pd.merge(df_chars, ethnic_groups, how="left", left_on="ethnicity", right_on="freebaseID")
now_count = found_ethnicities.name.count()
previous_count = df_chars.ethnicity.count()
ratio = 100.0 - 100.0*(now_count/previous_count)
print("There were %d rows with ethnicities, and we can resolve %d of them. We lost %.1f%% of rows"% (previous_count, now_count, ratio))

dictionnary = {}
for fbID, name in ethnic_groups.name.iteritems():
    dictionnary[fbID] = name

df_chars.ethnicity = df_chars.ethnicity.map(dictionnary)
df_chars

There were 106058 rows with ethnicities, and we can resolve 48652 of them. We lost 54.1% of rows


Unnamed: 0,wiki_id,fb_movie_id,release,character_name,actor_birth,actor_gender,actor_height,ethnicity,actor_name,actor_age,fb_char_id,fb_actor_id,actor_nationnality
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/03wcfv7,United States of America
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,,Natasha Henstridge,27.0,/m/0jys3m,/m/0346l4,Canada
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,,Ice Cube,32.0,/m/0jys3g,/m/01vw26l,
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/034hyc,United Kingdom
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/01y9xg,United States of America
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0b_vcv,
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0bx7_j,
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.730,English people,David Hemmings,15.0,/m/0g8ngmc,/m/022g44,England
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,/m/0g8ngmm,


In [14]:
df_chars.to_pickle('../data/generated/preprocessed/characters.pkl')

### TV tropes

In [15]:
df_tropes = pd.read_csv(
	'../data/raw/MovieSummaries/tvtropes.clusters.txt',
	sep='\t',
	names=['trope', 'char_movie_id']
)
df_tropes['char_name'] = df_tropes.char_movie_id.map(lambda x: json.loads(x)['char'])
df_tropes['movie_name'] = df_tropes.char_movie_id.map(lambda x: json.loads(x)['movie'])
df_tropes['actor_name'] = df_tropes.char_movie_id.map(lambda x: json.loads(x)['actor'])
df_tropes['fb_id'] = df_tropes.char_movie_id.map(lambda x: json.loads(x)['id'])
df_tropes.drop(columns='char_movie_id', inplace=True)
df_tropes

Unnamed: 0,trope,char_name,movie_name,actor_name,fb_id
0,absent_minded_professor,Professor Philip Brainard,Flubber,Robin Williams,/m/0jy9q0
1,absent_minded_professor,Professor Keenbean,Richie Rich,Michael McShane,/m/02vchl3
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,Ian McKellen,/m/0k6fkc
3,absent_minded_professor,Dr. Harold Medford,Them!,Edmund Gwenn,/m/0k6_br
4,absent_minded_professor,Daniel Jackson,Stargate,James Spader,/m/0k3rhh
...,...,...,...,...,...
496,young_gun,Morgan Earp,Tombstone,Bill Paxton,/m/0k776f
497,young_gun,Colorado Ryan,Rio Bravo,Ricky Nelson,/m/0k2kqg
498,young_gun,Tom Sawyer,The League of Extraordinary Gentlemen,Shane West,/m/0k5nsh
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,Emilio Estevez,/m/03lrjk0


In [16]:
df_tropes.to_pickle('../data/generated/preprocessed/tropes.pkl')

### Character clusters

In [17]:
df_charclusters = pd.read_csv(
	'../data/raw/MovieSummaries/name.clusters.txt',
	sep='\t',
	names=['char_name', 'fb_id']
)
df_charclusters

Unnamed: 0,char_name,fb_id
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn
...,...,...
2661,John Rolfe,/m/0k5_ql
2662,John Rolfe,/m/02vd6vs
2663,Elizabeth Swann,/m/0k1xvz
2664,Elizabeth Swann,/m/0k1x_d


In [18]:
df_charclusters.to_pickle('../data/generated/preprocessed/character_clusters.pkl')