# Preprocess the datasets

In [None]:
import pandas as pd
import json
from utils import autoparse_year

## Summaries

In [None]:
# Load the plot summaries from a tab-separated file. The file's first line is
# treated as data (no header row); the two columns are named here.
df_summary = pd.read_csv(
	'../data/raw/MovieSummaries/plot_summaries.txt',
	# index_col='wiki_id',
	header=None,
	names=['wiki_id', 'summary'],
	dtype={'summary': pd.StringDtype()},
	sep='\t',
)
# NOTE: index_col is disabled above, so this checks the default positional
# index, not the uniqueness of wiki_id.
assert df_summary.index.is_unique
df_summary

In [None]:
# Persist the summaries frame for the downstream steps.
pd.to_pickle(df_summary, '../data/generated/preprocessed/summary.pkl')

### Movies dataset

In [None]:
# Load the movie metadata from a tab-separated file without a header row;
# the column names are supplied here.
df_movies = pd.read_csv(
	'../data/raw/MovieSummaries/movie.metadata.tsv', sep='\t',
	# index_col='wiki_id',
	names=['wiki_id', 'fb_id', 'movie_name', 'movie_release', 'movie_revenue', 'movie_runtime', 'movie_languages', 'movie_countries', 'movie_genres']
)
# Unwrap the mappings: each cell of these columns is a JSON object, and only
# its values are kept, as a list, row by row.
# The result must stay a per-row Series: indexing it with [0][0] would pick
# the first value of the first row and broadcast that one scalar over the
# whole column.
for col in ('movie_languages', 'movie_countries', 'movie_genres'):
	df_movies[col] = df_movies[col].map(lambda raw: list(json.loads(raw).values()))
# Nullable integer dtype so that rows without a parsed year can hold <NA>.
df_movies['movie_release_year'] = df_movies.movie_release.apply(autoparse_year).astype('Int64')
# TODO : convert dates to months also, where applicable

# NOTE: index_col is disabled above, so this checks the default positional
# index, not the uniqueness of wiki_id.
assert df_movies.index.is_unique
df_movies

#### Correcting some anomalies

In [None]:
# Show the rows whose parsed release year is 1010.
df_movies.loc[df_movies['movie_release_year'] == 1010]

In [None]:
# Rows with a parsed release year of 1010 are treated as an anomaly: both the
# release date and the derived year are overwritten.
# The mask is computed before either write so both updates hit the same rows.
is_year_1010 = df_movies['movie_release_year'] == 1010
df_movies.loc[is_year_1010, 'movie_release'] = '2010-12-02'
df_movies.loc[is_year_1010, 'movie_release_year'] = 2010

In [None]:
# Persist the corrected movies frame for the downstream steps.
pd.to_pickle(df_movies, '../data/generated/preprocessed/movies.pkl')

### Character dataset

In [None]:
# Load the character metadata from a tab-separated file without a header row,
# then discard the column labelled as useless right away.
df_chars = pd.read_csv(
	'../data/raw/MovieSummaries/character.metadata.tsv',
	sep='\t',
	header=None,
	names=[
		'wiki_id', 'fb_movie_id', 'release', 'character_name',
		'actor_birth', 'actor_gender', 'actor_height', 'ethnicity',
		'actor_name', 'actor_age', 'fb_char_id', 'useless_fb_char_id',
		'fb_actor_id',
	],
).drop(columns='useless_fb_char_id')
# No index column is set, so this checks the default positional index.
assert df_chars.index.is_unique

In [None]:
# Two lookup tables: actor id -> nationality id, and nationality id ->
# nationality name. The first line of each file is skipped and the columns
# are named explicitly.
df_actor2nationality_id = pd.read_csv(
	"../data/raw/extra/actor_id2nationality_id.csv",
	names=["fb_actor_id", "nationality_id"],
	skiprows=1,
)
df_nationality_id2nationality = pd.read_csv(
	"../data/raw/extra/nationality_id2nationality.csv",
	names=["nationality_id", "actor_nationality"],
	skiprows=1,
)
# Inner-join on the shared id, then drop the id: only the
# (fb_actor_id, actor_nationality) pairs are kept.
df_nationality = (
	df_actor2nationality_id
	.merge(df_nationality_id2nationality, on="nationality_id")
	.drop(columns="nationality_id")
)

In [None]:
# Keep only the first nationality row found for each actor id.
df_nationality = df_nationality.drop_duplicates(subset="fb_actor_id", keep="first")

In [None]:
# Left join: every character row is kept, and actor_nationality is missing
# where the actor id has no entry in df_nationality.
df_chars = df_chars.merge(df_nationality, how="left", on="fb_actor_id")


In [None]:
# Report how many character rows have a nationality.
# count() ignores missing values, so this is the number of resolved rows.
with_nationnalities = df_chars.actor_nationality.count()
# Share of rows that HAVE a nationality: the message below describes the
# resolved rows, so the percentage must not be the complement.
ratio = 100.0 * with_nationnalities / len(df_chars)
print("There are %d rows with resolved nationalities, this is %.01f%% of rows"%(with_nationnalities, ratio))

In [None]:
# Ethnic groups mapping (queried from wikidata)
# Ethnic groups mapping (queried from wikidata), indexed by freebaseID
ethnic_groups = pd.read_csv('../data/raw/extra/ethnic_groups.csv', index_col="freebaseID")

# Throwaway left merge, used only to measure how many of the ethnicity IDs
# present in df_chars have a name in the mapping.
found_ethnicities = pd.merge(df_chars, ethnic_groups, how="left", left_on="ethnicity", right_on="freebaseID")
now_count = found_ethnicities.name.count()
previous_count = df_chars.ethnicity.count()
ratio = 100.0 - 100.0*(now_count/previous_count)
print("There were %d rows with ethnicities, and we can resolve %d of them. We lost %.1f%% of rows"% (previous_count, now_count, ratio))

# freebaseID -> name lookup. Series.iteritems() no longer exists in
# pandas >= 2.0; to_dict() builds the same index -> value dictionary.
dictionnary = ethnic_groups.name.to_dict()

# Replace each ethnicity ID by its name; IDs missing from the lookup become NaN.
df_chars.ethnicity = df_chars.ethnicity.map(dictionnary)
df_chars

In [None]:
# Persist the enriched characters frame for the downstream steps.
pd.to_pickle(df_chars, '../data/generated/preprocessed/characters.pkl')

### TV tropes

In [None]:
# Load the trope clusters: a tab-separated file without a header row, where
# the second column holds a JSON object.
df_tropes = pd.read_csv(
	'../data/raw/MovieSummaries/tvtropes.clusters.txt',
	sep='\t',
	header=None,
	names=['trope', 'char_movie_id'],
)
# Decode each JSON payload once, then spread its fields into their own columns.
trope_payload = df_tropes['char_movie_id'].map(json.loads)
df_tropes['char_name'] = trope_payload.map(lambda rec: rec['char'])
df_tropes['movie_name'] = trope_payload.map(lambda rec: rec['movie'])
df_tropes['actor_name'] = trope_payload.map(lambda rec: rec['actor'])
df_tropes['fb_id'] = trope_payload.map(lambda rec: rec['id'])
# The raw JSON column is no longer needed; remove it in place.
del df_tropes['char_movie_id']
df_tropes

In [None]:
# Persist the flattened tropes frame for the downstream steps.
pd.to_pickle(df_tropes, '../data/generated/preprocessed/tropes.pkl')

### Character clusters

In [None]:
# Load the character-name clusters: a tab-separated file without a header
# row; the two columns are named here.
df_charclusters = pd.read_csv(
	'../data/raw/MovieSummaries/name.clusters.txt',
	header=None,
	names=['char_name', 'fb_id'],
	sep='\t',
)
df_charclusters

In [None]:
# Persist the character clusters frame for the downstream steps.
pd.to_pickle(df_charclusters, '../data/generated/preprocessed/character_clusters.pkl')