We pre-process the given datasets and save the results as CSV files.

In [1]:
import os

import pandas as pd

# Relative path of the folder that holds the raw dataset files.
DATA_FOLDER = "../data/"

# Location of each raw input file, derived from DATA_FOLDER.
CHARACTER_DATASET = f"{DATA_FOLDER}character.metadata.tsv"
MOVIE_DATASET = f"{DATA_FOLDER}movie.metadata.tsv"
NAME_CLUSTER_DATASET = f"{DATA_FOLDER}name.clusters.txt"
PLOT_DATASET = f"{DATA_FOLDER}plot_summaries.txt"
TVTROPES_DATASET = f"{DATA_FOLDER}tvtropes.clusters.txt"

In [2]:
# Column labels for the header-less movie TSV, listed in positional order.
_movie_columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_name',
    'movie_release_date',
    'office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
]

# Load the tab-separated file, label its positional columns, and parse the
# release date. errors='coerce' turns values pandas cannot parse into NaT
# instead of raising.
movie_metadata = (
    pd.read_csv(MOVIE_DATASET, sep='\t', header=None)
    .rename(columns=dict(enumerate(_movie_columns)))
    .assign(
        movie_release_date=lambda frame: pd.to_datetime(
            frame['movie_release_date'], errors='coerce'
        )
    )
)

In [3]:
# Column labels for the header-less character TSV, listed in positional order.
_character_columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_birth_date',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age',
    'freebase_map_id',
    'freebase_character_id',
    'freebase_actor_id',
]

# Load the tab-separated file, label its positional columns, and parse both
# date columns. Unparseable values become NaT (errors='coerce').
# Only the birth date is parsed with utc=True, so it ends up timezone-aware
# (UTC) while the release date stays timezone-naive.
character_metadata = (
    pd.read_csv(CHARACTER_DATASET, sep='\t', header=None)
    .rename(columns=dict(enumerate(_character_columns)))
    .assign(
        movie_release_date=lambda frame: pd.to_datetime(
            frame['movie_release_date'], errors='coerce'
        ),
        actor_birth_date=lambda frame: pd.to_datetime(
            frame['actor_birth_date'], utc=True, errors='coerce'
        ),
    )
)

In [4]:
# Parse the plot-summary file. Each non-empty line is read as
# "<movie id> <summary text...>": the first whitespace-separated token is the
# id, the remaining tokens (re-joined with single spaces) are the plot.
plot_summaries_dict = {}
# Decode explicitly as UTF-8: open() otherwise uses the platform default
# encoding, whereas pd.read_csv / to_csv in this notebook default to UTF-8.
with open(PLOT_DATASET, encoding='utf-8') as f:
    for line in f:
        words = line.split()
        if not words:
            # Blank line: nothing to record (indexing words[0] would raise).
            continue
        # A repeated id overwrites the earlier entry (last occurrence wins).
        plot_summaries_dict[words[0]] = " ".join(words[1:])

# Build the frame with named columns directly, so that the columns exist even
# when the input file is empty.
plot_summaries = pd.DataFrame(
    list(plot_summaries_dict.items()), columns=['movie_id', 'plot']
)

In [5]:
# Parse the name-cluster file. Each non-empty line is read as
# "<character name> <id>", where the id is the last whitespace-separated
# token and the name is everything before it. Splitting from the right means
# names made of one, two, or more words are all handled.
_name_cluster_rows = []
# Decode explicitly as UTF-8: open() otherwise uses the platform default
# encoding, whereas pd.read_csv / to_csv in this notebook default to UTF-8.
with open(NAME_CLUSTER_DATASET, encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            continue  # skip blank lines
        parts = line.rsplit(maxsplit=1)
        if len(parts) != 2:
            raise ValueError(
                f"{NAME_CLUSTER_DATASET}:{line_number}: expected '<name> <id>', got {line!r}"
            )
        raw_name, map_id = parts
        # Normalise runs of whitespace inside the name to single spaces.
        _name_cluster_rows.append((" ".join(raw_name.split()), map_id))

# One row per input line: a name that appears on several lines keeps every
# one of its ids instead of only the last one.
name_clusters = pd.DataFrame(
    _name_cluster_rows, columns=['character_name', 'freebase_map_id']
)

# Kept for backward compatibility with code that reads the dict: maps each
# name to the LAST id seen for it, so it is lossy when a name repeats.
name_clusters_dict = dict(_name_cluster_rows)

In [6]:
import ast
# Parse the TV-tropes cluster file. Each non-empty line is read as
# "<character type> <dict literal>"; the dict's values are appended, in the
# dict's own order, after the character type to form one row.
tvtropes_list = []
# Decode explicitly as UTF-8: open() otherwise uses the platform default
# encoding, whereas pd.read_csv / to_csv in this notebook default to UTF-8.
with open(TVTROPES_DATASET, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines
        # Split only once so the payload is passed through untouched;
        # splitting on every space and re-joining would collapse runs of
        # whitespace inside the string values.
        character_type, payload = line.split(maxsplit=1)
        # NOTE(review): if the payload is strict JSON, json.loads would be the
        # more appropriate parser (literal_eval rejects null/true/false) - confirm.
        tvtropes_list.append([character_type] + list(ast.literal_eval(payload.strip()).values()))

# NOTE(review): the column labels below rely on every dict listing its values
# in the same order (name, movie, id, actor) - confirm against the data.
tvtropes_cluster = pd.DataFrame(tvtropes_list).rename(columns={0: 'character_type', 1 : 'character_name', 2 : 'movie_name', 3 : 'movie_id', 4 : 'actor_name'})

In [7]:
# Relative path of the folder that receives the processed CSV files.
PROCESSED_DATA_FOLDER = '../processed_data/'

# Destination of each processed table, derived from PROCESSED_DATA_FOLDER.
PROCESSED_CHARACTER = f'{PROCESSED_DATA_FOLDER}character_metadata.csv'
PROCESSED_MOVIE = f'{PROCESSED_DATA_FOLDER}movie_metadata.csv'
PROCESSED_NAME = f'{PROCESSED_DATA_FOLDER}name_clusters.csv'
PROCESSED_PLOT = f'{PROCESSED_DATA_FOLDER}plot_summaries.csv'
PROCESSED_TVTROPES = f'{PROCESSED_DATA_FOLDER}tvtropes_clusters.csv'

In [8]:
# Make sure the output folder exists before writing: to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs(PROCESSED_DATA_FOLDER, exist_ok=True)

# Write every processed table to its CSV file, without the row index.
character_metadata.to_csv(PROCESSED_CHARACTER, index=False)
movie_metadata.to_csv(PROCESSED_MOVIE, index=False)
plot_summaries.to_csv(PROCESSED_PLOT, index=False)
name_clusters.to_csv(PROCESSED_NAME, index=False)
tvtropes_cluster.to_csv(PROCESSED_TVTROPES, index=False)