# EDA notebook of Jeremy

## Set Up

In [1]:
import numpy as np
import pandas as pd
import itertools
import json

## Load Data

#### Movies

In [2]:
# Movie metadata: column names are supplied explicitly and the Wikipedia id
# becomes the index of the frame.
movie_meta_data = pd.read_csv(
    "../../data/MovieSummaries/movie.metadata.tsv",
    sep="\t",
    index_col="wikipedia_id",
    names=[
        "wikipedia_id",
        "freebase_id",
        "name",
        "release_date",
        "revenue",
        "runtime",
        "languages",
        "countries",
        "genres",
    ],
)

# Plot summaries, indexed by the same Wikipedia id.
movie_plots = pd.read_csv(
    "../../data/MovieSummaries/plot_summaries.txt",
    sep="\t",
    index_col="wikipedia_id",
    names=["wikipedia_id", "plot"],
)

# Attach the plot to each movie through the shared index; movies without a
# plot summary are kept.
movie_raw_df = movie_meta_data.join(movie_plots, how="left")

#### Characters

In [3]:
# Character metadata: column names are supplied explicitly and the Freebase
# character id becomes the index of the frame.
character_meta_data = pd.read_csv(
    "../../data/MovieSummaries/character.metadata.tsv",
    sep="\t",
    index_col="freebase_character_id",
    names=[
        "wikipedia_movie_id",
        "freebase_movie_id",
        "release_date",
        "character_name",
        "actor_birth_date",
        "actor_gender",
        "actor_height",
        "actor_ethnicity",
        "actor_name",
        "actor_age_at_release_date",
        "freebase_map_id",
        "freebase_character_id",
        "freebase_actor_id",
    ],
)

## Format Data

### Helpers and Parsers

In [4]:
def freebase_dict_parser_python(entry):
    """Parse a raw Freebase entry using plain string operations only.

    Args:
        entry: The raw string of the form '{"<freebase id>": "<name>", ...}'.

    Returns:
        The names in order of appearance, lower-cased, with a trailing
        "language" removed together with the whitespace in front of it
        (e.g. "English Language" becomes "english").
    """
    # Splitting on the double quote leaves the quoted strings at the odd
    # positions, alternating id, name, id, name, ... Unlike splitting on
    # commas, this keeps names that themselves contain a comma in one piece.
    quoted_strings = entry.split('"')[1::2]
    names = quoted_strings[1::2]
    # rstrip drops the space that separated the name from "language".
    return [name.lower().removesuffix("language").rstrip() for name in names]

def apply_entry_level_filter(entry,filter_dict):
    """ Return the entry with every key of the filter dictionary replaced by its value.

        The replacements are applied one after the other, in the iteration
        order of the dictionary, each on the result of the previous one.
    """
    filtered = entry
    for term in filter_dict:
        filtered = filtered.replace(term, filter_dict[term])
    return filtered

def freebase_dict_parser(entry,filter_dict):
    """ Parse a raw freebase based entry (a JSON object) into a list of names.

        Args:
            entry: JSON text of an object whose values are the names.
            filter_dict: replacements applied to every lower-cased name,
                see apply_entry_level_filter.

        Returns:
            The distinct filtered names, in order of first appearance.
            An empty JSON object gives an empty list.
    """
    names = json.loads(entry).values()
    filtered_names = (apply_entry_level_filter(name.lower(), filter_dict)
                      for name in names)
    # dict.fromkeys removes duplicates while keeping the first-seen order, so
    # the result is the same on every run (the iteration order of a set of
    # strings changes with hash randomisation).
    return list(dict.fromkeys(filtered_names))
    
def create_flat_movie_entry_list(entry_name,movie_raw_df,filter_dict):
    """ Create a flat list with the movie ids together with the given entry type.

        Args:
            entry_name: column of movie_raw_df holding the raw freebase entries.
            movie_raw_df: movie dataframe, indexed by movie id.
            filter_dict: replacements forwarded to freebase_dict_parser.

        Returns:
            A list of distinct (movie id, entry) tuples, ordered by first
            appearance in the dataframe.
    """
    # Iterate over the column directly instead of going through a dict keyed
    # by index label, so rows sharing the same movie id are all taken into account.
    pairs = ((movie_id, entry)
             for movie_id, raw_entry in movie_raw_df[entry_name].items()
             for entry in freebase_dict_parser(raw_entry, filter_dict))
    # Order-preserving de-duplication: the output no longer depends on the
    # iteration order of a set, which varies from one run to the next.
    return list(dict.fromkeys(pairs))

def create_entry_and_relation_table(movie_raw_df,entry_name,
                                    entry_id_name,movie_id_name, filter_dict=None):
    """ Creates the tables for both the given entity and its relation table with the movies.

        Args:
            movie_raw_df: movie dataframe, indexed by movie id.
            entry_name: column of movie_raw_df holding the raw entries.
            entry_id_name: name given to the entity column / entity index.
            movie_id_name: name given to the movie id column of the relation.
            filter_dict: optional replacements applied to every entry name;
                no replacement is made when omitted.

        Returns:
            A tuple (entry_df, entry_relation_df): entry_df is indexed by the
            distinct entry names, entry_relation_df has one row per
            (movie id, entry name) pair.
    """
    # A None sentinel replaces the former `dict()` default, which was a single
    # object shared by every call.
    if filter_dict is None:
        filter_dict = {}
    relation_rows = create_flat_movie_entry_list(entry_name, movie_raw_df, filter_dict)
    entry_relation_df = pd.DataFrame(relation_rows,
                                     columns=[movie_id_name, entry_id_name])
    distinct_entries = entry_relation_df[entry_id_name].unique()
    entry_df = pd.DataFrame({entry_id_name: distinct_entries}).set_index([entry_id_name])
    return entry_df, entry_relation_df

### Tables Creation

#### Country table and relation

In [5]:
# Country entity table and the movie <-> country relation.
country_df, comes_from_df = create_entry_and_relation_table(
    movie_raw_df,
    entry_name="countries",
    entry_id_name="country_name",
    movie_id_name="movie_id",
)

#### Genre table and relation

In [6]:
# Genre entity table and the movie <-> genre relation.
genre_df, is_of_type_df = create_entry_and_relation_table(
    movie_raw_df,
    entry_name="genres",
    entry_id_name="genre_name",
    movie_id_name="movie_id",
)

#### Language table and relation

In [7]:
# Language entity table and the movie <-> language relation; the " language"
# part of every name is removed by the filter.
language_df, spoken_languages_df = create_entry_and_relation_table(
    movie_raw_df,
    entry_name="languages",
    entry_id_name="language_name",
    movie_id_name="movie_id",
    filter_dict={" language": ""},
)

#### Character Table

In [8]:
# Keep only the rows that have a Freebase character id (original author's
# note: the rows without an id never carry a character name).
character_df = character_meta_data.loc[
    character_meta_data.index.notna(), ["character_name"]
]
# One row per character id: the first occurrence is kept.
character_df = character_df.loc[~character_df.index.duplicated()]
character_df = character_df.rename_axis("character_id")

#### Actor Table

In [9]:
# Column names of the actor table (the id entry has no effect on the column
# rename below because the id is the index at that point).
actor_table_columns_mapping = {"actor_birth_date":"birth_date","actor_gender":"gender",
                                "actor_height":"height","actor_ethnicity":"ethnicity",
                                "actor_name":"name","freebase_actor_id":"actor_id"}

# Actor attributes, indexed by the Freebase actor id.
actor_df = character_meta_data[
    ["actor_name", "actor_gender", "actor_height",
     "actor_ethnicity", "actor_birth_date", "freebase_actor_id"]
].set_index("freebase_actor_id")
# Discard rows without an actor id, then keep the first row of each id.
actor_df = actor_df[actor_df.index.notna()]
actor_df = actor_df[~actor_df.index.duplicated()]
actor_df = actor_df.rename(columns=actor_table_columns_mapping)
actor_df.index = actor_df.index.rename("actor_id")
# Actors for which every attribute is missing are discarded as well.
actor_df = actor_df.dropna(how="all")

#### Movie Table

In [10]:
# Scalar movie attributes only; the index is exposed under the name "movie_id".
movie_df = movie_raw_df[
    ["name", "release_date", "revenue", "runtime", "freebase_id", "plot"]
].rename_axis("movie_id")

#### Belongs_To Table

In [11]:
belongs_to_table_columns_mapping = {"freebase_character_id":"character_id","wikipedia_movie_id":"movie_id"}

# Movie of every character that has a Freebase character id.
belongs_to_df = character_meta_data.loc[
    character_meta_data.index.notna(), ["wikipedia_movie_id"]
]
# Turn the character id into a column, drop repeated (character, movie)
# pairs and renumber the rows from 0.
belongs_to_df = belongs_to_df.reset_index().drop_duplicates().reset_index(drop=True)
belongs_to_df = belongs_to_df[["freebase_character_id", "wikipedia_movie_id"]]
belongs_to_df = belongs_to_df.rename(columns=belongs_to_table_columns_mapping)

#### Plays Table

In [12]:
play_table_columns_mapping = {"freebase_character_id":"character_id",
                                    "freebase_actor_id":"actor_id"}

# Actor and map id of every character that has a Freebase character id.
play_df = character_meta_data.loc[
    character_meta_data.index.notna(), ["freebase_actor_id", "freebase_map_id"]
]
# Turn the character id into a column, drop repeated rows and renumber the
# rows from 0.
play_df = play_df.reset_index().drop_duplicates().reset_index(drop=True)
play_df = play_df[["freebase_actor_id", "freebase_character_id", "freebase_map_id"]]
play_df = play_df.rename(columns=play_table_columns_mapping)

#### Appears In Table

In [13]:
appears_in_table_columns_mapping = {"wikipedia_movie_id":"movie_id",
                                    "freebase_actor_id":"actor_id",
                                    "actor_age_at_release_date":"actor_age"}

# One row per (actor, movie, age): an actor can be listed several times for
# the same movie (e.g. the same actor for clark kent and superman).
appears_in_df = character_meta_data.reset_index()
appears_in_df = appears_in_df[
    ["freebase_actor_id", "wikipedia_movie_id", "actor_age_at_release_date"]
].drop_duplicates()
appears_in_df = appears_in_df.rename(columns=appears_in_table_columns_mapping)

#### Merge duplicated actors entries

In [14]:
def retrieve_duplicated_actors_ids(actor_dataframe):
    """ Retrieve the indices of duplicated actors in the given df.

        Candidates are the rows of the df that are fully identical to another
        row and that have both a name and a birth date. For every
        (name, birth date) found among them, all the rows of the df sharing
        that name and that birth date are considered to be the same actor.
        Birth dates of 4 characters or less (a year only) are ignored: the
        same year is not considered to be the same birth date.

        Args:
            actor_dataframe: actor table indexed by actor id, with at least
                the columns "name" and "birth_date" (birth dates as strings).

        Returns:
            A list of [conserved_id, thrown_id] pairs, ordered by name then
            birth date. The conserved id is the first id of the actor in the
            df; an actor present N times gives N - 1 pairs.
    """
    fully_duplicated = actor_dataframe[actor_dataframe.duplicated(keep=False)]
    # Rows without a name or a birth date cannot be assimilated one to
    # another (and a missing name could not be matched back in the df).
    candidates = fully_duplicated.dropna(subset=["name", "birth_date"])

    duplicated_actors_ids = []
    # Group on both keys so that every birth date of a name is examined, not
    # only the first two rows of each name.
    for (actor_name, birth_date), _ in candidates.groupby(["name", "birth_date"]):
        if len(birth_date) <= 4:
            continue
        same_actor = ((actor_dataframe["name"] == actor_name)
                      & (actor_dataframe["birth_date"] == birth_date))
        conserved_id, *thrown_ids = actor_dataframe[same_actor].index.to_list()
        # Always emit pairs, whatever the number of copies of the actor.
        duplicated_actors_ids.extend(
            [conserved_id, thrown_id] for thrown_id in thrown_ids)
    return duplicated_actors_ids

def rematch_duplicated_actor_ids(duplicated_ids,actor_dataframe,relationship_dataframes):
    """ Merge the duplicated actors into a single entry.

        Args:
            duplicated_ids: iterable of id groups; the first id of a group is
                conserved and every other id of the group is thrown away.
                Groups with fewer than two ids are ignored.
            actor_dataframe: actor table indexed by actor id. The thrown ids
                are dropped from it in place.
            relationship_dataframes: dataframes with an "actor_id" column.
                Thrown ids are replaced by the conserved one and the rows
                that become duplicated are dropped, both in place.
    """
    # Build the whole thrown -> conserved mapping first, so that every
    # relation table is rewritten in a single pass instead of once per pair.
    id_mapping = {}
    for id_group in duplicated_ids:
        if len(id_group) < 2:
            continue
        conserved_id, *thrown_ids = id_group
        for thrown_id in thrown_ids:
            id_mapping[thrown_id] = conserved_id
    if not id_mapping:
        return

    actor_dataframe.drop(list(id_mapping), inplace=True)
    for relation_df in relationship_dataframes:
        relation_df["actor_id"] = relation_df["actor_id"].map(
            lambda idx: id_mapping.get(idx, idx))
        relation_df.drop_duplicates(inplace=True)
        
def proces_duplicated_actors(actor_dataframe,relationship_dataframes):
    """ Find the actors present several times in the actor table and merge them.

        The ids found by retrieve_duplicated_actors_ids are handed over to
        rematch_duplicated_actor_ids together with the actor table and the
        relation tables.
    """
    rematch_duplicated_actor_ids(
        retrieve_duplicated_actors_ids(actor_dataframe),
        actor_dataframe,
        relationship_dataframes,
    )

In [15]:
# Merge the duplicated actors in the actor table and in the two relation
# tables that reference actor ids.
proces_duplicated_actors(
    actor_dataframe=actor_df,
    relationship_dataframes=[play_df, appears_in_df],
)

### Save Tables

In [16]:
# Entity tables.
movie_df.to_pickle("../../data/Tables/movie_df.pkl")
character_df.to_pickle("../../data/Tables/character_df.pkl")
actor_df.to_pickle("../../data/Tables/actor_df.pkl")
country_df.to_pickle("../../data/Tables/country_df.pkl")
genre_df.to_pickle("../../data/Tables/genre_df.pkl")
language_df.to_pickle("../../data/Tables/language_df.pkl")

# Relation tables.
comes_from_df.to_pickle("../../data/Tables/comes_from_df.pkl")
is_of_type_df.to_pickle("../../data/Tables/is_of_type_df.pkl")
spoken_languages_df.to_pickle("../../data/Tables/spoken_languages_df.pkl")
belongs_to_df.to_pickle("../../data/Tables/belongs_to_df.pkl")
play_df.to_pickle("../../data/Tables/play_df.pkl")
appears_in_df.to_pickle("../../data/Tables/appears_in_df.pkl")