# Data Final Formatting

This notebook performs the final data pre-processing steps: it attaches IMDb ratings to the movie table, normalizes country and language names, and filters every table down to the movies that have an IMDb rating.

## Packages

In [1]:
import os

import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns

from utils import data_load

## Load Data

In [2]:
# Directory holding the pickled tables that this notebook reads.
GENERATED_DATA_DIR = "../data/generated"


def load_generated_table(table_name: str):
    """Load ``<GENERATED_DATA_DIR>/<table_name>.pkl`` with ``pd.read_pickle``.

    NOTE(review): unpickling can execute arbitrary code, so only point this at
    files produced by this project's own pipeline.
    """
    return pd.read_pickle(f"{GENERATED_DATA_DIR}/{table_name}.pkl")


country_df = load_generated_table("country_df")
comes_from_df = load_generated_table("comes_from_df")
genre_df = load_generated_table("genre_df")
is_of_type_df = load_generated_table("is_of_type_df")
language_df = load_generated_table("language_df")
spoken_languages_df = load_generated_table("spoken_languages_df")
character_df = load_generated_table("character_df")
actor_df = load_generated_table("actor_df")
movie_df = load_generated_table("movie_df")
belongs_to_df = load_generated_table("belongs_to_df")
play_df = load_generated_table("play_df")
appears_in_df = load_generated_table("appears_in_df")
# The variable is named "..._table" while the file is "..._df.pkl"; later cells use this name.
wikipedia_imdb_mapping_table = load_generated_table("wikipedia_imdb_mapping_df")

## Helpers

In [3]:
# Maps a country label to the canonical label it is merged into.
# Labels that are not listed here are left unchanged by filter_non_imdb_movies.
COUNTRY_MAPPING = {
    # -> "germany"
    "west germany": "germany",
    "weimar republic": "germany",
    "german democratic republic": "germany",
    "nazi germany": "germany",
    "german language": "germany",
    # -> "russia"
    "soviet union": "russia",
    "soviet occupation zone": "russia",
    # -> "yugoslavia"
    "socialist federal republic of yugoslavia": "yugoslavia",
    "federal republic of yugoslavia": "yugoslavia",
    # "... ssr" labels
    "uzbek ssr": "uzbekistan",
    "georgian ssr": "georgia",
    # -> "united kingdom"
    "kingdom of great britain": "united kingdom",
    "northern ireland": "united kingdom",
    "wales": "united kingdom",
    "england": "united kingdom",
    "scotland": "united kingdom",
    # Identity entry: this label maps to itself.
    "serbia and montenegro": "serbia and montenegro",
    "kingdom of italy": "italy",
    "mandatory palestine": "palestinian territories",
    # Both spellings of the key are listed and map to "ukraine".
    "ukranian ssr": "ukraine",
    "ukrainian ssr": "ukraine",
    "malayalam language": "india",
    "slovak republic": "slovakia",
    "republic of china": "china",
}

# Maps a language label to the canonical label it is merged into.
# Labels that are not listed here are left unchanged by filter_non_imdb_movies.
LANGUAGE_MAPPING = {
    # -> "english"
    "american english": "english",
    "australian aboriginal pidgin english": "english",
    "australian english": "english",
    "south african english": "english",
    "jamaican creole english": "english",
    "indian english": "english",
    # -> "sign"
    "american sign": "sign",
    "french sign": "sign",
    # -> "mandarin" / "cantonese"
    "standard mandarin": "mandarin",
    "mandarin chinese": "mandarin",
    "standard cantonese": "cantonese",
    # Qualifier removed from the label
    "assyrian neo-aramaic": "assyrian",
    "standard tibetan": "tibetan",
    "thai, northeastern": "thai",
    "khmer, central": "khmer",
    "farsi, western": "farsi",
    # Other relabelings
    "egyptian, ancient": "ancient egyptian",
    "france": "french",
    "𐐾𐐲𐑉𐑋𐑌𐐲": "tuu",
}

In [4]:
# Column / index names used by the tables handled in this notebook.
MOVIE_ID_COL_NAME = "movie_id"
CHARACTER_ID_COL_NAME = "character_id"
ACTOR_ID_COL_NAME = "actor_id"
IMDB_ID_COL_NAME = "tconst"
AVG_RATING_COL_NAME = "average_rating"
NUM_VOTES_COL_NAME = "num_votes"


def filter_non_imdb_movies(relationship_dataframe: pd.DataFrame, dataframe: pd.DataFrame,
                           movie_ids: set, feature_mapping=None) -> tuple:
    """Restrict a movie-to-feature link table to the given movies and rebuild the feature table.

    Parameters
    ----------
    relationship_dataframe : pd.DataFrame
        Link table with a ``MOVIE_ID_COL_NAME`` column and a column named after
        the index of ``dataframe`` (the feature column).
    dataframe : pd.DataFrame
        Feature table; only the name of its index is used, to locate the
        feature column in ``relationship_dataframe``.
    movie_ids : set
        Movie ids to keep.
    feature_mapping : dict, optional
        Maps feature values to the value they are replaced with. Values
        that are not keys of the mapping are left unchanged. Defaults to no mapping.

    Returns
    -------
    tuple
        ``(new_relationship_df, new_feature_df)``: the filtered, remapped and
        de-duplicated link table with a fresh ``0..n-1`` index, and a
        column-less frame whose sorted index holds the distinct feature values
        that remain in the link table.

    Raises
    ------
    KeyError
        If the index name of ``dataframe`` is not a column of
        ``relationship_dataframe``.
    """
    feature_name = dataframe.index.name
    if feature_name not in relationship_dataframe.columns:
        raise KeyError(
            f"feature column {feature_name!r} (taken from the feature table's index name) "
            "is not a column of the relationship table"
        )
    # A None default avoids sharing one mutable dict between calls.
    mapping = {} if feature_mapping is None else feature_mapping

    is_target_movie = relationship_dataframe[MOVIE_ID_COL_NAME].isin(movie_ids)
    kept_links = relationship_dataframe[is_target_movie].copy()
    kept_links[feature_name] = kept_links[feature_name].map(
        lambda value: mapping.get(value, value)
    )
    # Remapping can make rows identical; drop them first, then renumber so the
    # returned index has no gaps.
    new_relationship_df = kept_links.drop_duplicates().reset_index(drop=True)

    # Build the feature table from the feature column itself rather than from
    # whichever column happens to be last.
    new_feature_df = (
        new_relationship_df[[feature_name]]
        .drop_duplicates()
        .set_index(feature_name)
        .sort_index()
    )
    return new_relationship_df, new_feature_df

## Integrate Ratings

In [5]:
# Index labels of the Wikipedia/IMDb mapping table (matched against movie_df's index below)
# and the IMDb title ids stored in its IMDB_ID_COL_NAME column.
available_imdb_movie_ids = {movie_id for movie_id in wikipedia_imdb_mapping_table.index}
available_imdb_ids = {
    imdb_id for imdb_id in wikipedia_imdb_mapping_table[IMDB_ID_COL_NAME]
}

In [6]:
# Load the IMDb title ratings and keep only the rows whose title id appears in
# the mapping table; the result gets a fresh 0..n-1 index.
imdb_ratings = (
    data_load.load_imdb_title_ratings()
    .loc[lambda ratings: ratings[IMDB_ID_COL_NAME].isin(available_imdb_ids)]
    .copy()
    .reset_index(drop=True)
)

In [7]:
# Build the rated movie table in one chain:
#   1. keep the movies whose id is in the mapping table,
#   2. attach the mapping table's columns (joined on the movie id),
#   3. attach the ratings via the IMDb title id, then drop that id column,
#   4. discard movies without an average rating,
#   5. index the result by movie id again.
new_movie_df = (
    movie_df.loc[movie_df.index.isin(available_imdb_movie_ids)]
    .copy()
    .merge(wikipedia_imdb_mapping_table, how="left", on=MOVIE_ID_COL_NAME)
    .reset_index()
    .merge(imdb_ratings, how="left", on=IMDB_ID_COL_NAME)
    .drop([IMDB_ID_COL_NAME], axis=1)
    .loc[lambda merged: merged[AVG_RATING_COL_NAME].notna()]
    .set_index(MOVIE_ID_COL_NAME)
)

## Filter data for movies in the IMDb database

In [8]:
# Ids of the movies that survived the ratings merge; every other table is filtered to these.
new_target_movie_ids = {movie_id for movie_id in new_movie_df.index}

In [9]:
# Countries: filter the movie/country links and merge labels via COUNTRY_MAPPING.
new_comes_from_df, new_country_df = filter_non_imdb_movies(
    relationship_dataframe=comes_from_df,
    dataframe=country_df,
    movie_ids=new_target_movie_ids,
    feature_mapping=COUNTRY_MAPPING,
)

# Genres: filtered only, no label mapping is passed.
new_is_of_type_df, new_genre_df = filter_non_imdb_movies(
    relationship_dataframe=is_of_type_df,
    dataframe=genre_df,
    movie_ids=new_target_movie_ids,
)

# Languages: filter the movie/language links and merge labels via LANGUAGE_MAPPING.
new_spoken_languages_df, new_language_df = filter_non_imdb_movies(
    relationship_dataframe=spoken_languages_df,
    dataframe=language_df,
    movie_ids=new_target_movie_ids,
    feature_mapping=LANGUAGE_MAPPING,
)

In [10]:
# Keep only the link rows that reference one of the target movies.
new_belongs_to_df = belongs_to_df.loc[
    lambda links: links[MOVIE_ID_COL_NAME].isin(new_target_movie_ids)
]
new_appears_in_df = appears_in_df.loc[
    lambda links: links[MOVIE_ID_COL_NAME].isin(new_target_movie_ids)
]

In [11]:
# Keep only the characters / actors that are still referenced by the filtered link tables.
kept_character_ids = set(new_belongs_to_df[CHARACTER_ID_COL_NAME])
kept_actor_ids = set(new_appears_in_df[ACTOR_ID_COL_NAME])

new_character_df = character_df.loc[character_df.index.isin(kept_character_ids)]
new_actor_df = actor_df.loc[actor_df.index.isin(kept_actor_ids)]

In [12]:
# Keep a play row only when both its actor and its character survived the
# filtering above, then renumber the rows.
new_play_df = play_df.loc[
    play_df[ACTOR_ID_COL_NAME].isin(set(new_actor_df.index))
    & play_df[CHARACTER_ID_COL_NAME].isin(set(new_character_df.index))
].reset_index(drop=True)

## Save Data

In [14]:
# Directory that receives the post-processed tables. It is created if missing
# so the notebook also runs on a fresh checkout.
POST_PROCESSING_DIR = "../data/post_processing"
os.makedirs(POST_PROCESSING_DIR, exist_ok=True)

# File stem -> table; each table is written to "<POST_PROCESSING_DIR>/<stem>.pkl".
# Building every path from one constant also removes the stray double slash
# that the country_df path used to contain.
post_processed_tables = {
    "country_df": new_country_df,
    "comes_from_df": new_comes_from_df,
    "genre_df": new_genre_df,
    "is_of_type_df": new_is_of_type_df,
    "language_df": new_language_df,
    "spoken_languages_df": new_spoken_languages_df,
    "character_df": new_character_df,
    "actor_df": new_actor_df,
    "movie_df": new_movie_df,
    "belongs_to_df": new_belongs_to_df,
    "play_df": new_play_df,
    "appears_in_df": new_appears_in_df,
}

for table_name, table in post_processed_tables.items():
    table.to_pickle(f"{POST_PROCESSING_DIR}/{table_name}.pkl")

---