# Loading and Formatting Data Pipeline

## Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from utils import data_load
from dateutil.parser import parse as parse_date

## Data Wrangling of CMU dataset

### Raw Metadata Extraction

Here we extract the raw information from the CMU dataset into dataframes. Note that the plot summary, when available, is merged directly into the raw movie dataframe.

In [3]:
# Raw movie metadata; the plot summary is merged in whenever one is available.
raw_movie_df = data_load.get_raw_movie_dataframe(
    "../data/MovieSummaries/movie.metadata.tsv",
    "../data/MovieSummaries/plot_summaries.txt",
)

# Raw character metadata (characters together with the actors playing them).
raw_character_df = data_load.get_raw_character_dataframe(
    "../data/MovieSummaries/character.metadata.tsv"
)

### Tables Creation

We now create the different tables according to the ER diagram we designed.

##### Country table and relation

In [4]:
# Build the country table and its movie relation table from the "countries"
# column; see data_load.create_entry_and_relation_table for the output schema.
country_df, comes_from_df = data_load.create_entry_and_relation_table(
    raw_movie_df,
    "countries",
    "country_name",
    "movie_id",
)

##### Genre table and relation

In [5]:
# Build the genre table and its movie relation table from the "genres" column,
# using the same helper as for the other entity tables.
genre_df, is_of_type_df = data_load.create_entry_and_relation_table(
    raw_movie_df,
    "genres",
    "genre_name",
    "movie_id",
)

##### Language table and relation

In [6]:
# Build the language table and its movie relation table from the "languages"
# column. filter_dict is forwarded to the helper; presumably it removes the
# " language" suffix from each entry name -- TODO confirm in utils/data_load.py.
language_df, spoken_languages_df = data_load.create_entry_and_relation_table(
    raw_movie_df,
    "languages",
    "language_name",
    "movie_id",
    filter_dict={" language":""},
)

##### Character table

We remove characters with no Freebase ID because none of them has a name.

In [7]:
# The raw frame is indexed by the Freebase character id: keep the rows that
# actually have an id, then a single row (the first one) per id.
has_character_id = raw_character_df.index.notna()
character_df = raw_character_df.loc[has_character_id, ["character_name"]]
character_df = character_df.loc[~character_df.index.duplicated()]
character_df = character_df.rename_axis("character_id")

##### Actor table

We do not keep actors and actresses that have a NaN id, whose attributes are all NaN, or that are duplicated entries in the dataset.

One example of such duplicated entries is Clark Kent and Superman: they are two different characters, so there are two rows in the original CMU character dataframe, but both are played by the same actor.

In [8]:
# Raw column name -> actor table column name.
actor_table_columns_mapping = {
    "actor_name": "name",
    "actor_gender": "gender",
    "actor_height": "height",
    "actor_ethnicity": "ethnicity",
    "actor_birth_date": "birth_date",
    "freebase_actor_id": "actor_id",
}

# Actor attributes keyed by their Freebase actor id.
actor_source_columns = [
    "actor_name",
    "actor_gender",
    "actor_height",
    "actor_ethnicity",
    "actor_birth_date",
    "freebase_actor_id",
]
actor_df = raw_character_df[actor_source_columns].set_index("freebase_actor_id")

# Drop rows without an id, then keep the first row seen for every actor id.
actor_df = actor_df.loc[actor_df.index.notna()]
actor_df = actor_df.loc[~actor_df.index.duplicated()]

# Rename to the table schema and discard actors whose attributes are all missing.
actor_df = (
    actor_df
    .rename(columns=actor_table_columns_mapping)
    .rename_axis("actor_id")
    .dropna(how="all")
)

##### Movie table

In [9]:
# Columns of the movie table. The explicit .copy() makes movie_df own its data, so
# the in-place fixes applied to it (the .loc assignment below and any later in-place
# update) are guaranteed not to touch raw_movie_df and do not depend on the
# SettingWithCopy warning being silenced globally.
movie_df = raw_movie_df[["name","release_date","revenue","runtime","freebase_id","plot"]].copy()

# Fix a typo in the raw data: the year "1010" should read "2010".
movie_df.loc[movie_df["release_date"] == "1010-12-02","release_date"] = "2010-12-02"

# The raw frame's index is used as the general movie id.
movie_df.index = raw_movie_df.index.rename("movie_id")

# Fraction (not yet multiplied by 100) of movies without a release date.
percentage_of_movies_with_missing_release_date = movie_df["release_date"].isna().sum()/len(movie_df)
print(f"We have {100*percentage_of_movies_with_missing_release_date:.5f}% of the movie dataset with missing release date.")

We have 8.44374% of the movie dataset with missing release date.


##### "Belongs to" table

In [10]:
# Raw column name -> "belongs to" table column name.
belongs_to_table_columns_mapping = {"freebase_character_id":"character_id","wikipedia_movie_id":"movie_id"}

# Unique (character, movie) pairs for the rows that have a character id
# (the character id is the index of the raw character frame).
belongs_to_df = (
    raw_character_df.loc[raw_character_df.index.notna(), ["wikipedia_movie_id"]]
    .reset_index()
    .drop_duplicates()
    .reset_index(drop=True)  # back to a linear 0..n-1 index
    .loc[:, ["freebase_character_id", "wikipedia_movie_id"]]
    .rename(columns=belongs_to_table_columns_mapping)
)

# Keep only pairs whose character exists in the character table.
known_character = belongs_to_df["character_id"].isin(character_df.index)
belongs_to_df = belongs_to_df[known_character]

##### "Plays" table

In [11]:
# Raw column name -> "plays" table column name ("freebase_map_id" is kept as is).
play_table_columns_mapping = {"freebase_character_id":"character_id",
                                    "freebase_actor_id":"actor_id"}

# Unique (actor, character, map id) triples for the rows that have a character id.
play_df = (
    raw_character_df.loc[raw_character_df.index.notna(), ["freebase_actor_id", "freebase_map_id"]]
    .reset_index()
    .drop_duplicates()
    .reset_index(drop=True)  # back to a linear 0..n-1 index
    .loc[:, ["freebase_actor_id", "freebase_character_id", "freebase_map_id"]]
    .rename(columns=play_table_columns_mapping)
)

# Keep only rows that reference an existing actor and an existing character.
play_df = play_df[play_df["actor_id"].isin(actor_df.index)]
play_df = play_df[play_df["character_id"].isin(character_df.index)]

##### "Appears in" table

Remove duplicates that can appear in the dataset (e.g. the same actor for Clark Kent and Superman).

In [12]:
# Raw column name -> "appears in" table column name.
appears_in_table_columns_mapping = {"wikipedia_movie_id":"movie_id",
                                    "freebase_actor_id":"actor_id",
                                    "actor_age_at_release_date":"actor_age"}

# One row per distinct (actor, movie, age) combination; an actor playing several
# characters in the same movie therefore appears only once.
appears_in_df = (
    raw_character_df
    .reset_index()
    .loc[:, ["freebase_actor_id", "wikipedia_movie_id", "actor_age_at_release_date"]]
    .drop_duplicates()
    .rename(columns=appears_in_table_columns_mapping)
)

# Keep only rows that reference an actor present in the actor table.
appears_in_df = appears_in_df[appears_in_df["actor_id"].isin(actor_df.index)]

### Filter out the duplicated actors and actresses

In the dataset we have some duplicated actors and actresses. They have different Freebase ids but exactly the same attributes, and they are indeed duplicates when we look at their filmography. However, for many such candidates we do not have enough information to assume with confidence that they are duplicates. Are two actors named John Bravo the same actor or not? It is hard to tell. Thus we decided to tag two actor entries as duplicates only if they share the same name and the same birthdate (sharing the same birth year is not considered sufficient).

In [13]:
# Resolve duplicated actor entries. The helper's return value is not used, so it
# presumably updates actor_df and the two relation tables in place -- TODO confirm.
data_load.process_duplicated_actors(actor_df, [play_df, appears_in_df])

# Afterwards, discard the relation rows that no longer carry an actor id.
play_df = play_df[play_df["actor_id"].notna()]
appears_in_df = appears_in_df[appears_in_df["actor_id"].notna()]

---

## Data Integration

Our primary goal is to perform a time series analysis of the different features we have in our movie dataset. The problem is that around 8.5% of the dataset is missing the release date entry. We will try to gather information from Wikipedia to recover it and thus avoid throwing this data away.

Furthermore, to extend our analysis, we are interested in other features linked to our data such as movie ratings or movie director, producer, etc. Thus we will try to retrieve some data from IMDB for our movies.

### Wikipedia Data Integration

In [15]:
# Load the scraped Wikipedia data and transpose it so that len() counts movies.
wikipedia_data = pd.read_json("../data/Wikipedia/no_release_date_movies.json").transpose()
print(f"We have the wikipedia data for {wikipedia_data.shape[0]} movie with no release date in the CMU dataset.")

We have the wikipedia data for 5205 movie with no release date in the CMU dataset.


In [16]:
# Parse the date columns of the Wikipedia data. The return value is discarded, so
# the helper presumably rewrites wikipedia_data in place -- TODO confirm in
# utils/data_load.py. The following cells rely on "" marking a missing date.
data_load.parse_date_columns(wikipedia_data)

In [17]:
# A movie is unrecoverable when all three candidate date fields are empty strings.
wikipedia_date_columns = ["Release dates", "Release date", "Original release"]
has_no_date = wikipedia_data[wikipedia_date_columns].eq("").all(axis=1)
unparsable_date_movies = wikipedia_data[has_no_date]
print(f"There are {len(unparsable_date_movies)} movies for which we cannot retrieve the release date from the wikipedia data.")

There are 719 movies for which we cannot retrieve the release date from the wikipedia data.


In [18]:
# Take, for each movie, the first non-empty date among the three candidate fields
# (priority: "Release dates", then "Release date", then "Original release").
# Concatenating the three strings instead would glue several dates together
# (e.g. "2001-01-012001-05-03") whenever more than one field is filled in; when
# only one field is filled in, both approaches give the same value.
wikipedia_retrieved_dates = wikipedia_data["Release dates"]
for fallback_column in ("Release date", "Original release"):
    # Keep the current value where it is non-empty, otherwise use the fallback field.
    wikipedia_retrieved_dates = wikipedia_retrieved_dates.where(
        wikipedia_retrieved_dates != "", wikipedia_data[fallback_column]
    )

# Drop the movies for which no field provided a date.
wikipedia_retrieved_dates = wikipedia_retrieved_dates[wikipedia_retrieved_dates != ""]

# One-column frame named like the target column of the movie table.
wikipedia_retrieved_dates = wikipedia_retrieved_dates.rename("release_date").to_frame()

In [19]:
# Fill movie_df["release_date"] in place with the dates recovered from Wikipedia.
# DataFrame.update aligns on the index and on column names, and only copies non-NA
# values. This assumes wikipedia_retrieved_dates is indexed by the same movie ids
# (and the same index dtype) as movie_df -- TODO confirm.
movie_df.update(wikipedia_retrieved_dates)

In [20]:
# Convert the release dates to pandas datetimes so they can be used for temporal
# analysis; missing values become NaT.
movie_df["release_date"] = pd.to_datetime(movie_df["release_date"])

### IMDB Data Integration

We are interested in all the complementary information that we can retrieve using the IMDB database. To do so we will keep track of the mapping between the Wikipedia indices and the IMDB page indices. The mapping is not necessarily straightforward. We will first map on the name, but we may have duplicates as some movies have the same name. This method is also not perfect because we may have misspelled names.


Thus we need to find a way to filter out the duplicates that may occur. We will first keep the entries that share the same release year. Then, if this is not enough to remove all duplicates, we will filter on the runtime. And if the latter is not sufficient, we do a manual check to assign the ids correctly. This is possible as the number of movies at this stage is quite small. We initially tried filtering on the number of votes (i.e. keeping the movie with the highest number of votes), based on the hypothesis that if a movie is more popular than another, then it is more likely to be in our dataset. But after verification with the data, this hypothesis does not always hold.

#### Load IMDB Data

In [21]:
# IMDB columns that are not needed for the name-based matching ("index" is the
# column created by reset_index()).
imdb_unused_columns = ["index","original_title","title_type","is_adult","end_year","genres"]

# Keep only the titles of type "movie" and drop the unused columns.
imdb_movie_df = data_load.load_imdb_title_basics()
imdb_movie_df = (
    imdb_movie_df.loc[imdb_movie_df["title_type"] == "movie"]
    .reset_index()
    .drop(columns=imdb_unused_columns)
)

#### Merge original information and IMDB data

In [22]:
# we merge on the titles name (problem: names not unique and maybe spelling errors)
merged = movie_df.reset_index().merge(
    imdb_movie_df, left_on='name', right_on='primary_title', how='left', suffixes=('_cmu', '_imdb'))
merged = merged[~merged["tconst"].isna()]
merged["release_date"] = pd.to_datetime(merged["release_date"])

#### Remove Duplicates

We call "standard" the data without duplicates and "duplicated" the one with duplicates.

In [23]:
# Boolean mask over `merged` flagging the rows that take part in a duplicated match.
duplicated_ids = data_load.get_duplicated_movie_ids(merged)

# "duplicated" holds the ambiguous matches, "standard" the unambiguous ones.
merged_duplicated = merged.loc[duplicated_ids]
merged_standard = merged.loc[~duplicated_ids]

In [24]:
# First disambiguation step: keep the candidate matches whose CMU release year
# equals the IMDB start year.
same_release_year = merged_duplicated["release_date"].dt.year == merged_duplicated["start_year"]
merged_duplicated = merged_duplicated[same_release_year]

# Let the helper refresh both frames after the filtering step.
merged_standard, merged_duplicated = data_load.update_merged_dataframes(
    merged_standard, merged_duplicated
)

In [25]:
# Second disambiguation step: keep the candidate matches whose CMU runtime equals
# the IMDB runtime in minutes.
same_runtime = merged_duplicated["runtime"] == merged_duplicated["runtime_minutes"]
merged_duplicated = merged_duplicated[same_runtime]

# Let the helper refresh both frames after the filtering step.
merged_standard, merged_duplicated = data_load.update_merged_dataframes(
    merged_standard, merged_duplicated
)

In [26]:
# Last disambiguation step: drop the rows listed (by index label) in the manually
# curated data_load.LAST_DUPLICATES_ID_LIST.
merged_duplicated = merged_duplicated.drop(index=data_load.LAST_DUPLICATES_ID_LIST)
merged_standard, merged_duplicated = data_load.update_merged_dataframes(
    merged_standard, merged_duplicated
)

# Every ambiguous match must be resolved at this point.
assert not len(merged_duplicated)

#### Create mapping table

Now that we have a fully merged table, we can drop the irrelevant columns and keep only the mapping between the Wikipedia movie ids, which we use as general movie ids, and the IMDB ids.

Note that we filter out movies that have no release date (~500 movies), as we want a dataset that we can investigate over time without further filtering. We also filter out movies that share the same IMDB id. This happens because some movies share the same name but are not necessarily both present in the IMDB dataset. The merge operation then maps both movies to the same IMDB id, which is a mistake in most cases, so we filter them out.

In [27]:
# Keep only the matches that have both a release date and an IMDB id.
merged_standard = merged_standard.dropna(subset=["release_date", "tconst"])

# Discard every row of an IMDB id matched by more than one movie
# (keep=False flags all occurrences, not just the repeats).
shares_imdb_id = merged_standard["tconst"].duplicated(keep=False)
merged_standard = merged_standard[~shares_imdb_id]

# Final mapping: movie_id (index) -> IMDB tconst.
wikipedia_imdb_mapping_table = merged_standard.set_index("movie_id")[["tconst"]]

## Save Data

In [28]:
# File stem -> table; every table is pickled to ../data/generated/<stem>.pkl,
# in the order listed here.
generated_tables = {
    "country_df": country_df,
    "comes_from_df": comes_from_df,
    "genre_df": genre_df,
    "is_of_type_df": is_of_type_df,
    "language_df": language_df,
    "spoken_languages_df": spoken_languages_df,
    "character_df": character_df,
    "actor_df": actor_df,
    "movie_df": movie_df,
    "belongs_to_df": belongs_to_df,
    "play_df": play_df,
    "appears_in_df": appears_in_df,
    "wikipedia_imdb_mapping_df": wikipedia_imdb_mapping_table,
}
for table_name, table in generated_tables.items():
    table.to_pickle(f"../data/generated/{table_name}.pkl")