# Loading and Formatting Data Pipeline

## Packages

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from utils import data_load

## Data Wrangling of CMU dataset

### Raw Metadata Extraction

Here we extract the raw information from the CMU dataset into dataframes. Note that the plot summary, when available, is merged directly into the raw movie dataframe.

In [3]:
# Locations of the raw CMU Movie Summaries files.
movie_metadata_path = "../data/MovieSummaries/movie.metadata.tsv"
plot_summaries_path = "../data/MovieSummaries/plot_summaries.txt"
character_metadata_path = "../data/MovieSummaries/character.metadata.tsv"

# Raw movie metadata; the loader is also given the plot summaries file.
raw_movie_df = data_load.get_raw_movie_dataframe(movie_metadata_path, plot_summaries_path)
# Raw character metadata (one dataframe for characters and the actors playing them).
raw_character_df = data_load.get_raw_character_dataframe(character_metadata_path)

### Tables Creation

We now create the different tables according to the ER diagram we designed.

##### Country table and relation

In [4]:
# Build the country table and the movie/country relation table from the
# "countries" column of the raw movie dataframe.
# NOTE(review): the exact schema of both outputs is defined in utils.data_load — confirm there.
country_df, comes_from_df = data_load.create_entry_and_relation_table(
    raw_movie_df,
    "countries",
    "country_name",
    "movie_id",
)

##### Genre table and relation

In [5]:
# Build the genre table and the movie/genre relation table from the
# "genres" column of the raw movie dataframe.
# NOTE(review): the exact schema of both outputs is defined in utils.data_load — confirm there.
genre_df, is_of_type_df = data_load.create_entry_and_relation_table(
    raw_movie_df,
    "genres",
    "genre_name",
    "movie_id",
)

##### Language table and relation

In [6]:
# Build the language table and the movie/language relation table from the
# "languages" column of the raw movie dataframe.
# NOTE(review): filter_dict presumably maps substrings to their replacement
# (here " language" -> "") — confirm against utils.data_load.
language_df, spoken_languages_df = data_load.create_entry_and_relation_table(
    raw_movie_df,
    "languages",
    "language_name",
    "movie_id",
    filter_dict={" language":""},
)

##### Character table

We remove characters with no Freebase ID because none of them has a name.

In [7]:
# Keep only the rows whose index (the Freebase character id) is not missing,
# and only the character name column.
has_character_id = ~raw_character_df.index.isna()
character_df = raw_character_df.loc[has_character_id, ["character_name"]]
# A character id may occur on several rows; keep its first occurrence only.
is_repeated_character = character_df.index.duplicated()
character_df = character_df[~is_repeated_character]
# Name the index after the table's primary key.
character_df = character_df.rename_axis("character_id")

##### Actor table

We do not keep actors and actresses that have a NaN id, that contain only NaN values, or that are duplicated entries in the dataset.

One example of such duplicated entries is Clark Kent and Superman: they are two different characters, so there are two rows in the original CMU character dataframe, but they are played by the same actor.

In [8]:
# Raw column name -> actor table column name.
# The "freebase_actor_id" entry has no effect on the columns because that
# column becomes the index below; the index is renamed separately.
actor_table_columns_mapping = {
    "actor_birth_date": "birth_date",
    "actor_gender": "gender",
    "actor_height": "height",
    "actor_ethnicity": "ethnicity",
    "actor_name": "name",
    "freebase_actor_id": "actor_id",
}

# Actor attributes, indexed by the Freebase actor id.
actor_df = raw_character_df[
    ["actor_name", "actor_gender", "actor_height",
     "actor_ethnicity", "actor_birth_date", "freebase_actor_id"]
].set_index("freebase_actor_id")

# Discard rows without an actor id.
actor_df = actor_df[~actor_df.index.isna()]

# Keep the first row of every actor id, rename columns and index to the
# table schema, and drop actors whose attributes are all missing.
actor_df = (
    actor_df[~actor_df.index.duplicated()]
    .rename(columns=actor_table_columns_mapping)
    .rename_axis("actor_id")
    .dropna(how="all")
)

##### Movie table

In [9]:
# Movie table: a subset of the raw movie columns, with the index named
# after the table's primary key.
movie_table_columns = ["name", "release_date", "revenue", "runtime", "freebase_id", "plot"]
movie_df = raw_movie_df[movie_table_columns].rename_axis("movie_id")

##### "Belongs to" table

In [10]:
belongs_to_table_columns_mapping = {
    "freebase_character_id": "character_id",
    "wikipedia_movie_id": "movie_id",
}

# Movie id of every row that has a character id (the index of the raw frame).
has_character_id = ~raw_character_df.index.isna()
belongs_to_df = raw_character_df.loc[has_character_id, ["wikipedia_movie_id"]]
# Turn the character id into a column, keep distinct (character, movie)
# pairs and renumber the rows from 0.
belongs_to_df = belongs_to_df.reset_index().drop_duplicates().reset_index(drop=True)
belongs_to_df = belongs_to_df[["freebase_character_id", "wikipedia_movie_id"]]
belongs_to_df = belongs_to_df.rename(columns=belongs_to_table_columns_mapping)

##### "Plays" table

In [11]:
play_table_columns_mapping = {
    "freebase_character_id": "character_id",
    "freebase_actor_id": "actor_id",
}

# Actor id and map id of every row that has a character id (the index of the raw frame).
has_character_id = ~raw_character_df.index.isna()
play_df = raw_character_df.loc[has_character_id, ["freebase_actor_id", "freebase_map_id"]]
# Turn the character id into a column, keep distinct rows and renumber them from 0.
play_df = play_df.reset_index().drop_duplicates().reset_index(drop=True)
play_df = play_df[["freebase_actor_id", "freebase_character_id", "freebase_map_id"]]
play_df = play_df.rename(columns=play_table_columns_mapping)

##### "Appears in" table

Remove duplicates that can appear in the dataset (e.g. the same actor playing both Clark Kent and Superman in one movie).

In [12]:
appears_in_table_columns_mapping = {"wikipedia_movie_id":"movie_id",
                                    "freebase_actor_id":"actor_id",
                                    "actor_age_at_release_date":"actor_age"}
# One row per distinct (actor, movie, age at release) combination; the
# character id index of the raw frame is discarded.
appears_in_df = raw_character_df.reset_index()[["freebase_actor_id",
    "wikipedia_movie_id","actor_age_at_release_date"]].drop_duplicates()
# Convert back index to linear range: drop_duplicates() leaves gaps in the
# index, so renumber the rows like the other relation tables do.
appears_in_df = appears_in_df.reset_index(drop=True)
appears_in_df = appears_in_df.rename(columns=appears_in_table_columns_mapping)

### Filter out the duplicated actors and actresses

In the dataset we have some duplicated actors and actresses. They have different Freebase ids but exactly the same attributes, and they are indeed duplicates when we look at their filmography. However, for many such candidates we do not have enough information to assume with confidence that they are duplicates. Are two actors named John Bravo the same person or not? It is hard to tell. Thus we decided to tag two actor entries as duplicates only if they share the same name and the same birth date (sharing only the birth year is not considered sufficient).

In [13]:
# Process actor entries considered duplicates (same name and same birth date)
# in the actor table and in the relation tables that reference actor ids.
# NOTE(review): the return value is not captured, so this presumably updates
# actor_df, play_df and appears_in_df in place — confirm in utils.data_load.
data_load.process_duplicated_actors(actor_df,[play_df,appears_in_df])

In [14]:
# Every generated table paired with the pickle file it is written to.
generated_tables = [
    (country_df, "../data/generated/country_df.pkl"),
    (comes_from_df, "../data/generated/comes_from_df.pkl"),
    (genre_df, "../data/generated/genre_df.pkl"),
    (is_of_type_df, "../data/generated/is_of_type_df.pkl"),
    (language_df, "../data/generated/language_df.pkl"),
    (spoken_languages_df, "../data/generated/spoken_languages_df.pkl"),
    (character_df, "../data/generated/character_df.pkl"),
    (actor_df, "../data/generated/actor_df.pkl"),
    (movie_df, "../data/generated/movie_df.pkl"),
    (belongs_to_df, "../data/generated/belongs_to_df.pkl"),
    (play_df, "../data/generated/play_df.pkl"),
    (appears_in_df, "../data/generated/appears_in_df.pkl"),
]

# Persist the tables in the order listed above.
for table, pickle_path in generated_tables:
    table.to_pickle(pickle_path)

---

### Combine CMU with IMDB

In [15]:
# Load the IMDb "title basics" table through the project loader.
# Later cells rely on its title_type, original_title, primary_title,
# start_year and tconst columns.
df_imdb = data_load.load_imdb_title_basics()

In [16]:
# Restrict the IMDb titles to entries whose title type is "movie".
is_movie = df_imdb["title_type"] == "movie"
df_imdb = df_imdb.loc[is_movie]

In [17]:
# Load the CMU movie metadata through the project loader.
# Later cells rely on its movie_name, release_date and freebase_id columns.
df_cmu_movie = data_load.load_cmu_movie_metadata()

In [18]:
# Drop the unused original title column; the merge below matches on primary_title.
# df_imdb is a boolean-filtered selection at this point, so an in-place drop
# would trigger pandas' SettingWithCopyWarning. Rebinding to the result of a
# non-inplace drop avoids mutating a possible copy.
df_imdb = df_imdb.drop(columns='original_title')

In [19]:
# Match CMU movies to IMDb titles by name. Titles are not unique and may be
# spelled differently in the two sources, so one CMU movie can match several
# IMDb entries (or none); clashing column names get a _cmu / _imdb suffix.
merged = pd.merge(
    df_cmu_movie,
    df_imdb,
    how='left',
    left_on='movie_name',
    right_on='primary_title',
    suffixes=('_cmu', '_imdb'),
)

In [20]:
# Keep only the matches whose CMU release year equals the IMDb start year.
cmu_release_year = merged["release_date"].apply(lambda released: released.year)
same_year = cmu_release_year == merged["start_year"]
merged = merged[same_year]

In [21]:
# CMU movies (identified by freebase_id) that still match more than one
# IMDb title, with their number of matches.
duplicates = merged["freebase_id"].value_counts().loc[lambda counts: counts > 1]
duplicates

/m/0h1z21s    5
/m/064p159    4
/m/0bbx0_     4
/m/09vq1kn    4
/m/03cg7t2    3
             ..
/m/03cb5j0    2
/m/04g0gvz    2
/m/07scm0y    2
/m/06w9zfg    2
/m/09s60jk    2
Name: freebase_id, Length: 465, dtype: Int64

In [22]:
# Load the IMDb title ratings through the project loader.
# Later cells rely on its tconst and num_votes columns.
df_imdb_rating = data_load.load_imdb_title_ratings()

In [23]:
# Attach IMDb ratings to the ambiguous matches only: the candidate with the
# most votes is treated as the most meaningful one and is kept later on.
is_ambiguous = merged["freebase_id"].isin(list(duplicates.index))
df = pd.merge(merged[is_ambiguous], df_imdb_rating, how='left', on='tconst')

In [24]:
# For every CMU movie matched to several IMDb titles, keep the candidate with
# the most votes and collect the tconst of all the other candidates.
# The loop variable is deliberately not called `id`: that name would shadow
# the builtin for the rest of the kernel session.
dropping = []
for freebase_id in duplicates.index:
    candidates = df[df.freebase_id == freebase_id]
    ranked = candidates.sort_values('num_votes', ascending=False)
    # Everything after the first (most voted) row is a match to discard.
    dropping.extend(ranked.tconst.iloc[1:])

In [25]:
# Remove the rows whose IMDb title was listed for dropping.
is_losing_match = merged["tconst"].isin(dropping)
merged = merged.loc[~is_losing_match]

In [26]:
# Preview the first rows of the CMU/IMDb merge after removing the extra matches.
merged.head()

Unnamed: 0,wikipedia_movie_id,freebase_id,movie_name,release_date,revenue,runtime,languages,countries,genres_cmu,tconst,title_type,primary_title,is_adult,start_year,end_year,runtime_minutes,genres_imdb
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",tt0228333,movie,Ghosts of Mars,0,2001,,98,"[Action, Horror, Sci-Fi]"
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",tt0094806,movie,Brun bitter,0,1988,,83,"[Crime, Drama]"
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,[German Language],[Germany],[Drama],tt0083949,movie,A Woman in Flames,0,1983,,106,[Drama]
9,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",tt0029852,movie,Alexander's Ragtime Band,0,1938,,106,"[Drama, Music, Musical]"
10,9997961,/m/06_y2j7,Contigo y aquí,1974-01-01,,0.0,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]",tt0200545,movie,Contigo y aquí,0,1974,,70,"[Comedy, Drama, Musical]"


In [27]:
# Persist the merged CMU/IMDb movie metadata to disk as a pickle.
merged.to_pickle('../data/generated/movie_metadata.pkl')