# Loading and Formating Data pipeline

## Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from utils import data_load
from dateutil.parser import parse as parse_date

## Data Wrangling of CMU dataset

### Raw Metadata Extraction

We extract here into dataframe the raw information from the CMU dataset. Note that we are merging the plot, when available, directly in the raw movie dataframe.

In [3]:
raw_movie_df = data_load.get_raw_movie_dataframe("../data/MovieSummaries/movie.metadata.tsv","../data/MovieSummaries/plot_summaries.txt")
raw_character_df = data_load.get_raw_character_dataframe("../data/MovieSummaries/character.metadata.tsv")

### Tables Creation

We create now the different tables according to the ER diagram we have designed.

##### Country table and relation

In [4]:
country_df, comes_from_df = data_load.create_entry_and_relation_table(raw_movie_df,"countries",
                                                           "country_name","movie_id")

##### Genre table and relation

In [5]:
genre_df, is_of_type_df = data_load.create_entry_and_relation_table(raw_movie_df,"genres",
                                                           "genre_name","movie_id")

##### Language table and relation

In [6]:
language_df, spoken_languages_df = data_load.create_entry_and_relation_table(raw_movie_df,"languages","language_name",
                                                                  "movie_id",filter_dict={" language":""})

##### Character table

We remove characters with no Freebase_id because none of them has a name

In [7]:
character_df = pd.DataFrame(raw_character_df["character_name"][~raw_character_df.index.isna()])
character_df = character_df[~character_df.index.duplicated()]
character_df.index = character_df.index.rename("character_id")

##### Actor table

We will not keep actors and actresses that have a nan id, that are filled with only nan values or duplicated entries in the dataset.

One example of such duplicated entries is Clark Kent and Superman that are two different characters, thus there will be two rows in the original CMU character df, but they are played by the same actor.

In [8]:
actor_df = raw_character_df[["actor_name","actor_gender",
                                "actor_height","actor_ethnicity","actor_birth_date",
                                "freebase_actor_id"]].set_index("freebase_actor_id")
actor_table_columns_mapping = {"actor_birth_date":"birth_date","actor_gender":"gender",
                                "actor_height":"height","actor_ethnicity":"ethnicity",
                                "actor_name":"name","freebase_actor_id":"actor_id"}
actor_df = actor_df[~actor_df.index.isna()]
actor_df = actor_df[~actor_df.index.duplicated()].rename(
                columns=actor_table_columns_mapping)
actor_df.index = actor_df.index.rename("actor_id")
actor_df = actor_df.dropna(how="all")

##### Movie table

In [9]:
movie_df = raw_movie_df[["name","release_date","revenue","runtime","freebase_id","plot"]]
movie_df.loc[movie_df["release_date"] == "1010-12-02","release_date"] = "2010-12-02"
movie_df.index = raw_movie_df.index.rename("movie_id")

##### "Belongs to" table

In [10]:
belongs_to_df = pd.DataFrame(
    raw_character_df["wikipedia_movie_id"][~raw_character_df.index.isna()])
belongs_to_df = belongs_to_df.reset_index().drop_duplicates()
# Convert back index to linear range
belongs_to_df = belongs_to_df.reset_index()[["freebase_character_id","wikipedia_movie_id"]]
belongs_to_table_columns_mapping = {"freebase_character_id":"character_id","wikipedia_movie_id":"movie_id"}
belongs_to_df = belongs_to_df.rename(columns=belongs_to_table_columns_mapping)

##### "Plays" table

In [11]:
play_df = pd.DataFrame(
    raw_character_df[["freebase_actor_id","freebase_map_id"]][~raw_character_df.index.isna()])
play_df = play_df.reset_index().drop_duplicates()
# Convert back index to linear range
play_df = play_df.reset_index()[["freebase_actor_id","freebase_character_id","freebase_map_id"]]
play_table_columns_mapping = {"freebase_character_id":"character_id",
                                    "freebase_actor_id":"actor_id"}
play_df = play_df.rename(columns=play_table_columns_mapping)

##### "Appears in" table

Remove duplicates that can appear in the dataset (same actor for clark kent and superman)

In [12]:
appears_in_df = raw_character_df.reset_index()[["freebase_actor_id",
    "wikipedia_movie_id","actor_age_at_release_date"]].drop_duplicates()
# Convert back index to linear range
appears_in_table_columns_mapping = {"wikipedia_movie_id":"movie_id",
                                    "freebase_actor_id":"actor_id",
                                    "actor_age_at_release_date":"actor_age"}
appears_in_df = appears_in_df.rename(columns=appears_in_table_columns_mapping)

### Filter out the duplicated actors and actresses

In the dataset we have some duplicated actors and actresses. They have different freebase ids but have exactly the same attributes and are indeed duplicates when we look at the filmography. However, for many of such duplicates we do not have enough information to be assume with confidence that there are duplicates. Are two actors named John Bravo the same actors or not? It is hard to tell. Thus we decided that we tagged two actors entries as duplicates if they share the same name and same birthdate (the same birthyear is not consider as sufficient).

In [13]:
data_load.process_duplicated_actors(actor_df,[play_df,appears_in_df])

---

## Data Integration

Our primary goal is to perform a time serie analysis of the different features we have in our movie dataset. The problem is that around 12% of the dataset is missing the release date entry. We will try to gather information from IMDB and Wikipedia to recover the information and thus avoid to throw away this data.

### Wikipedia Data Integration

In [14]:
wikipedia_data = pd.read_json("../data/Wikipedia/no_release_date_movies.json").T
print(f"We have the wikipedia data for {len(wikipedia_data)} movie with no release date in the CMU dataset.")

We have the wikipedia data for 5205 movie with no release date in the CMU dataset.


In [15]:
data_load.parse_date_columns(wikipedia_data)

In [16]:
unparsable_date_movies = wikipedia_data[(wikipedia_data["Release dates"] == "") 
               & (wikipedia_data["Release date"] == "")
               & (wikipedia_data["Original release"] == "")]
print(f"There are {len(unparsable_date_movies)} movies for which we cannot retrieve the release date from the wikipedia data.")

There are 719 movies for which we cannot retrieve the release date from the wikipedia data.


In [17]:
wikipedia_retrieved_dates = (wikipedia_data["Release dates"]
                             + wikipedia_data["Release date"]
                             + wikipedia_data["Original release"])
wikipedia_retrieved_dates = wikipedia_retrieved_dates[wikipedia_retrieved_dates != ""]
wikipedia_retrieved_dates = pd.DataFrame(wikipedia_retrieved_dates,columns=["release_date"])

In [18]:
movie_df.update(wikipedia_retrieved_dates)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df.update(wikipedia_retrieved_dates)


In [19]:
movie_df["release_date"] = pd.to_datetime(movie_df["release_date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df["release_date"] = pd.to_datetime(movie_df["release_date"])


### IMDB Data Integration

We are interested in all the complementary information that we can retrieve using the IMDB database. To do so we will keep track of mapping between the wikipedia indices and the IMDB page indices. The mapping is not necessarly straightforward. We will first map on the name, but we may have duplicates as some movies have the same names. This method is not perfect also because we may have misspelled names. Thus we need to find a way to filter out duplicates that may occur. We will first conserve entries that share the same date of release. Then if it is not enough to remove all duplicates we will filter on the runtime. And if the latter is not sufficient, we keep the movie with the highest vote, because it is the one that is most likely to be present in our dataset.

In [20]:
imdb_rating_df = data_load.load_imdb_title_ratings()
imdb_rating_df.set_index("tconst",inplace=True)
imdb_movie_df = data_load.load_imdb_title_basics()
imdb_movie_df = imdb_movie_df[imdb_movie_df.title_type == "movie"].reset_index()
imdb_movie_df.drop(["index","original_title","title_type","is_adult","end_year","genres"], axis='columns', inplace=True)

In [22]:
imdb_movie_df

Unnamed: 0,tconst,primary_title,start_year,runtime_minutes
0,tt0000009,Miss Jerry,1894,45
1,tt0000502,Bohemios,1905,100
2,tt0000574,The Story of the Kelly Gang,1906,70
3,tt0000591,The Prodigal Son,1907,90
4,tt0000615,Robbery Under Arms,1907,0
...,...,...,...,...
627188,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57
627189,tt9916680,De la ilusión al desconcierto: cine colombiano...,2007,100
627190,tt9916706,Dankyavar Danka,2013,0
627191,tt9916730,6 Gunn,2017,116


In [113]:
# we merge on the titles name (problem: names not unique and maybe spelling errors)
merged = movie_df.reset_index().merge(
    imdb_movie_df, left_on='name', right_on='primary_title', how='left', suffixes=('_cmu', '_imdb'))
merged["release_date"] = pd.to_datetime(merged["release_date"])

In [114]:
merged_standard = merged[~merged["movie_id"].duplicated(keep=False)]
merged_duplicated = merged[merged["movie_id"].duplicated(keep=False)]

In [115]:
merged_duplicated = merged_duplicated[
    merged_duplicated.release_date.apply(lambda r: r.year) == merged_duplicated.start_year]
merged_standard = pd.concat([
    merged_standard,merged_duplicated[~merged_duplicated["movie_id"].duplicated(keep=False)]])

In [116]:
merged_duplicated = merged_duplicated[merged_duplicated["movie_id"].duplicated(keep=False)]
merged_duplicated = merged_duplicated[merged_duplicated["runtime"] == 
                                      merged_duplicated["runtime_minutes"]]
merged_standard = pd.concat([
    merged_standard,merged_duplicated[~merged_duplicated["movie_id"].duplicated(keep=False)]])

In [117]:
merged_duplicated = merged_duplicated[merged_duplicated["movie_id"].duplicated(keep=False)]
merged_duplicated = merged_duplicated.reset_index()

In [118]:
#merged_duplicated.drop([0,1])
merged_duplicated

Unnamed: 0,index,movie_id,name,release_date,revenue,runtime,freebase_id,plot,tconst,primary_title,start_year,runtime_minutes
0,5407,16958933,Inferno,1999-01-01,,95.0,/m/0413dmv,The film opens as Eddie Lomax drives an India...,tt0139151,Inferno,1999,95
1,5408,16958933,Inferno,1999-01-01,,95.0,/m/0413dmv,The film opens as Eddie Lomax drives an India...,tt0209043,Inferno,1999,95
2,25514,33238319,The Reunion,2011-10-21,,90.0,/m/0h134d0,"After the death of her father, Nina is charge...",tt1699135,The Reunion,2011,90
3,25515,33238319,The Reunion,2011-10-21,,90.0,/m/0h134d0,"After the death of her father, Nina is charge...",tt1792543,The Reunion,2011,90
4,35338,31701975,The Artist,2011-05-15,133432856.0,100.0,/m/0gmcwlb,"In 1927, silent film star George Valentin is p...",tt1655442,The Artist,2011,100
5,35339,31701975,The Artist,2011-05-15,133432856.0,100.0,/m/0gmcwlb,"In 1927, silent film star George Valentin is p...",tt1825978,The Artist,2011,100
6,47411,2462649,Violent Cop,2000-01-01,,88.0,/m/07fttx,,tt0275052,Violent Cop,2000,88
7,47412,2462649,Violent Cop,2000-01-01,,88.0,/m/07fttx,,tt4143626,Violent Cop,2000,88
8,62821,12165949,The 11th Hour,2007-05-19,985207.0,95.0,/m/02vs16l,"With contributions from over 50 politicians, s...",tt0492931,The 11th Hour,2007,95
9,62822,12165949,The 11th Hour,2007-05-19,985207.0,95.0,/m/02vs16l,"With contributions from over 50 politicians, s...",tt1043415,The 11th Hour,2007,95


## Save Data

In [None]:
country_df.to_pickle("../data/generated/country_df.pkl")
comes_from_df.to_pickle("../data/generated/comes_from_df.pkl")
genre_df.to_pickle("../data/generated/genre_df.pkl")
is_of_type_df.to_pickle("../data/generated/is_of_type_df.pkl")
language_df.to_pickle("../data/generated/language_df.pkl")
spoken_languages_df.to_pickle("../data/generated/spoken_languages_df.pkl")
character_df.to_pickle("../data/generated/character_df.pkl")
actor_df.to_pickle("../data/generated/actor_df.pkl")
movie_df.to_pickle("../data/generated/movie_df.pkl")
belongs_to_df.to_pickle("../data/generated/belongs_to_df.pkl")
play_df.to_pickle("../data/generated/play_df.pkl")
appears_in_df.to_pickle("../data/generated/appears_in_df.pkl")

### Combine CMU with IMDB

In [25]:
imdb_movie_df = data_load.load_imdb_title_basics()
imdb_movie_df = imdb_movie_df[imdb_movie_df.title_type == "movie"]
imdb_movie_df.drop('original_title', axis='columns', inplace=True)

KeyboardInterrupt: 

In [None]:
# we merge on the titles name (problem: names not unique and maybe spelling errors)
merged = movie_df.reset_index().merge(imdb_movie_df, left_on='name', right_on='primary_title', how='left', suffixes=('_cmu', '_imdb'))

In [20]:
merged = merged[merged.release_date.apply(lambda r: r.year) == merged.start_year]

In [21]:
# we have duplicate entries
duplicates = merged.freebase_id.value_counts()
duplicates = duplicates[duplicates > 1]
duplicates

/m/0h1z21s    5
/m/064p159    4
/m/0bbx0_     4
/m/09vq1kn    4
/m/03cg7t2    3
             ..
/m/03cb5j0    2
/m/04g0gvz    2
/m/07scm0y    2
/m/06w9zfg    2
/m/09s60jk    2
Name: freebase_id, Length: 465, dtype: Int64

In [22]:
df_imdb_rating = data_load.load_imdb_title_ratings()

In [23]:
# keep the ones with the most imdb votes => most meaningful
df = merged[merged.freebase_id.isin(list(duplicates.index))].merge(df_imdb_rating, on='tconst', how='left')

In [24]:
dropping = []
for id in list(duplicates.index):
    dft = df[df.freebase_id == id]
    ids = list(dft.sort_values('num_votes', ascending=False).iloc[1:].tconst.values)
    dropping += ids

In [25]:
merged = merged[~merged.tconst.isin(dropping)]

In [26]:
merged.head()

Unnamed: 0,wikipedia_movie_id,freebase_id,movie_name,release_date,revenue,runtime,languages,countries,genres_cmu,tconst,title_type,primary_title,is_adult,start_year,end_year,runtime_minutes,genres_imdb
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",tt0228333,movie,Ghosts of Mars,0,2001,,98,"[Action, Horror, Sci-Fi]"
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",tt0094806,movie,Brun bitter,0,1988,,83,"[Crime, Drama]"
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,[German Language],[Germany],[Drama],tt0083949,movie,A Woman in Flames,0,1983,,106,[Drama]
9,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",tt0029852,movie,Alexander's Ragtime Band,0,1938,,106,"[Drama, Music, Musical]"
10,9997961,/m/06_y2j7,Contigo y aquí,1974-01-01,,0.0,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]",tt0200545,movie,Contigo y aquí,0,1974,,70,"[Comedy, Drama, Musical]"


In [27]:
merged.to_pickle('../data/generated/movie_metadata.pkl')