In [3]:
import bz2
import csv
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from IPython.display import display, HTML

from ressources import config

In [5]:
# Data locations are taken from the project-level `config` module.
# NOTE(review): neither constant is referenced by the other cells visible in this
# notebook, which use hard-coded './data/...' and 'generated/...' paths instead —
# confirm which of the two is intended.
RAW_DATA_FOLDER = config.RAW_DATA_FOLDER
GENERATED_DATA_FOLDER = config.GENERATED_DATA_FOLDER

In [None]:
def sanitarization(word_list, nlp=None):
    """Re-write each string as its spaCy tokens joined by single spaces.

    Parameters
    ----------
    word_list : iterable of str
        Strings (e.g. film titles) to normalise.
    nlp : callable, optional
        A callable that maps a string to an iterable of tokens exposing a
        ``.text`` attribute (typically a loaded spaCy pipeline). When omitted,
        the ``en_core_web_sm`` model is loaded, as before.

    Returns
    -------
    list of str
        One sanitised string per input string, in the same order.
    """
    if nlp is None:
        # Loading the model is slow; callers that sanitise several lists should
        # load it once and pass it in through `nlp`.
        nlp = spacy.load("en_core_web_sm")
    return [" ".join(token.text for token in nlp(film)) for film in word_list]

In [None]:
# Locations of the gzipped IMDb TSV dumps, relative to the working directory.
# NOTE(review): these are hard-coded under ./data although RAW_DATA_FOLDER is read
# from config in an earlier cell — confirm whether they should be derived from it.
principals = './data/title.principals.tsv.gz'
names = './data/name.basics.tsv.gz'
akas = './data/title.akas.tsv.gz'  # defined, but not loaded by any cell visible in this notebook
titles = './data/title.basics.tsv.gz'
crew = './data/title.crew.tsv.gz'
ratings = './data/title.ratings.tsv.gz'

## The Internet Movie Database

### The datasets of the IMDb 

The Internet Movie Database (IMDb) is an open source database that contains information about movies, TV series, TV movies and even video games. This database, which is hosted on a website, is used to rate titles and to record the characteristics of each of them. The datasets can be found on the IMDb website and are described as follows:

Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. The first line in each file contains headers that describe what is in each column. A ‘\N’ is used to denote that a particular field is missing or null for that title/name. The available datasets are as follows:

title.akas.tsv.gz - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- title (string) – the localized title
- region (string) - the region for this version of the title
- language (string) - the language of the title
- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- attributes (array) - Additional terms to describe this alternative title, not enumerated
- isOriginalTitle (boolean) – 0: not original title; 1: original title

title.basics.tsv.gz - Contains the following information for titles:

- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

title.crew.tsv.gz – Contains the director and writer information for all the titles in IMDb. Fields include:

- tconst (string) - alphanumeric unique identifier of the title
- directors (array of nconsts) - director(s) of the given title
- writers (array of nconsts) – writer(s) of the given title

title.episode.tsv.gz – Contains the tv episode information. Fields include:

- tconst (string) - alphanumeric identifier of episode
- parentTconst (string) - alphanumeric identifier of the parent TV Series
- seasonNumber (integer) – season number the episode belongs to
- episodeNumber (integer) – episode number of the tconst in the TV series

title.principals.tsv.gz – Contains the principal cast/crew for titles

- tconst (string) - alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- nconst (string) - alphanumeric unique identifier of the name/person
- category (string) - the category of job that person was in
- job (string) - the specific job title if applicable, else '\N'
- characters (string) - the name of the character played if applicable, else '\N'

title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles

- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received

name.basics.tsv.gz – Contains the following information for names:

- nconst (string) - alphanumeric unique identifier of the name/person
- primaryName (string)– name by which the person is most often credited
- birthYear – in YYYY format
- deathYear – in YYYY format if applicable, else '\N'
- primaryProfession (array of strings)– the top-3 professions of the person
- knownForTitles (array of tconsts) – titles the person is known for

source : https://www.imdb.com/interfaces/

### Goal to achieve 
In this IMDb pre-processing notebook, we want to import the datasets that were defined as being of interest (not all were taken as title.akas.tsv.gz was left behind - sorry title.akas.tsv.gz) and treat them in order to obtain a main dataset, with rows and columns of interest regarding our project. As described before, the database doesn't concern only movies, so the main dataset will have to be filtered as we will not consider series - for example. 

Datasets were merged using the different ids (tconst and nconst) that link tables among them.

### Importing datasets
In this section, we import the datasets of interest as dataframes. The datasets are compressed as .gz archives, and missing values (`\N`) are converted to NaN directly at import time, in order to facilitate the processing.

Nota bene: as the datasets are around 300MB each, they cannot be pushed to the git repository (files are limited to 100MB). The datasets were opened locally, so the cells that read from these paths cannot be run without downloading the files first.

The link to download the datasets is : https://datasets.imdbws.com/ 

### Processing the datasets

In [None]:
# Load name.basics (people).
# - quoting=csv.QUOTE_NONE: the file is plain tab-separated text, so a literal
#   double quote inside a field must not be interpreted as a quote character
#   (which could otherwise swallow the following fields or rows).
# - keep_default_na=False: only the '\N' marker means "missing"; without this,
#   literal values such as "NA", "None" or "null" would silently become NaN.
df_names = pd.read_csv(
    names,
    compression="infer",
    sep="\t",
    na_values="\\N",
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Load title.principals (principal cast/crew per title).
# - quoting=csv.QUOTE_NONE: the file is plain tab-separated text, so a literal
#   double quote inside a field must not be interpreted as a quote character.
# - keep_default_na=False: only the '\N' marker means "missing"; without this,
#   literal values such as "NA", "None" or "null" would silently become NaN.
df_principals = pd.read_csv(
    principals,
    compression="infer",
    sep="\t",
    na_values="\\N",
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Load title.basics (one row per title).
# - quoting=csv.QUOTE_NONE: the file is plain tab-separated text, so a title
#   containing a literal double quote must not be interpreted as the start of a
#   quoted field (which could otherwise swallow the following fields or rows).
# - keep_default_na=False: only the '\N' marker means "missing"; without this,
#   titles literally named "NA", "None" or "null" would silently become NaN.
df_titles = pd.read_csv(
    titles,
    compression="infer",
    sep="\t",
    na_values="\\N",
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Load title.crew (directors / writers per title); '\N' marks missing values.
df_crew = pd.read_csv(
    filepath_or_buffer=crew,
    sep='\t',
    na_values='\\N',
    compression="infer",
)

In [None]:
# Load title.ratings (average rating and vote count per title); '\N' marks missing values.
df_ratings = pd.read_csv(
    filepath_or_buffer=ratings,
    sep='\t',
    na_values='\\N',
    compression="infer",
)

1) Starting with df_names:
Here, we only need to drop the columns that are not used in the project, namely the birth year and death year of each person present in the IMDb.

In [None]:
# Rebind instead of mutating in place, and ignore already-missing columns, so
# that re-running this cell does not raise a KeyError.
df_names = df_names.drop(columns=['birthYear', 'deathYear'], errors='ignore')

2) Then df_principals: this dataframe needs more processing, as we want to turn the values of some columns into dicts. Since there are multiple actors, actresses or crew members for each movie in the database, the dataframe is split into two parts (actors/actresses and crew members, respectively). For each row of these two dataframes, the columns are aggregated into one dictionary containing, for example, the id, the name and the category of the person concerned. Finally, the people are gathered into a list of dictionaries, to obtain a final dataframe with one row per movie containing all its crew members, etc.

In [None]:
# Attach each person's display name: look every nconst up in df_names
# (nconst -> primaryName), then keep only the columns used downstream.
df_principals = (
    df_principals
    .assign(name=lambda frame: frame['nconst'].map(df_names.set_index('nconst')['primaryName']))
    .loc[:, ['tconst', 'ordering', 'nconst', 'name', 'category']]
)

In [None]:
# Keep only the rows whose category is actor or actress.
actors = ['actor', 'actress']
actor_rows = df_principals[df_principals['category'].isin(actors)]

# One dict per credited person: {'nconst': ..., 'name': ..., 'category': ...}.
# Building the records row by row keeps them aligned with `actor_rows` and does
# not require the (tconst, ordering) pairs to be unique.
actor_records = actor_rows[['nconst', 'name', 'category']].to_dict('records')

# There are several rows per title (one per person): collapse them into one row
# per tconst whose 'actor/actress' cell is the list of that title's dicts, in
# their original row order. Only the needed column is aggregated.
df_principals_actors = (
    actor_rows[['tconst']]
    .assign(**{'actor/actress': actor_records})
    .groupby('tconst')['actor/actress']
    .agg(list)
    .reset_index()
)

In [None]:
# Keep the rows describing crew members, i.e. everything that is not an actor,
# an actress or a 'self' appearance.
non_crew_categories = ['actor', 'actress', 'self']
crew_rows = df_principals[~df_principals['category'].isin(non_crew_categories)]

# One dict per credited person: {'nconst': ..., 'name': ..., 'category': ...}.
# Building the records row by row keeps them aligned with `crew_rows` and does
# not require the (tconst, ordering) pairs to be unique.
crew_records = crew_rows[['nconst', 'name', 'category']].to_dict('records')

# There are several rows per title (one per crew member): collapse them into one
# row per tconst whose 'crew' cell is the list of that title's dicts, in their
# original row order. Only the needed column is aggregated.
df_principals_crew = (
    crew_rows[['tconst']]
    .assign(crew=crew_records)
    .groupby('tconst')['crew']
    .agg(list)
    .reset_index()
)

3) Processing the dataframe df_titles, by only dropping some columns.

In [None]:
# Rebind instead of mutating in place, and ignore already-missing columns, so
# that re-running this cell does not raise a KeyError.
df_titles = df_titles.drop(columns=['endYear', 'isAdult', 'primaryTitle'], errors='ignore')

### Merging the datasets

Here, we merge all datasets into one main dataset, by simply joining columns on the 'tconst' id, which identifies each title. But first, rows have to be filtered to keep only the ones that concern movies!

In [None]:
# join titles with their directors/writers on the shared 'tconst' id,
# giving one dataframe with each title and its respective crew
merged = df_titles.merge(df_crew, on='tconst')

In [None]:
# list the distinct title types present in the database
merged.loc[:, 'titleType'].unique()

In [None]:
# restrict the dataframe to the rows whose title type is 'movie'
merged = merged.loc[merged['titleType'] == 'movie']

print(f'the dataframe contains now {len(merged)} rows')

The dataset contains ~500k movies

In [None]:
# titleType is constant ('movie') after filtering, so drop it; expose startYear as 'year'
merged = (
    merged
    .drop(columns='titleType')
    .rename(columns={'startYear': 'year'})
)

In [None]:
# attach the per-title list of actors/actresses; titles without any keep a missing value
merged = pd.merge(merged, df_principals_actors, how='left', on='tconst')

In [None]:
# attach the per-title list of crew members; titles without any keep a missing value
merged = pd.merge(merged, df_principals_crew, how='left', on='tconst')

In [None]:
# attach the ratings from df_ratings: the dataframe now holds the movies with
# their title, their associated crew and their ratings
merged = pd.merge(merged, df_ratings, how='left', on='tconst')

Saving into pickle (why not, can be useful) :

In [None]:
# snapshot of the main dataframe, written to the current working directory
merged.to_pickle(path='main_df.pickle')

### Saving main dataframe into json
Here, we simply save the dataframe in json format. json was chosen instead of the csv format to match the quotebank dataset, which is also in json, and to make it easier to work with the dicts created in the dataframe.

In [None]:
# write the dataframe as bz2-compressed JSON lines (one record per row)
with bz2.BZ2File("IMDb.json.bz2", mode='wb') as archive:
    archive.write(
        bytes(merged.to_json(orient='records', lines=True), encoding='utf8')
    )

### Prepare another dict for further work with IMDb.json
Creating a dict that will be used to finish the pre-processing of this database. As some columns of the previous main dataframe are still imperfect (the directors and writers columns hold nconst values instead of names), we create a dict that simply links each nconst to a name. It will also be useful for handling the Wikidata dataset, which contains the same ids (nconst) as the IMDb.

In [None]:
# Build a lookup dict keyed by nconst:
#     {nconst: {'nconst': nconst, 'name': primaryName}, ...}
# The index must actually be set to the ids (set_index returns a new frame);
# otherwise to_dict('index') keys the dict by row position instead of nconst.
df_nconst = (
    df_names[['nconst', 'primaryName']]
    .rename(columns={'primaryName': 'name'})
    .set_index('nconst', drop=False)  # keep 'nconst' as a column too
    .rename_axis('id')
)

nconst_names = df_nconst.to_dict('index')

In [None]:
# saving dict into pickle
# `nconst_names` is a plain dict, which has no `.to_pickle()` method; pandas'
# module-level helper pickles arbitrary Python objects.
pd.to_pickle(nconst_names, 'nconst_names.pickle')

In [None]:
# persist the nconst lookup dict for future use, using the highest available
# pickle protocol (pickle is already imported in the first cell)
with open('nconst_names.pickle', mode='wb') as pickle_file:
    pickle.dump(nconst_names, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Saving lists to pickle

In [None]:
# Save the original titles of all movies as a plain list.
film_name = merged['originalTitle'].tolist()

film_name_path = Path('generated/film_name_list.pickle')
# The output folder may not exist yet on a fresh checkout.
film_name_path.parent.mkdir(parents=True, exist_ok=True)
with film_name_path.open('wb') as f:
    pickle.dump(film_name, f)

In [None]:
# Collect the distinct names of every actor/actress and crew member.
person_names = set()
for people_column in ('actor/actress', 'crew'):
    for people in merged[people_column]:
        # Titles that found no match in the left merges hold NaN here, which is
        # a truthy float: test the type instead of the truthiness before iterating.
        if not isinstance(people, list):
            continue
        # Skip people whose nconst had no name in df_names (their name is NaN).
        person_names.update(
            person['name'] for person in people if isinstance(person['name'], str)
        )

# The set already dropped duplicates; sort for a reproducible output order.
person_name_list = sorted(person_names)

person_name_path = Path('generated/person_name_list.pickle')
# The output folder may not exist yet on a fresh checkout.
person_name_path.parent.mkdir(parents=True, exist_ok=True)
with person_name_path.open('wb') as f:
    pickle.dump(person_name_list, f)