In [1]:
import bz2
import json
import numpy as np
import pandas as pd
from pathlib import Path

from IPython.display import display, HTML

from ressources import config

In [2]:
# Base folder of the raw input data, as defined in the project's config module.
RAW_DATA_FOLDER = config.RAW_DATA_FOLDER
# Base folder under which the files generated by this notebook are written.
GENERATED_DATA_FOLDER = config.GENERATED_DATA_FOLDER

## QUOTEBANK Dataset
Exploration and filtering of the Quotebank dataset.

In [3]:
# Folder holding the raw Quotebank dumps (bz2-compressed JSON-lines files).
QUOTEBANK_FOLDER = RAW_DATA_FOLDER / "QUOTEBANK"
# Path.glob yields entries in an OS-dependent order; sorting makes file_list[0]
# and the order in which the files are processed reproducible across machines.
file_list = sorted(QUOTEBANK_FOLDER.glob('*.json.bz2'))

### List of features

In [5]:
# Peek at the first file only: with chunksize=1 the reader yields one-row
# DataFrames, and we stop after the very first one.
first_file = file_list[0]
with pd.read_json(first_file, lines=True, compression='bz2', chunksize=1) as df_reader:
    # Binding the loop variable directly keeps the first chunk as df_quotebank.
    for df_quotebank in df_reader:
        break
# Column names of the Quotebank records
print(f"\nColumns quotebank:\n{df_quotebank.columns}")


Columns quotebank:
Index(['quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences',
       'probas', 'urls', 'phase'],
      dtype='object')


### Sample

In [6]:
# Show a sample of the Quotebank dataset: the one-row DataFrame read from the first file.
print("\nSample quotebank:\n")
# display() renders the frame with its rich notebook representation rather than plain text.
display(df_quotebank)


Sample quotebank:



Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2018-06-30-000005,... a minimum of 5.25 trillion (plastic) parti...,Marcus Eriksen,[Q55997400],2018-06-30 07:00:00,3,"[[Marcus Eriksen, 0.6814], [None, 0.3186]]",[http://www.santacruzsentinel.com/environment-...,E


### Number of rows

In [None]:
# Count the records of every Quotebank file by streaming it chunk by chunk,
# so that no file has to be held in memory as a whole.
chunksize = 10000
for file in file_list:
    print(f"Processing file {file.name}")
    n = 0
    with pd.read_json(file, lines=True, compression='bz2', chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            n = n + chunk.shape[0]
            # "\r" rewinds to the start of the line, so the running total is overwritten in place.
            print(n, end = "\r")

    print(f"{n} rows in {file.name}")

Output:

Processing file quotes-2015.json.bz2<br>
20874338 rows in quotes-2015.json.bz2<br>
Processing file quotes-2016.json.bz2<br>
13862129 rows in quotes-2016.json.bz2<br>
Processing file quotes-2017.json.bz2<br>
26611588 rows in quotes-2017.json.bz2<br>
Processing file quotes-2018.json.bz2<br>
27228451 rows in quotes-2018.json.bz2<br>
Processing file quotes-2019.json.bz2<br>
21763302 rows in quotes-2019.json.bz2<br>

### Filtering out data
We keep only the records in which the quotation itself, or at least one of its source URLs, contains one of the keywords `cinema`, `film` or `movie`.

In [None]:
# Substrings that mark a quotation or a source URL as cinema-related.
# The test is a plain, case-sensitive `in` check, so e.g. 'Film' does not match.
CINEMA_KEYWORDS = ('cinema', 'film', 'movie')


def _mentions_cinema(text):
    """Return True when `text` contains at least one of CINEMA_KEYWORDS."""
    return any(keyword in text for keyword in CINEMA_KEYWORDS)


# Create the output folder up front: bz2.open cannot write into a missing directory.
# Assumes GENERATED_DATA_FOLDER is a pathlib.Path, as its use with `/` suggests.
path_to_out_folder = GENERATED_DATA_FOLDER / "QUOTEBANK"
path_to_out_folder.mkdir(parents=True, exist_ok=True)

for file in file_list:
    # 'quotes-2015.json.bz2' -> 'quotes-2015-cinema.json.bz2'
    stem, extension = file.name.split('.', 1)
    path_to_out = path_to_out_folder / f"{stem}-cinema.{extension}"
    with bz2.open(file, 'rb') as in_file, bz2.open(path_to_out, 'wb') as out_file:
        # Each input line is parsed as one JSON record.
        for line in in_file:
            instance = json.loads(line)
            # Keep the record when the quotation matches or, failing that, when any
            # of its URLs does; either way it is written at most once.
            if _mentions_cinema(instance['quotation']) or any(
                _mentions_cinema(url) for url in instance['urls']
            ):
                out_file.write((json.dumps(instance) + '\n').encode('utf-8'))

## The Internet Movie Database

### The datasets of the IMDb 

Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. The first line in each file contains headers that describe what is in each column. A ‘\N’ is used to denote that a particular field is missing or null for that title/name. The available datasets are as follows:

title.akas.tsv.gz - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- title (string) – the localized title
- region (string) - the region for this version of the title
- language (string) - the language of the title
- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- attributes (array) - Additional terms to describe this alternative title, not enumerated
- isOriginalTitle (boolean) – 0: not original title; 1: original title

title.basics.tsv.gz - Contains the following information for titles:

- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

title.crew.tsv.gz – Contains the director and writer information for all the titles in IMDb. Fields include:

- tconst (string) - alphanumeric unique identifier of the title
- directors (array of nconsts) - director(s) of the given title
- writers (array of nconsts) – writer(s) of the given title

title.episode.tsv.gz – Contains the TV episode information. Fields include:

- tconst (string) - alphanumeric identifier of episode
- parentTconst (string) - alphanumeric identifier of the parent TV Series
- seasonNumber (integer) – season number the episode belongs to
- episodeNumber (integer) – episode number of the tconst in the TV series

title.principals.tsv.gz – Contains the principal cast/crew for titles

- tconst (string) - alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- nconst (string) - alphanumeric unique identifier of the name/person
- category (string) - the category of job that person was in
- job (string) - the specific job title if applicable, else '\N'
- characters (string) - the name of the character played if applicable, else '\N'

title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles

- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received

name.basics.tsv.gz – Contains the following information for names:

- nconst (string) - alphanumeric unique identifier of the name/person
- primaryName (string)– name by which the person is most often credited
- birthYear – in YYYY format
- deathYear – in YYYY format if applicable, else '\N'
- primaryProfession (array of strings)– the top-3 professions of the person
- knownForTitles (array of tconsts) – titles the person is known for

source : https://www.imdb.com/interfaces/

### Goal to achieve 
In this IMDb pre-processing notebook, we import the datasets that were defined as being of interest (not all of them: title.akas.tsv.gz was left behind - sorry, title.akas.tsv.gz) and process them in order to obtain one main dataset containing the rows and columns that are relevant to our project. The datasets are merged using the identifiers (tconst and nconst) that link the tables to one another.

### Importing datasets

In [None]:
# Paths to the gzipped, tab-separated IMDb dumps used below.

# Title-level tables
titles = './data/title.basics.tsv.gz'
akas = './data/title.akas.tsv.gz'
crew = './data/title.crew.tsv.gz'
principals = './data/title.principals.tsv.gz'
ratings = './data/title.ratings.tsv.gz'

# Person-level table
names = './data/name.basics.tsv.gz'

In [None]:
# Load the people table. IMDb encodes missing values as the literal '\N',
# which is mapped to NaN here. The birth/death years are dropped right away.
df_names = (
    pd.read_csv(names, sep = '\t', compression = "infer", na_values = '\\N')
    .drop(columns = ['birthYear', 'deathYear'])
)

In [None]:
# Load the principal cast/crew credits ('\N' marks a missing value).
df_principals = pd.read_csv(principals, sep = '\t', compression = "infer", na_values = '\\N')

# Translate every person identifier (nconst) into the person's primary name.
name_by_nconst = df_names.set_index('nconst')['primaryName']
df_principals['name'] = df_principals['nconst'].map(name_by_nconst)

# Keep only the columns used further down.
kept_columns = ['tconst', 'ordering', 'nconst', 'name', 'category']
df_principals = df_principals[kept_columns]

In [None]:
# Keep only the acting credits.
actors = ['actor', 'actress']
# The explicit .copy() makes the subset an independent frame: a later cell adds a
# column to it, which on a bare filtered slice raises pandas' SettingWithCopyWarning.
df_principals_actors = df_principals[df_principals['category'].isin(actors)].copy()

In [None]:
# Index the acting credits by (tconst, ordering) and turn every row into a
# {column: value} dict keyed by that index pair.
# set_index already returns a new frame, so df_principals_actors is left untouched.
df_principals_actors_bis = df_principals_actors.set_index(['tconst', 'ordering'])
# NOTE(review): the name `dict` shadows the Python builtin from here on; it is
# kept because the following cell reads it under that name.
dict = df_principals_actors_bis.to_dict(orient='index')

In [None]:
# Attach to every acting credit its row dict; the dicts were built from this
# same frame, so they come in the same row order.
df_principals_actors['actor/actress'] = dict.values()

# Collapse the credits to one row per title, every column becoming a list.
df_principals_actors = (
    df_principals_actors
    .groupby(['tconst'])
    .agg(lambda column_values: tuple(column_values))  # gather each column's values per title
    .applymap(list)                                   # tuples -> lists
    .reset_index()
)

In [None]:
# These columns are discarded; the per-credit information is already held in the 'actor/actress' dicts.
redundant_actor_columns = ['category', 'ordering', 'nconst', 'name']
df_principals_actors = df_principals_actors.drop(columns = redundant_actor_columns)

In [None]:
# Every credit that is neither an acting credit nor a 'self' appearance is treated as crew.
# NOTE: `actors` is rebound here with 'self' added, so it no longer equals the
# list that was used to build the actor subset.
actors = ['actor', 'actress', 'self']
# The explicit .copy() makes the subset an independent frame: a later cell adds a
# column to it, which on a bare filtered slice raises pandas' SettingWithCopyWarning.
df_principals_crew = df_principals[~df_principals['category'].isin(actors)].copy()

In [None]:
# Crew credits indexed by (tconst, ordering); set_index returns a new frame,
# so df_principals_crew itself is not modified.
df_principals_crew_bis = df_principals_crew.set_index(['tconst', 'ordering'])

In [None]:
# One {column: value} dict per crew credit, keyed by the (tconst, ordering) index.
# NOTE(review): `dict` shadows the Python builtin; the name is kept because the next cell reads it.
dict = df_principals_crew_bis.to_dict(orient='index')

In [None]:
# Attach to every crew credit its row dict; the dicts were built from this same
# frame, so they come in the same row order.
df_principals_crew['crew'] = dict.values()

# Collapse the credits to one row per title, every column becoming a list.
df_principals_crew = (
    df_principals_crew
    .groupby(['tconst'])
    .agg(lambda column_values: tuple(column_values))  # gather each column's values per title
    .applymap(list)                                   # tuples -> lists
    .reset_index()
)

In [None]:
# Discard the per-credit columns that are already held in the 'crew' dicts.
# NOTE(review): unlike the equivalent actors cell, 'nconst' is NOT dropped here,
# so it stays in the frame - confirm whether that is intended.
redundant_crew_columns = ['category', 'ordering', 'name']
df_principals_crew = df_principals_crew.drop(columns = redundant_crew_columns)

In [None]:
# Load the basic title information ('\N' marks a missing value) and discard
# the columns that are not used in this notebook.
df_titles = (
    pd.read_csv(titles, sep = '\t', compression = "infer", na_values = '\\N')
    .drop(columns = ['endYear', 'isAdult', 'primaryTitle'])
)

In [None]:
# Directors and writers of every title ('\N' marks a missing value).
df_crew = pd.read_csv(crew, sep = '\t', na_values = '\\N', compression = "infer")

In [None]:
# Average rating and number of votes of every title ('\N' marks a missing value).
df_ratings = pd.read_csv(ratings, sep = '\t', na_values = '\\N', compression = "infer")

### Merging the datasets

Here, we are merging df_titles, df_crew and df_ratings

In [18]:
# Join every title with its directors and writers; the default inner join on
# 'tconst' keeps only the titles present in both tables.
merged = df_titles.merge(df_crew, on = 'tconst')

In [19]:
# List the distinct title types present in the merged table; the next cell
# uses this to keep only the 'movie' rows.
merged['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [20]:
# Restrict the table to the rows whose title type is exactly 'movie'
# (rows with a missing title type are dropped as well).
merged = merged[merged['titleType'] == 'movie']

print(f'the dataframe contains now {len(merged)} rows')

the dataframe contains now 593343 rows


In [21]:
# 'titleType' is constant after the movie filter, so it is dropped, and
# 'startYear' gets the shorter name 'year'.
merged = (
    merged
    .drop(columns = ['titleType'])
    .rename(columns = {'startYear' : 'year'})
)

In [22]:
# Left join: every movie is kept, including those without any acting credit.
merged = pd.merge(merged, df_principals_actors, how = 'left', on = 'tconst')

In [23]:
# Left join: every movie is kept, including those without any crew credit.
merged = pd.merge(merged, df_principals_crew, how = 'left', on = 'tconst')