In [2]:
import bz2
import json
import numpy as np
import pandas as pd
import spacy
import pickle
from pathlib import Path

from IPython.display import display, HTML

from ressources import config
from ressources.empath_cat import Empath

In [2]:
# Data locations re-exported from the project's config module (ressources/config)
RAW_DATA_FOLDER, GENERATED_DATA_FOLDER = (
    config.RAW_DATA_FOLDER,
    config.GENERATED_DATA_FOLDER,
)

## QUOTEBANK Dataset
We explore all Quotebank datasets (2015 to 2019) to get an idea of their size.

In [3]:
# Folder holding the Quotebank dumps and the list of bz2-compressed JSON files found in it.
QUOTEBANK_FOLDER = RAW_DATA_FOLDER / "QUOTEBANK"
# glob() yields paths in an arbitrary, filesystem-dependent order; sorting makes
# file_list[0] and the processing order identical on every machine and every run.
file_list = sorted(QUOTEBANK_FOLDER.glob('*.json.bz2'))

### List of features

In [None]:
# chunksize=1 makes read_json return an iterator of one-row DataFrames;
# the first one is kept as a sample and the iteration stops right away.
with pd.read_json(file_list[0], lines=True, compression='bz2', chunksize=1) as df_reader:
    for df_quotebank in df_reader:
        break  # a single record is enough to inspect the schema
# column list for Quotebank dataset
print(f"\nColumns quotebank:\n{df_quotebank.columns}")

### Sample

In [None]:
# Show the one-row Quotebank sample loaded above, using the notebook's rich display
sample_heading = "\nSample quotebank:\n"
print(sample_heading)
display(df_quotebank)

### Number of rows

In [None]:
# Count the rows of every Quotebank file by streaming it in fixed-size chunks,
# so that no file has to be held in memory as a whole.
chunksize = 10000
for file in file_list:
    print(f"Processing file {file.name}")
    row_count = 0
    with pd.read_json(file, lines=True, compression='bz2', chunksize=chunksize) as reader:
        for batch in reader:
            row_count += batch.shape[0]
            # "\r" rewinds the cursor so the running total overwrites itself
            print(row_count, end = "\r")

    print(f"{row_count} rows in {file.name}")

Output:

Processing file quotes-2015.json.bz2<br>
20874338 rows in quotes-2015.json.bz2<br>
Processing file quotes-2016.json.bz2<br>
13862129 rows in quotes-2016.json.bz2<br>
Processing file quotes-2017.json.bz2<br>
26611588 rows in quotes-2017.json.bz2<br>
Processing file quotes-2018.json.bz2<br>
27228451 rows in quotes-2018.json.bz2<br>
Processing file quotes-2019.json.bz2<br>
21763302 rows in quotes-2019.json.bz2<br>

## Open quotebank 2018 and add columns for gender and dob 
In agreement with the assistant, we decided to focus on only one year. We chose 2018 because it is the year with the most quotes. Here we use the Wikidata people dictionary we created in prepro_WIKIDATA.ipynb to add the gender and date of birth to each quotation.

In [12]:
# Load the pickled Wikidata people dictionary.
# NOTE(review): unpickling can execute arbitrary code; only load files generated by this project.
wikidata_pickle_path = '../generated/WIKIDATA/dict_wikidata_people.pickle'
with open(wikidata_pickle_path, 'rb') as pickle_file:
    dict_wikidata_people = pickle.load(pickle_file)

In [None]:
# Enrich every 2018 quote with the gender and date of birth of its candidate speakers.
# For each entry of the quote's 'qids' list, one entry is appended to the new
# 'gender' and 'dob' lists (same order as 'qids'); the string 'None' is used
# when the qid is absent from dict_wikidata_people.
path_to_file = '../data/QUOTEBANK/quotes-2018.json.bz2'
# NOTE: was '.../temp/...' (three dots), which does not point to the parent folder.
# NOTE(review): later cells read this file from '../generated/QUOTEBANK/' — confirm how it gets there.
path_to_out = '../temp/QUOTEBANK/people_quotes-2018.json.bz2'

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for line in s_file:
            instance = json.loads(line) # loading a sample
            gender = []
            dob = []
            for qid in instance['qids']:
                # Direct dictionary lookup instead of scanning every Wikidata entry per qid
                person = dict_wikidata_people.get(qid)
                if person is None:
                    gender.append('None')
                    dob.append('None')
                else:
                    gender.append(person['genderlabel'])
                    dob.append(person['dob_std'])
            instance['gender'] = gender # updating the sample with gender
            instance['dob'] = dob # updating the sample with date of birth
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

### Filtering out data
We filter the data for the first question by keeping the quotes that contain the word "movie", "cinema" or "film". We also keep the quotes that contain these words in their URLs.
We tried looking for film names in the quotes, but it gave a lot of false positives (a lot of films are named with common English words). Therefore, for the first question, we used only this filter.

In [None]:
# Keep only the quotes that mention a movie-related keyword, either in the
# quotation itself or in one of its source URLs. Matching is a case-sensitive
# substring test. Every retained quote is written exactly once.
path_to_file = '../generated/QUOTEBANK/people_quotes-2018.json.bz2'
# NOTE: was '.../temp/...' (three dots), which does not point to the parent folder.
path_to_out = '../temp/QUOTEBANK/movie_quotes-2018.json.bz2'

movie_keywords = ('cinema', 'film', 'movie')

with bz2.open(path_to_file, 'rb') as in_file:
    with bz2.open(path_to_out, 'wb') as out_file:
        for line in in_file:
            instance = json.loads(line)
            # Quotation first, then the URLs; any() stops at the first hit, so a
            # quote with several matching URLs is not written several times.
            searched_texts = [instance['quotation'], *instance['urls']]
            if any(keyword in text for text in searched_texts for keyword in movie_keywords):
                out_file.write((json.dumps(instance)+'\n').encode('utf-8'))

### Open complete Quotebank 2018 and search movies in quotes
For the second question, we need to know which quotes speak about which movie. For that, we took the Empath library and customized it to match our needs.

In [2]:
# Instantiate the project's customised Empath lexicon (ressources.empath_cat)
# NOTE(review): the argument is presumably the folder holding the category data — confirm
empath_resource_path = 'ressources/empath_cat'
lexicon = Empath(empath_resource_path)

We use a list that comes from prepro_IMDb.ipynb. It contains the English names of all films. With this list, we create an Empath category.

In [4]:
# Load the pickled list of film names, then drop duplicate names
# NOTE(review): unpickling can execute arbitrary code; only load files generated by this project.
with open('../generated/IMDb/film_name_list.pickle', 'rb') as pickle_file:
    film_names_loaded = pickle.load(pickle_file)
unique_film_names = set(film_names_loaded)
list_imdb_san = list(unique_film_names)
# number of distinct film names
print(len(list_imdb_san))

525377


In [4]:
# Register all de-duplicated film names under a single custom lexicon category
film_category_name = 'ADA_film_name'
lexicon.create_category(film_category_name, list_imdb_san)

We filter the dataset to keep only the quotes containing a film name. We write one line per film: for example, if a quote contains the names of two films, we write 2 lines for it. The purpose is to make the groupby in Question2 easier. We also add two columns: the id of the film and the name of the film.

In [None]:
# Search film names in every quote and write one output line per (quote, film) match.
# Each written line is the quote without the 'probas', 'urls' and 'qids' columns,
# plus two new columns: 'film' (the matched name) and 'id_film' (its id, or 'Unknown').
path_to_file = '../generated/QUOTEBANK/people_quotes-2018.json.bz2'
path_to_out = '../temp/QUOTEBANK/moviefiltered_10tk_quotes-2018.json.bz2'

lexicon = Empath('ressources/empath_cat')

# dictionary coming from IMDb.ipynb mapping tconst id to film title
# NOTE(review): unpickling can execute arbitrary code; only load files generated by this project.
with open('../generated/IMDb/tconst_title_dict.pickle', 'rb') as f:
    tconst_title_dict = pickle.load(f)
# dictionary mapping film title to tconst id (when several ids share a title, the last one wins)
invert_tconst_dict = {v: k for k, v in tconst_title_dict.items()}

# Columns removed from every written line
dropped_columns = ('probas', 'urls', 'qids')

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for line in s_file:
            instance = json.loads(line) # loading a sample
            if instance['quotation'] != 'None':
                # NOTE(review): tokenizer=10 is interpreted by the customised Empath; presumably
                # related to the '10tk' in the output file name — confirm its meaning.
                features = lexicon.analyze(instance['quotation'],['ADA_film_name'],tokenizer=10,verbose=True,debug = True)
                if features['count']['ADA_film_name']>0 :
                    # Built once per quote: identical for every film matched in it
                    base = {key: value for key, value in instance.items() if key not in dropped_columns}
                    for film in features['match']:
                        result = dict(base)
                        result['film'] = film
                        # The lookup key is the match with '_' turned back into spaces
                        result['id_film'] = invert_tconst_dict.get(film.replace("_", " "), 'Unknown')
                        d_file.write((json.dumps(result)+'\n').encode('utf-8'))
