In [2]:
import bz2
import json
import numpy as np
import pandas as pd
import spacy
import pickle
from pathlib import Path

from IPython.display import display, HTML

from ressources import config
from ressources.empath_cat import Empath

In [2]:
# Re-export the data locations from the shared project config under short
# module-level names; the rest of the notebook builds its paths from these two.
RAW_DATA_FOLDER, GENERATED_DATA_FOLDER = (
    config.RAW_DATA_FOLDER,
    config.GENERATED_DATA_FOLDER,
)

## QUOTEBANK Dataset
Exploring and filtering the Quotebank dataset.

In [3]:
# Folder holding the raw, bz2-compressed Quotebank dumps.
QUOTEBANK_FOLDER = RAW_DATA_FOLDER / "QUOTEBANK"
# Path.glob yields entries in an OS/filesystem-dependent order. Sorting makes
# `file_list[0]` and the per-file processing order reproducible across
# machines and runs. The result is still a plain list of Path objects.
file_list = sorted(QUOTEBANK_FOLDER.glob('*.json.bz2'))

### List of features

In [None]:
# Peek at the first record only: with chunksize=1 the reader hands back
# one-row DataFrames, so a single step is enough to discover the columns
# without decompressing the whole file.
with pd.read_json(file_list[0], lines=True, compression='bz2', chunksize=1) as df_reader:
    first_chunk = next(iter(df_reader), None)
    if first_chunk is not None:
        df_quotebank = first_chunk
# column list for Quotebank dataset
print(f"\nColumns quotebank:\n{df_quotebank.columns}")

### Sample

In [None]:
# Show the one-row sample read in the "List of features" cell.
# `display` is used instead of `print` so the DataFrame renders as a rich
# HTML table in the notebook.
print("\nSample quotebank:\n")
display(df_quotebank)

### Number of rows

In [None]:
# Count the rows of every Quotebank file. Each file is streamed in fixed-size
# chunks and the chunk lengths are summed, so no file is ever fully in memory.
chunksize = 10000
for file in file_list:
    print(f"Processing file {file.name}")
    n = 0
    with pd.read_json(file, lines=True, compression='bz2', chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            n = n + chunk.shape[0]
            # '\r' rewinds to the start of the line so the running total is
            # overwritten in place instead of flooding the output.
            print(n, end="\r")

    print(f"{n} rows in {file.name}")

Output:

Processing file quotes-2015.json.bz2<br>
20874338 rows in quotes-2015.json.bz2<br>
Processing file quotes-2016.json.bz2<br>
13862129 rows in quotes-2016.json.bz2<br>
Processing file quotes-2017.json.bz2<br>
26611588 rows in quotes-2017.json.bz2<br>
Processing file quotes-2018.json.bz2<br>
27228451 rows in quotes-2018.json.bz2<br>
Processing file quotes-2019.json.bz2<br>
21763302 rows in quotes-2019.json.bz2<br>

### Filtering out data
We keep only the quotes whose text, or at least one of whose source URLs, contains one of the keywords `cinema`, `film` or `movie`.

In [None]:
# Keywords that mark a quotation (or one of its source URLs) as cinema-related.
# Matching is a plain, case-sensitive substring test, so e.g. "Film" with a
# capital F is NOT matched.
CINEMA_KEYWORDS = ('cinema', 'film', 'movie')


def _mentions_cinema(text):
    """Return True when `text` contains at least one of CINEMA_KEYWORDS."""
    return any(keyword in text for keyword in CINEMA_KEYWORDS)


# Every filtered file is written to <GENERATED_DATA_FOLDER>/QUOTEBANK. Create
# the folder up front so bz2.open(..., 'wb') does not fail on a fresh checkout.
# NOTE(review): assumes GENERATED_DATA_FOLDER is a pathlib.Path (it is combined
# with `/` below, as RAW_DATA_FOLDER is elsewhere) - confirm in config.
path_to_out_dir = GENERATED_DATA_FOLDER / "QUOTEBANK"
path_to_out_dir.mkdir(parents=True, exist_ok=True)

for file in file_list:
    # "quotes-2015.json.bz2" -> "quotes-2015-cinema.json.bz2"
    stem, extension = file.name.split('.', 1)
    path_to_out = path_to_out_dir / f"{stem}-cinema.{extension}"
    with bz2.open(file, 'rb') as in_file, bz2.open(path_to_out, 'wb') as out_file:
        for instance in in_file:
            instance = json.loads(instance)  # one JSON record per line
            quote = instance['quotation']
            urls = instance['urls']
            # Keep the record if the quote itself, or any of its source URLs,
            # mentions a keyword. `or` / `any` short-circuit, so each record is
            # written at most once.
            if _mentions_cinema(quote) or any(_mentions_cinema(url) for url in urls):
                out_file.write((json.dumps(instance) + '\n').encode('utf-8'))

### Open Quotebank 2018 and add columns for gender and date of birth (dob)

In [12]:
# Load the pre-built Wikidata people dictionary from its pickle file.
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# pickles that were generated by this project.
with open('../generated/WIKIDATA/dict_wikidata_people.pickle', 'rb') as f:
    dict_wikidata_people = pickle.load(f)

In [None]:
path_to_file = '../data/QUOTEBANK/quotes-2018.json.bz2'
# The destination used to be the literal '.../people_quotes-2018.json.bz2',
# i.e. a folder named '...'. It now points at the location the movie-search
# cell reads this file from.
# NOTE(review): confirm this is the intended folder.
path_to_out = '../generated/QUOTEBANK/people_quotes-2018.json.bz2'

# Enrich every quotation with the gender and date of birth of each candidate
# speaker. The output lists are parallel to instance['qids']: position i holds
# the value for qids[i], or the string 'None' when the qid is not present in
# the Wikidata dictionary.
with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    for instance in s_file:
        instance = json.loads(instance)  # one JSON record per line
        qids = instance['qids']  # candidate speaker ids for this quotation
        gender = []
        dob = []
        for qid in qids:
            # Direct dictionary lookup: one entry per qid, instead of scanning
            # every item of the dictionary for each qid.
            person = dict_wikidata_people.get(qid)
            if person is None:
                gender.append('None')
                dob.append('None')
            else:
                gender.append(person['genderlabel'])
                # NOTE(review): 'dob_std' must be JSON-serialisable for the
                # json.dumps call below - confirm how the dictionary stores it.
                dob.append(person['dob_std'])
        instance['gender'] = gender  # one gender label per qid
        instance['dob'] = dob  # one date of birth per qid
        d_file.write((json.dumps(instance) + '\n').encode('utf-8'))

### Open the complete Quotebank 2018 data and search for movie titles in quotes

In [2]:
# Lexicon initialization: build the project's Empath lexicon object
# (imported from ressources.empath_cat). The string is passed straight to the
# constructor.
# NOTE(review): presumably the folder holding the category data - confirm
# against the Empath class.
lexicon = Empath('ressources/empath_cat')

In [4]:
# Load the pickled list of movie names, then drop duplicate titles.
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# pickles that were generated by this project.
with open('../generated/IMDb/film_name_list.pickle', 'rb') as f:
    film_names_raw = pickle.load(f)
# Going through a set removes duplicates; the resulting order is arbitrary.
list_imdb_san = list(set(film_names_raw))
print(len(list_imdb_san))

525377


In [4]:
# Register a custom lexicon category named 'ADA_film_name' whose terms are the
# de-duplicated movie names in list_imdb_san. The movie-search cell asks the
# lexicon to analyze quotations against this category.
lexicon.create_category('ADA_film_name',list_imdb_san)

In [None]:
# Search each quotation for known movie names and keep only the quotations
# that mention at least one; the matches are stored in a new 'film' field.
# NOTE(review): the output goes to '../temp/QUOTEBANK/...', while the cell that
# inspects the result reads '../generated/QUOTEBANK/...' - confirm whether the
# file is moved by hand between the two steps.
path_to_file = '../generated/QUOTEBANK/people_quotes-2018.json.bz2'
path_to_out = '../temp/QUOTEBANK/moviefiltered_quotes-2018.json.bz2'

with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    n = 0  # number of records read so far
    for instance in s_file:
        n += 1
        instance = json.loads(instance)  # one JSON record per line
        if instance['quotation'] == 'None':
            continue
        features = lexicon.analyze(
            instance['quotation'],
            ['ADA_film_name'],
            tokenizer=3,
            verbose=True,
            debug=True,
        )
        if not (features['count']['ADA_film_name'] > 0):
            continue
        instance['film'] = features['match']
        # Drop the bulky fields that are not needed downstream.
        for dropped_key in ('probas', 'urls', 'qids'):
            instance.pop(dropped_key, None)
        d_file.write((json.dumps(instance) + '\n').encode('utf-8'))


In [None]:
path_to_out = '../generated/QUOTEBANK/moviefiltered_quotes-2018.json.bz2'

# Sanity check: decode and print only the first record of the filtered file.
with bz2.open(path_to_out, 'rb') as f:
    first_line = next(f, None)
    if first_line is not None:
        instance = json.loads(first_line)
        print(instance)