# Preprocessing

In this part we extract the relevant data into new compressed JSON files, small enough to fit in RAM, and store them in the 'output' folder.

Basic filtering is applied:
 - Only quotations with a good confidence are kept
 - Only quotations referring to our chosen subject are kept

## Colab Setup

In [1]:
# Mount Google Drive at /content/drive so the project folder stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sns
import bz2
import json
import os
from importlib import reload

## FIRST TIME? Run the two lines quoted in the string below (remove the triple quotes) to clone the project.
# If you don't have a token, see https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
"""
os.chdir('/content/drive/MyDrive/')
!git clone https://USERNAME:TOKEN@github.com/epfl-ada/ada-2021-project-adada-sur-mon-bidet.git
"""

# Work from the project root so that relative paths such as 'Quotebank/' and 'output/' resolve.
os.chdir('/content/drive/MyDrive/ada-2021-project-adada-sur-mon-bidet/')
# Imported only after the chdir: helpers.py lives in the project folder (see the `!ls` listing).
import helpers
# NOTE(review): presumably pulls the latest commits of the repository - confirm in helpers.py.
# NOTE(review): this call appears to print a git token to the cell output; clear outputs before sharing.
helpers.git_pull()
# Re-import helpers so that any version fetched by the pull is the one in use.
reload(helpers)
!ls

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
git token[REDACTED]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
gitignore   output	 Quotebank  word_embeddings.ipynb
helpers.py  __pycache__  README.md


In [5]:
# Input and output folders, relative to the project root.
data_path = 'Quotebank/'
out_path  = 'output/'

# Years covered by the data files, newest first.
years = range(2020, 2014, -1)

# os.listdir() returns entries in arbitrary order; sort newest-first so that
# each year is paired with its own quotes-<year> file when zipped below.
data_files = sorted(os.listdir(data_path), reverse=True)
path_to_files = dict(zip(years, [data_path + f for f in data_files]))
path_to_out = dict(zip(years, [out_path + f for f in data_files]))
path_to_out

{2015: 'output/quotes-2015.json.bz2',
 2016: 'output/quotes-2016.json.bz2',
 2017: 'output/quotes-2017.json.bz2',
 2018: 'output/quotes-2018.json.bz2',
 2019: 'output/quotes-2019.json.bz2',
 2020: 'output/quotes-2020.json.bz2'}

## Sampling and json streamed filtering

In [41]:
def sample_json(file = None, size=3, filters = None):
    """
    Returns a sample of JSON lines read from the start of a bz2 file.

    file: path to a bz2-compressed file holding one JSON object per line;
          when None, path_to_files[2020] is used (looked up at call time)
    size: number of lines read from the top of the file; this is an upper
          bound on the number of returned records, because lines rejected
          by a filter still count towards it
    filters: iterable of functions with signature f(json line) = boolean;
             a line is kept only if every filter accepts it
             (None or empty keeps every line read)
    returns: list with the parsed lines that passed every filter
    """
    if file is None:
        file = path_to_files[2020]
    if filters is None:
        filters = ()

    data = []
    with bz2.open(file, 'rb') as s_file:
        # zip() with range(size) stops after `size` lines without reading the rest.
        for _, instance in zip(range(size), s_file):
            line = json.loads(instance)
            # Generator expression: all() stops at the first rejecting filter.
            if all(keep(line) for keep in filters):
                data.append(line)

    return data

def keywords(line):
    """Tells whether the quotation of a JSON line contains one of the probe words."""
    return any([word in line["quotation"] for word in ["awesome"]])

# Scan the first 1000 lines of the default file and count the matching quotations.
h = sample_json(size = 1000, filters = [keywords])
print(len(h))

2


In [42]:
def filter_json(infile, outfile, filters = None):
    """
    Streamed filtering of infile to outfile, one JSON line at a time.

    infile: quotes-*.json.bz2
    outfile: *.json.bz2
    filters: iterable of functions with signature f(json line) = boolean;
             a line is written only if every filter accepts it
             (None or empty copies every line)
    returns: None; the result is the file written at outfile
    """
    if filters is None:
        filters = ()

    with bz2.open(infile, 'rb') as s_file, bz2.open(outfile, 'wb') as o_file:
        for instance in s_file:
            line = json.loads(instance)
            # Generator expression: all() stops at the first rejecting filter,
            # so put the cheapest / most selective filter first.
            if all(keep(line) for keep in filters):
                o_file.write((json.dumps(line) + '\n').encode('utf-8'))
    return

def high_prob(threshold = 0.9):
    """
    Builds a filter on the first entry of a line's "probas" list.

    threshold: the first entry's probability must be strictly greater than this
    returns: function with signature f(json line) = boolean; it is False when
             the first entry's label is the string 'None', otherwise it tells
             whether the first entry's probability exceeds the threshold
    """
    def is_confident(line):
        # A top label of 'None' rejects the line without looking at its probability.
        if line["probas"][0][0] == 'None':
            return False
        return float(line["probas"][0][1]) > threshold

    return is_confident

def contains_topic(topic_dictionary):
    """
    Builds a filter that looks for topic words inside a line's quotation.

    topic_dictionary: iterable of words; matching is a plain substring test
    returns: function with signature f(json line) = boolean, True when at
             least one word occurs in line["quotation"]
    """
    def mentions_topic(line):
        # Every word is tested (no early exit) before the results are combined.
        hits = [word in line["quotation"] for word in topic_dictionary]
        return any(hits)

    return mentions_topic

# Substrings that mark a quotation as on-topic (the substring test is case-sensitive).
climate_dict = ["Climate", "climate", "COP", "renewable", "IPCC"]

# Stream the 2020 file and keep the lines accepted by both filters.
filter_json(
    infile=path_to_files[2020],
    outfile='output/climate_highprob2020.json.bz2',
    filters=[contains_topic(climate_dict), high_prob(threshold=0.9)],
)

In [57]:
# Load the filtered file back into memory, one JSON record per line.
data = []
with bz2.open('output/climate_highprob2020.json.bz2', 'rb') as s_file:
    for instance in s_file:
        # Parse with json.loads only: running eval() on the JSON text raises
        # NameError on null/true/false and would execute file content as code.
        data.append(json.loads(instance))

df = pd.DataFrame(data)
df.index.size

3352