In [None]:
# Make the user's Google Drive visible to this Colab runtime; all data paths
# used by the notebook live underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import figure
import matplotlib.dates as mdates
import json

# **1. Loading Data**



Here are the paths to the data we use.

In [None]:
# Project root on the mounted Google Drive. Every other path is derived from
# it, so relocating the project only requires editing this single line.
PATH_ROOT = '/content/drive/MyDrive/EPFL/ADA'

PATH_PARQUET = f'{PATH_ROOT}/Project datasets'
PATH_QUOTEBANK = f'{PATH_ROOT}/Quotebank'
# JSON file with the quotes and their assigned topic (read in section 1.1).
PATH_TO_QUOTES = f'{PATH_ROOT}/quotes_topics.json'
# Output folder for the cleaned dataset written at the end of the notebook.
PATH_TO_CLEAN = f'{PATH_ROOT}/Cleaned_data'

## 1.1 Loading quote topics

In [None]:
# The JSON file is keyed by row label, hence orient='index': each top-level
# key becomes one row of the resulting frame.
df = pd.read_json(path_or_buf=PATH_TO_QUOTES, orient='index')
# Preview the first rows to check that the file was parsed as expected.
df.head(5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,topic
0,2020-02-18-004289,an appetite for power.,,[],2020-02-18 14:44:45,3,"[[None, 0.3665], [Robin Niblett, 0.3339], [Jos...","[https://hypervocal.com/items/3249757, https:/...",E,181
4,2020-01-23-024008,"He got on top of me, and he raped me.",Annabella Sciorra,[Q231395],2020-01-23 00:00:00,75,"[[Annabella Sciorra, 0.5251], [Harvey Weinstei...",[https://www.rawstory.com/2020/01/sopranos-act...,E,733
5,2020-01-31-027972,How many players in the N.B.A. have a huge rol...,Brian Baldinger,[Q4963033],2020-01-31 10:00:17,4,"[[Brian Baldinger, 0.5348], [None, 0.3464], [D...",[http://www.nytimes.com/2020/01/31/sports/foot...,E,266
8,2020-02-01-020887,I wanted to give everybody an opportunity to c...,Frank Vogel,"[Q1444383, Q2456047, Q36935609]",2020-02-01 14:16:47,2,"[[Frank Vogel, 0.9246], [None, 0.0584], [Damia...",[http://www.nytimes.com/2020/02/01/sports/bask...,E,1206
9,2020-01-11-019849,"If there are missiles flying around, the gover...",John Cox,"[Q16193058, Q21547258, Q22000830, Q28953656, Q...",2020-01-11 04:21:04,1,"[[John Cox, 0.7633], [None, 0.2367]]",[http://www.nytimes.com/interactive/2020/01/10...,E,687


# **2. Preprocessing**

## 2.1 Preprocessing of quote topics

In [None]:
# Columns that are not used later in this notebook.
unused_columns = ['phase', 'urls', 'probas']
# In-place removal: re-running this cell on the same frame raises a KeyError
# because the columns are already gone.
df.drop(columns=unused_columns, inplace=True)
df.head(5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences
0,2020-02-18-004289,an appetite for power.,,[],2020-02-18 14:44:45,3
1,2020-01-09-006199,Andrew Yang's Lies About Supporting Medicare f...,Andrew Yang,"[Q11118258, Q28723576]",2020-01-09 01:21:54,2
2,2020-01-22-017789,eager to erase the image of congressional Repu...,Eric Cantor,[Q497271],2020-01-22 21:20:52,2
3,2020-01-31-022641,Given the partisan nature of this impeachment ...,Lisa Murkowski,[Q22360],2020-01-31 00:00:00,24
4,2020-01-23-024008,"He got on top of me, and he raped me.",Annabella Sciorra,[Q231395],2020-01-23 00:00:00,75


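Now we remove the outliers, which are the quotes labeled with topic -1. **Note:** the cell below only drops rows that share a duplicated `quoteID`; it does not filter on `topic`.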

In [None]:
# Keep a single row per quoteID.
# `Series.is_unique` is a plain Python bool, so it must be negated with `not`:
# the bitwise `~` gives ~True == -2 and ~False == -1, which are both truthy and
# made the previous guard fire unconditionally.
if not df['quoteID'].is_unique:
    # The first occurrence of each quoteID is kept (drop_duplicates default).
    df.drop_duplicates(subset=['quoteID'], inplace=True)
# Number of non-null quote ids left after de-duplication.
df['quoteID'].count()

438304

We decided to discard the following `qids` values (the `qids` are the Wikidata ids of the speakers):
- `qids` that are an empty list: this means that Quobert did not succeed in predicting who the speaker is.
- `qids` that are lists with more than one element: this means that several people share the same name in Wikidata, so Quobert could not tell which one was speaking.

Since we want to analyse the speaker features of the quotes, we need exactly one id per quote: these values are therefore replaced by a missing value (None/NaN) so that the corresponding rows can be dropped afterwards.

In [None]:
def _single_qid(qid_list):
    """Return the sole element of a one-element list of qids, else None."""
    if qid_list is None or len(qid_list) != 1:
        # No speaker id at all, or an ambiguous speaker with several ids.
        return None
    return qid_list[0]


# NOTE: this cell is not idempotent. On a second run the values are already
# plain id strings whose length differs from 1, so they would all become None.
df['qids'] = df['qids'].apply(_single_qid)
df.head(5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,topic
0,2020-02-18-004289,an appetite for power.,,,2020-02-18 14:44:45,3,"[[None, 0.3665], [Robin Niblett, 0.3339], [Jos...",181
4,2020-01-23-024008,"He got on top of me, and he raped me.",Annabella Sciorra,Q231395,2020-01-23 00:00:00,75,"[[Annabella Sciorra, 0.5251], [Harvey Weinstei...",733
5,2020-01-31-027972,How many players in the N.B.A. have a huge rol...,Brian Baldinger,Q4963033,2020-01-31 10:00:17,4,"[[Brian Baldinger, 0.5348], [None, 0.3464], [D...",266
8,2020-02-01-020887,I wanted to give everybody an opportunity to c...,Frank Vogel,,2020-02-01 14:16:47,2,"[[Frank Vogel, 0.9246], [None, 0.0584], [Damia...",1206
9,2020-01-11-019849,"If there are missiles flying around, the gover...",John Cox,,2020-01-11 04:21:04,1,"[[John Cox, 0.7633], [None, 0.2367]]",687


In [None]:
# How many quotes are left without a single, unambiguous speaker id.
df['qids'].isna().sum()

215655

In [None]:
# Discard, in place, every row whose speaker id is missing.
df.dropna(axis='index', subset=['qids'], inplace=True)
df.head(5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,topic
4,2020-01-23-024008,"He got on top of me, and he raped me.",Annabella Sciorra,Q231395,2020-01-23 00:00:00,75,733
5,2020-01-31-027972,How many players in the N.B.A. have a huge rol...,Brian Baldinger,Q4963033,2020-01-31 10:00:17,4,266
11,2020-02-21-029926,If we choose a candidate who appeals to a smal...,Michael R. Bloomberg,Q607,2020-02-21 01:07:54,4,10
12,2020-01-27-036296,"If your last name was not Biden, do you think ...",Joe Biden,Q6279,2020-01-27 14:33:49,14,85
13,2020-01-31-052530,"It was not manslaughter, it wasn't a crime,",Walter Van Steenbrugge,Q2104597,2020-01-31 17:57:18,2,214


# 3. Creation of cleaned dataset

In [None]:
# Persist the cleaned quotes as a Parquet file in the cleaned-data folder.
df.to_parquet(f'{PATH_TO_CLEAN}/quote_topics_cleaned.parquet')