# Introduction

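This notebook loads the 2019 New York Times quotes from the quote bank together with the speaker attributes table, previews both, and then starts extracting the subsets relevant to the analysis.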

## Loading libraries

In [1]:
# Built-in
import json
import bz2
import os
import time

# Third parties
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading data
First, we load the data from the quote bank and display a few examples; then we start extracting the relevant subsets.

In [4]:
# Input files that sit directly in the working directory.
quote_filepath = 'quotes-2019-nytimes.json.bz2'
speaker_attributes_filepath = 'speaker_attributes.parquet'

# Input file nested under data/resources (joined with the platform separator).
politicians_filepath = os.path.join('data', 'resources', 'politicians_congress.csv')

# Fields kept from each quote record when building the quotes DataFrame.
keys = [
    'quoteID',
    'quotation',
    'speaker',
    'date',
    'numOccurrences',
    'phase',
]

In [3]:
# Build the quotes DataFrame by decoding the bz2-compressed file line by line
# with json.loads and keeping only the fields listed in `keys` (.get returns
# None for a field a record lacks). It is faster this way than using
# pd.read_json.
with bz2.open(quote_filepath, 'rb') as quote_file:
    df = pd.DataFrame([
        {field: quote.get(field) for field in keys}
        for quote in map(json.loads, quote_file)
    ])

In [8]:
# Preview the first rows of the quotes DataFrame to check the loaded columns.
df.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E


In [6]:
# Load the speaker attributes table from the parquet file at
# speaker_attributes_filepath.
df_speaker_attributes = pd.read_parquet(speaker_attributes_filepath)

In [7]:
# Preview the first rows of the speaker attributes table to inspect its columns.
df_speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,
