In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.parse import *

This notebook processes the 2020 quotations; the other years were processed in the same way, changing only the input path.

In [2]:
# Stream the bz2-compressed, line-delimited JSON dump in chunks of 100,000 rows
# instead of loading it at once; the result is an iterator of DataFrames.
# Forward slashes keep the path portable and avoid the invalid "\Q" escape
# sequence that the backslash-separated literal contained.
# NOTE: the variable is called df_2015 although it reads the 2020 file; the
# name is kept because a later cell iterates over it.
df_2015 = pd.read_json("DATA/Quotebank/quotes-2020.json.bz2", compression="bz2", chunksize=100000, lines=True, encoding='UTF-8')

Here we define a function that extracts the topic (section) from a New York Times URL.

In [3]:
def parse_url(url, interactive_start=24):
    """Extract the host and the topic (section name) from a New York Times URL.

    The topic is the path segment that begins at a fixed character offset,
    which depends on the host and on the first path segment, and ends at the
    next '/' (or at the end of the path).

    Parameters
    ----------
    url : str
        The article URL.
    interactive_start : int, default 24
        Offset at which the topic starts for paths beginning with
        '/interactive'. Use 13 for data from years before 2017.

    Returns
    -------
    tuple of (str, str)
        (host, topic). Paths starting with '/live' or '/blogs' yield the
        topics 'live' and 'blogs'. Hosts other than the four recognised
        nytimes.com hosts yield ('www.nytimes.com', 'unknown'). The topic is
        an empty string when the path is shorter than the offset.
    """
    parsed = urlparse(url)
    host, path = parsed.netloc, parsed.path

    if host not in ('www.nytimes.com', 'nytimes.com', 'mobile.nytimes.com', 'feeds.nytimes.com'):
        return 'www.nytimes.com', 'unknown'

    # These two path families are labelled directly, whatever follows.
    if path.startswith('live', 1):
        return host, 'live'
    if path.startswith('blogs', 1):
        return host, 'blogs'

    # Default offset 12 == len('/YYYY/MM/DD/'); presumably article paths start
    # with such a date prefix. The feeds host uses 24 unless a path prefix
    # below overrides it.
    start = 24 if host == 'feeds.nytimes.com' else 12
    if path.startswith('interactive', 1):
        start = interactive_start
    elif path.startswith('reuters', 1):
        start = 20  # len('/reuters') + 12
    elif path.startswith('aponline', 1):
        start = 21  # len('/aponline') + 12

    # Keep everything from the offset up to (not including) the next '/'.
    return host, path[start:].split('/', 1)[0]

We follow the same steps as in "build_dataset.ipynb", but additionally apply `parse_url` to the first URL of each quotation to extract its topic and store it in a new column.

In [None]:
# Columns kept from every raw chunk.
selected_columns = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'urls']
chunk_list = []
# df_2015 is the chunked reader created above; each iteration yields one DataFrame.
for batch in df_2015:
    batch = batch[selected_columns]

    # Only the first URL of each quotation is inspected. A missing or empty
    # URL list becomes '' so that the row is simply filtered out below
    # instead of raising.
    first_urls = pd.Series(
        [urls[0] if isinstance(urls, list) and urls else '' for urls in batch['urls']],
        index=batch.index,
        dtype=object,
    )

    # Host = text between the first '//' and the following '/'. URLs with no
    # '/' after the host do not match (NaN) and are discarded.
    hosts = first_urls.str.extract(r'//([^/]*)/', expand=False)
    is_nytimes = hosts.str.contains('nytimes', na=False)

    # A chunk without any matching quotation contributes nothing.
    if not is_nytimes.any():
        continue

    parsed = [parse_url(url) for url in first_urls[is_nytimes]]

    # Build a new frame rather than mutating a filtered slice in place, which
    # avoids SettingWithCopyWarning; 'urls' is replaced by the two parsed columns.
    batch = batch.loc[is_nytimes].drop(columns='urls').assign(
        urls_parse=[host for host, _ in parsed],
        topic=[topic for _, topic in parsed],
    )
    chunk_list.append(batch)
df_final = pd.concat(chunk_list)

In [5]:
# Display the filtered quotations with their 'urls_parse' (host) and 'topic' columns.
df_final

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,urls_parse,topic
78,2020-04-10-002022,All the films and my life are thanks to my mot...,Bruce Baillie,2020-04-10 23:47:23,1,nytimes.com,movies
128,2020-01-09-006199,Andrew Yang's Lies About Supporting Medicare f...,Andrew Yang,2020-01-09 01:21:54,2,www.nytimes.com,opinion
374,2020-01-22-017789,eager to erase the image of congressional Repu...,Eric Cantor,2020-01-22 21:20:52,2,mobile.nytimes.com,us
562,2020-01-31-027972,How many players in the N.B.A. have a huge rol...,Brian Baldinger,2020-01-31 10:00:17,4,www.nytimes.com,sports
564,2020-01-01-010404,How terrifying can this human be? How afraid o...,Simon Stone,2020-01-01 15:00:09,2,nytimes.com,theater
...,...,...,...,...,...,...,...
5243920,2020-04-14-060385,"This is very sensitive, very difficult for our...",,2020-04-14 04:05:25,4,www.nytimes.com,world
5244002,2020-02-07-100927,trying to understand my own culture.,,2020-02-07 15:00:10,2,mobile.nytimes.com,arts
5244172,2020-01-19-049886,we immediately initiated an investigation and ...,,2020-01-19 18:08:35,7,mobile.nytimes.com,world
5244252,2020-01-26-053553,"Well, Joe, you're wrong.",Bernie Sanders,2020-01-26 17:01:39,10,www.nytimes.com,live


As in "build_dataset.ipynb", we merge the speaker attributes into the quotes dataset.

In [6]:
# Attribute columns to load; 'label' is the speaker name used as the join key.
selected_columns = ['aliases', 'nationality', 'occupation', 'party', 'label']
attributes = (
    pd.read_csv('DATA/speaker_attributes_parse.csv', usecols=selected_columns)
    # Rows without a label cannot identify a speaker, and pandas matches NaN
    # keys with each other in a merge, so they must not reach the join.
    # This also removes rows that are entirely empty.
    .dropna(subset=['label'])
    # Keep a single attribute row per speaker name (the first occurrence).
    .drop_duplicates(subset=['label'])
    .reset_index(drop=True)
)
# Left join: every quotation is kept, attributes are NaN when the speaker is
# unknown. validate='many_to_one' makes the merge fail loudly if a label were
# duplicated, which would otherwise silently duplicate quotations.
merged = df_final.merge(
    attributes,
    how='left',
    left_on='speaker',
    right_on='label',
    validate='many_to_one',
).drop(columns='label')

In [7]:
# Display the quotations together with the merged speaker attribute columns.
merged

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,urls_parse,topic,aliases,nationality,occupation,party
0,2020-04-10-002022,All the films and my life are thanks to my mot...,Bruce Baillie,2020-04-10 23:47:23,1,nytimes.com,movies,,['United States of America'],['film director'],
1,2020-01-09-006199,Andrew Yang's Lies About Supporting Medicare f...,Andrew Yang,2020-01-09 01:21:54,2,www.nytimes.com,opinion,['Yang Anze'],['United States of America'],"['entrepreneur', 'lawyer', 'political pundit']",['Democratic Party']
2,2020-01-22-017789,eager to erase the image of congressional Repu...,Eric Cantor,2020-01-22 21:20:52,2,mobile.nytimes.com,us,['Eric Ivan Cantor'],['United States of America'],"['politician', 'lawyer', 'businessperson']",['Republican Party']
3,2020-01-31-027972,How many players in the N.B.A. have a huge rol...,Brian Baldinger,2020-01-31 10:00:17,4,www.nytimes.com,sports,,['United States of America'],['American football player'],
4,2020-01-01-010404,How terrifying can this human be? How afraid o...,Simon Stone,2020-01-01 15:00:09,2,nytimes.com,theater,,,,
...,...,...,...,...,...,...,...,...,...,...,...
38024,2020-04-14-060385,"This is very sensitive, very difficult for our...",,2020-04-14 04:05:25,4,www.nytimes.com,world,,,,
38025,2020-02-07-100927,trying to understand my own culture.,,2020-02-07 15:00:10,2,mobile.nytimes.com,arts,,,,
38026,2020-01-19-049886,we immediately initiated an investigation and ...,,2020-01-19 18:08:35,7,mobile.nytimes.com,world,,,,
38027,2020-01-26-053553,"Well, Joe, you're wrong.",Bernie Sanders,2020-01-26 17:01:39,10,www.nytimes.com,live,['Bernard Sanders'],['United States of America'],"['politician', 'journalist', 'carpenter', 'uni...","['Liberty Union Party', 'Democratic Party', 'i..."


In [None]:
# Persist the merged table as a bz2-compressed CSV. The DataFrame index is
# written as the first (unnamed) column, which is the pandas default.
merged.to_csv(
    path_or_buf="DATA/quotes_nytimes_2020_parse.csv.bz2",
    compression='bz2',
)