In [1]:
# Built-in
import json
import bz2
import os
import time
import csv
import requests

# Third parties
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from bs4 import BeautifulSoup

In [2]:
def to_csv(file_name: str, pol_lst: list) -> None:
    """
    Write list to csv
    """

    csv_path = os.path.join("data", "resources", file_name)

    with open(csv_path, "w") as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(["Name", "Party"])

        for member in pol_lst:
            writer.writerow([el for el in member])
        

# Loading data
At first we will simply load the data from the quote bank, display examples, and then we start extracting relevant subsets.

In [3]:
quote_filepath = os.path.join('quotes-2019-nytimes.json.bz2')
politicians_filepath = os.path.join('data','resources','politicians_congress.csv')
speaker_attributes_filepath = os.path.join('speaker_attributes.parquet')
keys = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'phase']

In [4]:
# Load dataframe. It is faster this way than using pd.read_json
with bz2.open(quote_filepath, 'rb') as quote_file:
    df = pd.DataFrame([dict(zip(keys,map(json.loads(instance).get, keys))) for instance in quote_file])

# Load speakers data
df_speaker_attributes = pd.read_parquet(speaker_attributes_filepath)

In [6]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E


In [18]:
len(df_speaker_attributes) # 9 055 981 speakers
df_speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


## Task 2 


### Task 2.1
Get list of US politicians with political affiliation. 

2 sources:
- https://github.com/casmlab/politicians-tweets 
- https://www.congress.gov/members?q={%22congress%22:[%22110%22,%22111%22,%22112%22,%22113%22,%22114%22,%22115%22,%22116%22,117]}

Take 1st list, keep politicians whose affiliation is known. Then merge with congress list to be sure we have a good dataset.
List of politicians is in `data/ressources/politicians.json`.

In [5]:
file_name = "politicians_github.json"
file_path = os.path.join("data", "resources", file_name)

with open(file_path, "r") as f:
    json = json.load(f)

In [6]:
json.keys()

dict_keys(['id', 'id_str', 'screen_name', 'confirmed_account_type', 'state', 'twitter_name', 'real_name', 'bioguide', 'office_holder', 'party', 'district', 'level', 'woman', 'birthday', 'last_updated'])

In [7]:
# Only keep politicians with political affiliation
politicians = []

for i in tqdm(range(1, len(json["id"]))):
    i = str(i)
    affiliation = json["party"][i]
    screen_name = json["screen_name"][i]
    elected = json["office_holder"][i] is not None

    if affiliation is not None and affiliation in ("Republican", "Democratic"):
        politicians.append((json["real_name"][i], affiliation, elected))
    elif screen_name == "realdonaldtrump":
        politicians.append(("Donald Trump", "Republican", True))
    elif screen_name == "barackobama":
        politicians.append(("Barack Obama", "Democratic", True))


100%|██████████| 9979/9979 [00:00<00:00, 476425.80it/s]


In [10]:
# Count how many politicians are "elected" (-> congress members)
sum(pol[-1] for pol in politicians)

1107

All politicians are in Congress!

In [11]:
# Sanity check
print(f"{len(politicians)}") 
politicians[:10]

1107


[('Mark Green', 'Republican', True),
 ('Pete Stauber', 'Republican', True),
 ('Derek Kilmer', 'Democratic', True),
 ('Andy Harris', 'Republican', True),
 ('Donald Payne', 'Democratic', True),
 ('A. Ferguson', 'Republican', True),
 ('Richard Hudson', 'Republican', True),
 ('Edward Markey', 'Democratic', True),
 ('Bobby Rush', 'Democratic', True),
 ('Gregory Meeks', 'Democratic', True)]

In [12]:
# Write to file
to_csv("politicians_github.csv", politicians)

UnicodeEncodeError: 'charmap' codec can't encode character '\u02bb' in position 7: character maps to <undefined>

#### US Congress dataset

In [10]:
URL = 'https://www.congress.gov/members?q={"congress":["110","111","112","113","114","115","116",117]}&pageSize=250'

def sanitize_name(name: str) -> str:
    """
    Strip and clean name.
    "Senator Cruz, Ted" -> "Ted Cruz"
    """

    for element in ("Representative", "Senator"):
        name = name.strip(element)

    name = " ".join(name.split(",")[::-1])
    name = name.strip()
    
    return name

congress_members = []

# Download each congress page
with requests.Session() as s:
    for page_number in tqdm(range(1, 6)):
        r  = s.get(URL, params={"page": page_number})
        soup = BeautifulSoup(r.text, "html.parser")

        members = soup.find_all("li", class_="compact")

        for member in members:
            # Scrape the information
            items = member.find_all("span", class_="result-item")
            name = sanitize_name(member.span.a.text)
            
            for item in items:
                if item.strong.text == "Party:":
                    affiliation = item.span.text

            congress_members.append((name, affiliation))

100%|██████████| 5/5 [00:10<00:00,  2.06s/it]


In [15]:
# Sanity check
len(congress_members) == 1158

True

In [18]:
# Write to file
to_csv("politicians_congress.csv", politicians)

PermissionError: [Errno 13] Permission denied: 'data\\resources\\politicians_congress.csv'

#### Compare lists

Actually might not be useful (and less of a headache) to just take the congress list, since all politicians from the github list are elected (meaning they are or were congress members).

### 3. Quotes extraction

can't use congress members list directly, to extract quotes by author, need them aliases

In [81]:
a = []
for i in range(len(congress_members)) :
    a.append(congress_members[i][0])
a
df2 = df[df["speaker"].isin(a)]
df2
# None because for example, Hillary Clinton is written 'Hillary Rodham Clinton'

### Works like this
#b = ['James Fisher', 'John Updike', 'Hillary Clinton']
#df2 = df[df["speaker"].isin(b)]
#df2

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score


In [82]:
df_speaker_attributes["aliases"]
df_speakers = df_speaker_attributes["aliases"].tolist()
df_speakers

[array(['Washington', 'President Washington', 'G. Washington',
        'Father of the United States', 'The American Fabius'], dtype=object),
 array(['Douglas Noel Adams', 'Douglas Noël Adams', 'Douglas N. Adams'],
       dtype=object),
 array(['Paul Marie Ghislain Otlet', 'Paul Marie Otlet'], dtype=object),
 array(['George Walker Bush', 'Bush Jr.', 'Dubya', 'GWB', 'Bush 43',
        'President George W. Bush', 'George Bush', 'President Bush',
        'Bush', 'Bush, George W.'], dtype=object),
 array(['Velázquez', 'Diego Rodríguez de Silva y Velázquez',
        'Diego Rodriguez de Silva y Velázquez', 'Diego de Silva Velàzquez',
        'Diego De Velázquez y Silva',
        'Diego Rodríguez de Silva y Velasquier', 'Diego Velasques',
        'Diego Rodríguez de Silva y Velasquiz', 'Diego Velásquez',
        'Diego Valesquez', 'Diego Velasquex', 'Diego Velázquez y Silva',
        'Diego de Silva Velázquez', 'Diego de Silva Velazquez',
        'Diego Villasco', 'Diego de Velázquez y Silva',

In [84]:
unqlist=list(df_speaker.unique())

NameError: name 'df_speaker' is not defined

#### Extracting quotes with 'republicans' and 'democrats'

In [76]:
list1 = ['republican', 'republicans', 'democrat', 'democrats']
rep_dem_quotes = df[df["quotation"].str.contains('|'.join(list1))]
rep_dem_quotes

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score
849,2019-11-28-023610,I just hope the radicals in the democratic cam...,Leung Chun-ying,2019-11-28 12:36:55,3,E,,
2145,2019-02-18-038200,"inassimilable to democratic mores, in ways ver...",Charles Taylor,2019-02-18 14:26:00,2,E,,
2221,2019-04-24-054807,The very idea of a Jewish state [ in Palestine...,Joseph Levine,2019-04-24 18:09:02,2,E,,
2536,2019-11-10-046640,The ugly partisanship that is dominating our d...,,2019-11-10 16:19:32,3,E,,
2844,2019-07-10-075859,"the same disillusionment, one where high-minde...",Elliot Ackerman,2019-07-10 22:15:36,1,E,,
...,...,...,...,...,...,...,...,...
203333,2019-02-07-067831,Most Americans are obviously not up on the dis...,Michael Kazin,2019-02-07 01:14:33,1,E,,
203455,2019-01-10-085559,the most democratic province of the republic o...,William Dean,2019-01-10 21:00:03,1,E,,
203532,2019-08-08-039032,If a country's ruler were empowered to choose ...,,2019-08-08 02:36:19,3,E,,
206556,2019-01-28-112481,would have produced a more democratic sharing ...,Lawrence Goodwyn,2019-01-28 17:26:59,3,E,,


In [80]:
congress_members_list = a.tolist()
congress_members_list
list1 = ['republican', 'republicans', 'democrat', 'democrats']
rep_dem_quotes = df[df["quotation"].str.contains('|'.join(list1 + congress_members_list))]
rep_dem_quotes

  return func(self, *args, **kwargs)


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score
115,2019-10-24-130308,"You say, `Listen, I've been working on this --...",Al Pacino,2019-10-24 09:00:16,1,E,,
382,2019-12-27-026804,"It appears that the defendant Netanyahu, who i...",Benny Gantz,2019-12-27 00:00:00,23,E,,
422,2019-12-13-064494,Practicing the King's Economy,,2019-12-13 16:50:17,3,E,,
676,2019-05-17-050756,Israel's most right-wing government in history,,2019-05-17 23:27:20,4,E,,
849,2019-11-28-023610,I just hope the radicals in the democratic cam...,Leung Chun-ying,2019-11-28 12:36:55,3,E,,
...,...,...,...,...,...,...,...,...
206941,2019-03-28-096758,The Washington view of Israel-Palestine is sti...,Ben Rhodes,2019-03-28 09:00:04,5,E,,
207037,2019-09-15-050649,waiting to hear from the Kingdom as to who the...,President Donald Trump,2019-09-15 20:20:06,32,E,,
207142,2019-03-24-049069,This legislation appears designed less to comb...,Jeremy Ben-Ami,2019-03-24 21:13:07,10,E,,
207177,2019-03-14-008239,"As a convinced anti-fascist, I apologize to al...",Antonio Tajani,2019-03-14 15:45:34,1,E,,


### 6. Sentiment Analysis 

#### 1. Transformers library (DistilBERT architecture)

In [12]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')

In [16]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E


In [13]:
print(df["quotation"][1])
classifier(df["quotation"][1])

a champion figure skater switching to roller skates.


[{'label': 'POSITIVE', 'score': 0.9995531439781189}]

In [14]:
df["Transformers_score"] = ""
for i in range(100):
    df["Transformers_score"][i] = classifier(df["quotation"][i])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E,"[{'label': 'POSITIVE', 'score': 0.991260886192..."
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E,"[{'label': 'POSITIVE', 'score': 0.999553143978..."
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E,"[{'label': 'NEGATIVE', 'score': 0.999620079994..."
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E,"[{'label': 'NEGATIVE', 'score': 0.982650339603..."
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E,"[{'label': 'NEGATIVE', 'score': 0.999175965785..."


#### 2. NLTK 

In [15]:
# https://realpython.com/python-nltk-sentiment-analysis/
# pip install nltk
import nltk

# nltk.download()
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rened\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [16]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(df["quotation"][1])

{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'compound': 0.5994}

In [17]:
df["NLTK_score"] = ""
for i in range(100):
    df["NLTK_score"][i] = sia.polarity_scores(df["quotation"][i])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E,"[{'label': 'POSITIVE', 'score': 0.991260886192...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E,"[{'label': 'POSITIVE', 'score': 0.999553143978...","{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'comp..."
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E,"[{'label': 'NEGATIVE', 'score': 0.999620079994...","{'neg': 0.189, 'neu': 0.811, 'pos': 0.0, 'comp..."
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E,"[{'label': 'NEGATIVE', 'score': 0.982650339603...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E,"[{'label': 'NEGATIVE', 'score': 0.999175965785...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


#### 3. SpaCy

In [28]:
# pip install spacy
# pip install pip install spacytextblob==0.1.7
# !python -m spacy download en_core_web_sm

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("spacytextblob")
nlp(df["quotation"][1])

TypeError: add_pipe() got an unexpected keyword argument 'source'

#### Textblob library

It's using a classical bag of words approach so too simple for us.

In [33]:
from textblob import TextBlob
TextBlob(df["quotation"][5]).sentiment

Sentiment(polarity=0.6, subjectivity=1.0)