In [32]:
# Built-in
import json
import bz2
import os
import time
import csv
import requests

# Third parties
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from bs4 import BeautifulSoup
# from tqdm.contrib.concurrent import process_map
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)
# tqdm for pandas
tqdm.pandas()

# Config part
DATA_PATH = "data"
PKL_PATH = os.path.join(DATA_PATH, "pkl")
RESOURCES_PATH = os.path.join(DATA_PATH, "resources")

# for windows pickle
#!pip3 install pickle5
import pickle5 as pickle

def load_df(
    file_name: str, mode: str = "pandas", save: bool = True, chunksize: int = 1_000_000
) -> pd.DataFrame:
    """
    Load a dataset in DataFrame from a .json.bz2 archive.

    file_name: str
        Name of .json.bz2 archive to load from `DATA_PATH`.

    mode: str = "pandas" | "bz2"
        Either use pandas read_json function or homemade bz2 function. This is usually faster (but makes my computer crash for some reason).

    save: bool
        Save the dataframe as a pickle file in `PKL_PATH`.
    """

    file_path = os.path.join(DATA_PATH, file_name)

    if mode == "bz2":
        keys = ["quoteID", "quotation", "speaker", "date", "numOccurrences", "phase"]

        with bz2.open(file_path, "rb") as quote_file:
            df = pd.DataFrame(
                [
                    dict(zip(keys, map(json.loads(instance).get, keys)))
                    for instance in tqdm(quote_file)
                ]
            )
    else:
        if not save:
            print("Please enable save option.")
            return

        with pd.read_json(file_path, lines=True, chunksize=chunksize) as df_reader:
            for i, chunk in enumerate(df_reader):
                file_name = file_name.strip(".json.bz2")
                pkl_path = os.path.join(PKL_PATH, f"{file_name}-{i:03d}.pkl")
                chunk.to_pickle(pkl_path)

                if i == 1:
                    break
        return

    file_name = file_name.strip(".json.bz2")
    pkl_path = os.path.join(PKL_PATH, f"{file_name}.pkl")

    if save and not os.path.exists(pkl_path):
        df.to_pickle(os.path.join(PKL_PATH, pkl_path))

    return df

def to_csv(file_name: str, pol_lst: list) -> None:
    """
    Write list to csv
    """

    csv_path = os.path.join("data", "resources", file_name)

    with open(csv_path, "w", encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=" ")
        writer.writerow(["Name", "Party"])

        for member in pol_lst:
            writer.writerow([el for el in member])

def get_pkl_year(year: int) -> list:
    """
    Returns a list of the pkl files present in `data/pkl/{year}`.
    """

    dirs = os.listdir(os.path.join(PKL_PATH, str(year)))

    return [os.path.join(str(year), dir) for dir in dirs]

# Loading data
At first we will simply load the data from the quote bank, display examples, and then we start extracting relevant subsets.

In [33]:
quote_filepath = os.path.join('data/quotes-2019-nytimes.json.bz2')
politicians_filepath = os.path.join('data','resources','politicians_congress.csv')
speaker_attributes_filepath = os.path.join('data/speaker_attributes.parquet')
keys = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'phase']

In [34]:
# Load dataframe. It is faster this way than using pd.read_json
with bz2.open(quote_filepath, 'rb') as quote_file:
    df = pd.DataFrame([dict(zip(keys,map(json.loads(instance).get, keys))) for instance in quote_file])

# Load speakers data
df_speaker_attributes = pd.read_parquet(speaker_attributes_filepath)

In [35]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E


In [36]:
len(df_speaker_attributes) # 9 055 981 speakers
df_speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


## Task 2 


### Task 2.1
Get list of US politicians with political affiliation. 

2 sources:
- https://github.com/casmlab/politicians-tweets 
- https://www.congress.gov/members?q={%22congress%22:[%22110%22,%22111%22,%22112%22,%22113%22,%22114%22,%22115%22,%22116%22,117]}

Take 1st list, keep politicians whose affiliation is known. Then merge with congress list to be sure we have a good dataset.
List of politicians is in `data/ressources/politicians.json`.

In [37]:
file_name = "politicians_github.json"
file_path = os.path.join("data", "resources", file_name)

with open(file_path, "r") as f:
    json = json.load(f)

In [38]:
json.keys()

dict_keys(['id', 'id_str', 'screen_name', 'confirmed_account_type', 'state', 'twitter_name', 'real_name', 'bioguide', 'office_holder', 'party', 'district', 'level', 'woman', 'birthday', 'last_updated'])

In [39]:
# Only keep politicians with political affiliation
politicians = []

for i in tqdm(range(1, len(json["id"]))):
    i = str(i)
    affiliation = json["party"][i]
    screen_name = json["screen_name"][i]
    elected = json["office_holder"][i] is not None

    if affiliation is not None and affiliation in ("Republican", "Democratic"):
        politicians.append((json["real_name"][i], affiliation, elected))
    elif screen_name == "realdonaldtrump":
        politicians.append(("Donald Trump", "Republican", True))
    elif screen_name == "barackobama":
        politicians.append(("Barack Obama", "Democratic", True))


100%|██████████| 9979/9979 [00:00<00:00, 667064.46it/s]


In [40]:
# Count how many politicians are "elected" (-> congress members)
sum(pol[-1] for pol in politicians)

1107

All politicians are in Congress!

In [41]:
# Sanity check
print(f"{len(politicians)}") 
politicians[:10]

1107


[('Mark Green', 'Republican', True),
 ('Pete Stauber', 'Republican', True),
 ('Derek Kilmer', 'Democratic', True),
 ('Andy Harris', 'Republican', True),
 ('Donald Payne', 'Democratic', True),
 ('A. Ferguson', 'Republican', True),
 ('Richard Hudson', 'Republican', True),
 ('Edward Markey', 'Democratic', True),
 ('Bobby Rush', 'Democratic', True),
 ('Gregory Meeks', 'Democratic', True)]

In [42]:
# Write to file
to_csv("politicians_github.csv", politicians)

#### US Congress dataset

In [43]:
URL = 'https://www.congress.gov/members?q={"congress":["110","111","112","113","114","115","116",117]}&pageSize=250'

def sanitize_name(name: str) -> str:
    """
    Strip and clean name.
    "Senator Cruz, Ted" -> "Ted Cruz"
    """

    for element in ("Representative", "Senator"):
        name = name.strip(element)

    name = " ".join(name.split(",")[::-1])
    name = name.strip()
    
    return name

congress_members = []

# Download each congress page
with requests.Session() as s:
    for page_number in tqdm(range(1, 6)):
        r  = s.get(URL, params={"page": page_number})
        soup = BeautifulSoup(r.text, "html.parser")

        members = soup.find_all("li", class_="compact")

        for member in members:
            # Scrape the information
            items = member.find_all("span", class_="result-item")
            name = sanitize_name(member.span.a.text)
            
            for item in items:
                if item.strong.text == "Party:":
                    affiliation = item.span.text

            congress_members.append((name, affiliation))

100%|██████████| 5/5 [00:18<00:00,  3.62s/it]


In [44]:
# Sanity check
len(congress_members) == 1158

False

In [45]:
# Write to file
to_csv("politicians_congress.csv", politicians)

#### Compare lists

Actually might not be useful (and less of a headache) to just take the congress list, since all politicians from the github list are elected (meaning they are or were congress members).

### Task 2.2 - Extract quotes from politicians

I downloaded another list, from the congress [website](https://bioguide.congress.gov/search?index=%22bioguideprofiles%22&size=12&matches=[]&filters={%22jobPositions.congressAffiliation.partyAffiliation.party.name%22:[%22Democrat%22,%22Republican%22],%22jobPositions.congressAffiliation.congress.name%22:[%22The%20110th%20United%20States%20Congress%22,%22The%20111th%20United%20States%20Congress%22,%22The%20112th%20United%20States%20Congress%22,%22The%20113th%20United%20States%20Congress%22,%22The%20114th%20United%20States%20Congress%22,%22The%20115th%20United%20States%20Congress%22,%22The%20116th%20United%20States%20Congress%22,%22The%20117th%20United%20States%20Congress%22]}&sort=[{%22_score%22:true},{%22field%22:%22familyName%22,%22order%22:%22asc%22},{%22field%22:%22middleName%22,%22order%22:%22asc%22},{%22field%22:%22givenName%22,%22order%22:%22asc%22}])


Those are the politicians from 2007 to 2009. The json is called `data/resources/congress_biolist.json`. What is nice is that we have the "congress bio ID" of each congress member, which is also present in the `speaker_attributes.parquet` file (field `US_congress_bio_ID`). We will use that to extract the tweets from the politicians.

Testing with quotes from 2008.

In [46]:
politicians_filepath = os.path.join(RESOURCES_PATH, "new_congress_biolist.json")
# quotes_filepath = os.path.join(DATA_PATH, "quotes-2008.json.bz2")
# keys = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'phase']

politicians_df = pd.read_json(politicians_filepath)
# politicians_df = politicians_df.drop("congresses", axis=1)
# quotes_df = pd.read_pickle(os.path.join(PKL_PATH, "quotes-2008.pkl"))

Problem: Donald Trump is not in the dataset (because he was only President, not senator or representative and thus, not a congress member). I will manually add him now.

In [47]:
quotes_2020_00 = pd.read_pickle(os.path.join(PKL_PATH, "2020", "quotes-2020-000.pkl"))

FileNotFoundError: [Errno 2] No such file or directory: 'data\\pkl\\2020\\quotes-2020-000.pkl'

In [None]:
quotes_2020_00[quotes_2020_00["speaker"].str.contains("Trump")]["speaker"].unique()

In [None]:
# Manually add Donald Trump
# Not so elegant trick to capture variations of the name
# Should refactore to another solution (alias field?) when we have time

donald_json1 = {
    "id": np.nan,
    "givenName": "Donald",
    "familyName": "Trump",
    "unaccentedGivenName": "Donald",
    "unaccentedFamilyName": "Trump",
    "birthYear": 1946,
    "deathYear": np.nan,
    "congresses": [
        {
            "position": "President",
            "congressNumber": np.nan,
            "stateName": np.nan,
            "parties": ["Republican"],
        }
    ],
    "middleName": "John",
    "unaccentedMiddleName": "John",
    "nickName": np.nan,
    "honorificPrefix": np.nan,
    "honorificSuffix": np.nan,
}

donald_json2 = {
    "id": np.nan,
    "givenName": "President",
    "familyName": "Trump",
    "unaccentedGivenName": "President",
    "unaccentedFamilyName": "Trump",
    "birthYear": 1946,
    "deathYear": np.nan,
    "congresses": [
        {
            "position": "President",
            "congressNumber": np.nan,
            "stateName": np.nan,
            "parties": ["Republican"],
        }
    ],
    "middleName": "John",
    "unaccentedMiddleName": "John",
    "nickName": np.nan,
    "honorificPrefix": np.nan,
    "honorificSuffix": np.nan,
}

donald_json3 = {
    "id": np.nan,
    "givenName": "President Donald",
    "familyName": "Trump",
    "unaccentedGivenName": "President Donald",
    "unaccentedFamilyName": "Trump",
    "birthYear": 1946,
    "deathYear": np.nan,
    "congresses": [
        {
            "position": "President",
            "congressNumber": np.nan,
            "stateName": np.nan,
            "parties": ["Republican"],
        }
    ],
    "middleName": "John",
    "unaccentedMiddleName": "John",
    "nickName": np.nan,
    "honorificPrefix": np.nan,
    "honorificSuffix": np.nan,
}

politicians_df = politicians_df.append(
    pd.DataFrame([donald_json1, donald_json2, donald_json3]), ignore_index=True
)


In [None]:
politicians_df.tail(3)

In [None]:
# Export new df to json
politicians_df.to_json(os.path.join(RESOURCES_PATH, "new_congress_biolist.json"))

In [None]:
politicians_df["fullName"] = politicians_df["givenName"] + " " + politicians_df["familyName"]
politicians_df["fullName"] = politicians_df["fullName"].str.lower()

congress_members = politicians_df["fullName"].tolist()  # redefined here for clarity

def extract_subset(orig_df: pd.DataFrame, multiproc=False) -> pd.DataFrame:
    """
    This function extracts the quotes of speakers that are in the congress list.

    It returns the number of extracted quotes and the extracted dataframe.
    """

    if multiproc:
        orig_df["subset"] = orig_df["speaker"].parallel_apply(
            lambda x: pd.Series(x.lower()).str.contains("|".join(congress_members))
        )
    else:
        orig_df["subset"] = orig_df["speaker"].progress_apply(
            lambda x: pd.Series(x.lower()).str.contains("|".join(congress_members))
        )

    return orig_df["subset"].sum(), orig_df[orig_df["subset"] == True]
    

In [None]:
# 10% subset
quotes_2020_00 = quotes_2020_00.sample(int(0.1*len(quotes_2020_00)))

In [None]:
# Testing multiprocessing for extraction
subset_count, subset_2020 = extract_subset(quotes_2020_00)
subset_count

In [None]:
subset_2020["speaker"].value_counts()

In [None]:
# Goal is to do a pipeline to automatically extract quotes for a quotes dataset
# The datasets were already loaded from the json.bz2 format and converted to .pkl in `data/pkl`

# Get the names
quotes_datasets = [os.path.join("data", "pkl", f"quotes-20{i:02d}.pkl") for i in range(8, 21)]  

# For each dataset, extract the quotes from congress members
# and save the extracted quotes as pkl for easier handling
for i, dataset in enumerate(quotes_datasets, start=1):
    print(f"{i}/{len(quotes_datasets)} {dataset}:")
    try:
        complete_df = pd.read_pickle(dataset)  # Load dataset
    except FileNotFoundError:
        print(f"{dataset} not found, loading from .json.bz2")
        complete_df = load_df(dataset)

    _, subset_df = extract_subset(complete_df)
    subset_df.to_pickle(os.path.join("data", "pkl", f"extracted_{dataset}"))

In [None]:
year = 2008
files = get_pkl_year(year)

# Extract the quotes of interest of each chunk
all_extracted = []
for file in files:
    df = pd.read_pickle(os.path.join(PKL_PATH, file))
    _, subset_df = extract_subset(df)
    all_extracted.append(subset_df)

# Merge them into a new df
df_extracted = pd.concat(all_extracted)

# Save the df as pkl
pkl_name = f"extracted-quotes-{year}.pkl"
df_extracted.to_pickle(os.path.join(PKL_PATH, pkl_name))

In [None]:
df2_extracted = pd.read_pickle(os.path.join(PKL_PATH, "extracted-quotes-2016.pkl"))
df2 = pd.read_pickle(os.path.join(PKL_PATH, "2016", "quotes-2016-26.pkl"))
df2 = df2.sample(100_000)

In [None]:
length, df2_extracted2 = extract_subset(df2, multiproc=True)

In [None]:
length

## Load datasets

In [24]:
# Mac/Linux
# Load the main dataset
# df = pd.read_pickle("data/pkl/extracted-quotes-2018.pkl")

# Windows
with open("data/pkl/extracted-quotes-2018.pkl", "rb") as fh:
   data = pickle.load(fh)



In [25]:
data

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,subset
16000014,2018-05-11-081190,Mood remained upbeat despite rising crude oil ...,President Donald Trump,[Q22686],2018-05-11 00:00:00,4,"[[President Donald Trump, 0.5749], [None, 0.29...",[http://www.canindia.com/global-cues-better-q4...,E,True
16000115,2018-10-16-075122,My favourite image changes quiet regularly but...,David Price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2018-10-16 20:21:26,1,"[[David Price, 0.9163], [None, 0.0624], [Thier...",[http://dailycannon.com/2018/10/exclusive-davi...,E,True
16000162,2018-05-25-071034,My philosophy is very simple: when you see som...,John Lewis,"[Q14945660, Q16227288, Q18922034, Q19565272, Q...",2018-05-25 10:24:00,1,"[[John Lewis, 0.6994], [None, 0.3006]]",[https://www.inc.com/emily-canal/john-lewis-co...,E,True
16000292,2018-09-14-074686,"no program, no speeches, no politics.",Lois Capps,[Q459693],2018-09-14 07:02:33,1,"[[Lois Capps, 0.8035], [None, 0.1965]]",[http://www.independent.com/news/2018/sep/14/b...,E,True
16000312,2018-09-05-072584,nor would it be contemplated.,President Donald Trump,[Q22686],2018-09-05 20:08:11,3,"[[President Donald Trump, 0.8022], [None, 0.17...",[http://europe.newsweek.com/trump-says-did-not...,E,True
...,...,...,...,...,...,...,...,...,...,...
23999680,2018-05-29-017492,Donald Trump Jr. retweeted Roseanne Barr's rac...,Donald Trump,"[Q22686, Q27947481]",2018-05-29 23:15:20,1,"[[Donald Trump, 0.62], [None, 0.279], [Roseann...",[https://www.washingtonexaminer.com/news/donal...,E,True
23999706,2018-12-14-017624,down to five finalists.,President Trump,[Q22686],2018-12-14 20:30:10,1,"[[President Trump, 0.5204], [None, 0.4372], [M...",[https://www.theatlantic.com/politics/archive/...,E,True
23999786,2018-03-30-019552,endorsed the crown prince's high profile antic...,President Trump,[Q22686],2018-03-30 01:28:31,4,"[[President Trump, 0.6701], [None, 0.3166], [D...",[http://mobile.nytimes.com/2018/03/29/business...,E,True
23999851,2018-10-25-024068,Eventually we'll get past it; we'll elect diff...,Collin Peterson,[Q434458],2018-10-25 22:50:19,1,"[[Collin Peterson, 0.7925], [None, 0.2075]]",[http://dl-online.com/news/4519529-dl-students...,E,True


In [26]:
# Load the dataset with additional info abbout politicians 
df_politicians = pd.io.json.read_json("data/resources/new_congress_biolist.json")

## Work on df_politicians table

In [28]:
# Extract from congresses column relevant infor (position, state, parties)
def extract_congress_information(row):
    information = pd.json_normalize(row["congresses"]).sort_values("congressNumber").tail(1).loc[:, ["position","stateName","parties"]]

    row["position"] = information.loc[:, "position"].values[0]
    row["stateName"] = information.loc[:, "stateName"].values[0]
    row["parties"] = information.loc[:, "parties"].values[0]
    return row

df_politicians = df_politicians.apply(extract_congress_information, axis=1)

In [29]:
# The value in column parties is a list, we want to select the last past party from the list
def getLastValue(aList):
    return aList[-1]

df_politicians["parties"] = df_politicians["parties"].apply(getLastValue)

In [48]:
# Create new column speaker (full name of the politician)
df_politicians['speaker'] = df_politicians['givenName'] + " " + df_politicians['familyName']

# Have the speaker's full names in the same size 
df_politicians["speaker"] = df_politicians["speaker"].str.lower()

# In lower case also in the quotes dataset
df["speaker"] = df["speaker"].str.lower()

AttributeError: module 'numpy' has no attribute 'matrix'

In [31]:
# Check for full name duplicates
# I propose to delete those as they may be in different parties and we will not which one is talking
df_politicians['speaker'].value_counts()

KeyError: 'speaker'

In [None]:
# We drop duplicates by speaker's full name
df_politicians = df_politicians.drop_duplicates(subset=['speaker'])

## Merge the quotes with info about speakers

In [None]:
# Merge quotes to speaker's info
data = pd.merge(df, df_politicians, on='speaker', how='outer') # we merged the dataset

In [None]:
# Subsets by parties 
subset_democrats = data[data['parties'] == "Democrat"]
subset_republicans = data[data['parties'] == "Republican"]

## Naive model: select quotes that talk about the opponent party

In [None]:
# Get the list of Republicans and Democrats
names_democrats = df_politicians[df_politicians['parties'] == 'Democrat']["speaker"].tolist()
names_republicans = df_politicians[df_politicians['parties'] == 'Republican']["speaker"].tolist()

In [None]:
# Have all quotes in lowercase
data["quotation"] = data["quotation"].str.lower()
subset_democrats["quotation"] = subset_democrats["quotation"].str.lower()
subset_republicans["quotation"] = subset_republicans["quotation"].str.lower()

In [None]:
# Drop rows with those who don't have any quotes
data = data.dropna(subset=['quotation'])
subset_democrats = subset_democrats.dropna(subset=['quotation'])
subset_republicans = subset_republicans.dropna(subset=['quotation'])

In [None]:
# Create full lists 
list_rep = names_republicans + ['republican', 'republicans']
pattern_list_rep = '|'.join(list_rep)

list_dem = names_democrats + ['democrat', 'democrats']
pattern_list_dem = '|'.join(list_dem)

In [None]:
# Subset of quotes said by democrats about republicans
demo_quotes_abt_rep = subset_democrats[subset_democrats['quotation'].str.contains(pattern_list_rep)]

In [None]:
# Subset of quotes said by democrats about democrats
rep_quotes_abt_demo = subset_republicans[subset_republicans['quotation'].str.contains(pattern_list_dem)]

## Sentiment analysis of the quotes

In [None]:
import nltk
# nltk.download()
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [None]:
# Function that get the sentiment of each quote
def get_sentiment(row):   
    row['NLTK score'] = sia.polarity_scores(row['quotation'])
    return row

In [None]:
# Apply function on the subsets - for republicans and democrats
demo_quotes_abt_rep = demo_quotes_abt_rep.apply(get_sentiment, axis=1)
rep_quotes_abt_demo = rep_quotes_abt_demo.apply(get_sentiment, axis=1)

In [None]:
# Split in columns to get values 
# Democrats about republicans 
demo_quotes_abt_rep = pd.concat([demo_quotes_abt_rep, demo_quotes_abt_rep['NLTK score'].apply(pd.Series)], axis=1)

# Republicans about democrats
rep_quotes_abt_demo = pd.concat([rep_quotes_abt_demo, rep_quotes_abt_demo['NLTK score'].apply(pd.Series)], axis=1)


## EDA:  We have 2 datasets that we want to analyze

The problem is that we have too quotes classified as neutral. 
We should read about SentimentIntensityAnalyzer as we may need to preprocess before, or use a different sentiment predictor.
Another thing: may be we should appply the analyzer before even lower casing the sentences (because maybe some excitement/anger can be in upper case)
Something to think about.
https://www.analyticsvidhya.com/blog/2021/01/sentiment-analysis-vader-or-textblob/

#### Republicans about democrats 

In [None]:
rep_quotes_abt_demo.head(1)

In [None]:
plt.hist(rep_quotes_abt_demo['neg'], alpha=0.5);
plt.hist(rep_quotes_abt_demo['pos'], alpha=0.5);
plt.hist(rep_quotes_abt_demo['neu'], alpha=0.5);
labels = ['neg','pos', 'neu'];
plt.title('Distribution of sentiment');
plt.legend(labels);

### 3. Quotes extraction

can't use congress members list directly, to extract quotes by author, need them aliases

In [81]:
a = []
for i in range(len(congress_members)) :
    a.append(congress_members[i][0])
a
df2 = df[df["speaker"].isin(a)]
df2
# None because for example, Hillary Clinton is written 'Hillary Rodham Clinton'

### Works like this
#b = ['James Fisher', 'John Updike', 'Hillary Clinton']
#df2 = df[df["speaker"].isin(b)]
#df2

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score


In [82]:
df_speaker_attributes["aliases"]
df_speakers = df_speaker_attributes["aliases"].tolist()
df_speakers

[array(['Washington', 'President Washington', 'G. Washington',
        'Father of the United States', 'The American Fabius'], dtype=object),
 array(['Douglas Noel Adams', 'Douglas Noël Adams', 'Douglas N. Adams'],
       dtype=object),
 array(['Paul Marie Ghislain Otlet', 'Paul Marie Otlet'], dtype=object),
 array(['George Walker Bush', 'Bush Jr.', 'Dubya', 'GWB', 'Bush 43',
        'President George W. Bush', 'George Bush', 'President Bush',
        'Bush', 'Bush, George W.'], dtype=object),
 array(['Velázquez', 'Diego Rodríguez de Silva y Velázquez',
        'Diego Rodriguez de Silva y Velázquez', 'Diego de Silva Velàzquez',
        'Diego De Velázquez y Silva',
        'Diego Rodríguez de Silva y Velasquier', 'Diego Velasques',
        'Diego Rodríguez de Silva y Velasquiz', 'Diego Velásquez',
        'Diego Valesquez', 'Diego Velasquex', 'Diego Velázquez y Silva',
        'Diego de Silva Velázquez', 'Diego de Silva Velazquez',
        'Diego Villasco', 'Diego de Velázquez y Silva',

#### Extracting quotes with 'republicans' and 'democrats'

In [76]:
list1 = ['republican', 'republicans', 'democrat', 'democrats']
rep_dem_quotes = df[df["quotation"].str.contains('|'.join(list1))]
rep_dem_quotes

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score
849,2019-11-28-023610,I just hope the radicals in the democratic cam...,Leung Chun-ying,2019-11-28 12:36:55,3,E,,
2145,2019-02-18-038200,"inassimilable to democratic mores, in ways ver...",Charles Taylor,2019-02-18 14:26:00,2,E,,
2221,2019-04-24-054807,The very idea of a Jewish state [ in Palestine...,Joseph Levine,2019-04-24 18:09:02,2,E,,
2536,2019-11-10-046640,The ugly partisanship that is dominating our d...,,2019-11-10 16:19:32,3,E,,
2844,2019-07-10-075859,"the same disillusionment, one where high-minde...",Elliot Ackerman,2019-07-10 22:15:36,1,E,,
...,...,...,...,...,...,...,...,...
203333,2019-02-07-067831,Most Americans are obviously not up on the dis...,Michael Kazin,2019-02-07 01:14:33,1,E,,
203455,2019-01-10-085559,the most democratic province of the republic o...,William Dean,2019-01-10 21:00:03,1,E,,
203532,2019-08-08-039032,If a country's ruler were empowered to choose ...,,2019-08-08 02:36:19,3,E,,
206556,2019-01-28-112481,would have produced a more democratic sharing ...,Lawrence Goodwyn,2019-01-28 17:26:59,3,E,,


In [80]:
congress_members_list = a.tolist()
congress_members_list
list1 = ['republican', 'republicans', 'democrat', 'democrats']
rep_dem_quotes = df[df["quotation"].str.contains('|'.join(list1 + congress_members_list))]
rep_dem_quotes

  return func(self, *args, **kwargs)


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score
115,2019-10-24-130308,"You say, `Listen, I've been working on this --...",Al Pacino,2019-10-24 09:00:16,1,E,,
382,2019-12-27-026804,"It appears that the defendant Netanyahu, who i...",Benny Gantz,2019-12-27 00:00:00,23,E,,
422,2019-12-13-064494,Practicing the King's Economy,,2019-12-13 16:50:17,3,E,,
676,2019-05-17-050756,Israel's most right-wing government in history,,2019-05-17 23:27:20,4,E,,
849,2019-11-28-023610,I just hope the radicals in the democratic cam...,Leung Chun-ying,2019-11-28 12:36:55,3,E,,
...,...,...,...,...,...,...,...,...
206941,2019-03-28-096758,The Washington view of Israel-Palestine is sti...,Ben Rhodes,2019-03-28 09:00:04,5,E,,
207037,2019-09-15-050649,waiting to hear from the Kingdom as to who the...,President Donald Trump,2019-09-15 20:20:06,32,E,,
207142,2019-03-24-049069,This legislation appears designed less to comb...,Jeremy Ben-Ami,2019-03-24 21:13:07,10,E,,
207177,2019-03-14-008239,"As a convinced anti-fascist, I apologize to al...",Antonio Tajani,2019-03-14 15:45:34,1,E,,


### 6. Sentiment Analysis 

#### 1. Transformers library (DistilBERT architecture)

In [12]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')

In [16]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E


In [13]:
print(df["quotation"][1])
classifier(df["quotation"][1])

a champion figure skater switching to roller skates.


[{'label': 'POSITIVE', 'score': 0.9995531439781189}]

In [14]:
df["Transformers_score"] = ""
for i in range(100):
    df["Transformers_score"][i] = classifier(df["quotation"][i])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E,"[{'label': 'POSITIVE', 'score': 0.991260886192..."
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E,"[{'label': 'POSITIVE', 'score': 0.999553143978..."
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E,"[{'label': 'NEGATIVE', 'score': 0.999620079994..."
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E,"[{'label': 'NEGATIVE', 'score': 0.982650339603..."
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E,"[{'label': 'NEGATIVE', 'score': 0.999175965785..."


#### 2. NLTK 

In [15]:
# https://realpython.com/python-nltk-sentiment-analysis/
# pip install nltk
import nltk

# nltk.download()
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rened\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [16]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(df["quotation"][1])

{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'compound': 0.5994}

In [17]:
df["NLTK_score"] = ""
for i in range(100):
    df["NLTK_score"][i] = sia.polarity_scores(df["quotation"][i])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,phase,Transformers_score,NLTK_score
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,2019-04-17 13:31:18,1,E,"[{'label': 'POSITIVE', 'score': 0.991260886192...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,2,E,"[{'label': 'POSITIVE', 'score': 0.999553143978...","{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'comp..."
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,2019-05-09 18:11:29,1,E,"[{'label': 'NEGATIVE', 'score': 0.999620079994...","{'neg': 0.189, 'neu': 0.811, 'pos': 0.0, 'comp..."
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,3,E,"[{'label': 'NEGATIVE', 'score': 0.982650339603...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,2019-01-04-001792,A Pile of Leaves.,,2019-01-04 10:00:07,1,E,"[{'label': 'NEGATIVE', 'score': 0.999175965785...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


#### 3. SpaCy

In [28]:
# pip install spacy
# pip install pip install spacytextblob==0.1.7
# !python -m spacy download en_core_web_sm

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("spacytextblob")
nlp(df["quotation"][1])

TypeError: add_pipe() got an unexpected keyword argument 'source'

#### Textblob library

It's using a classical bag of words approach so too simple for us.

In [33]:
from textblob import TextBlob
TextBlob(df["quotation"][5]).sentiment

Sentiment(polarity=0.6, subjectivity=1.0)