## Load packages

In [1]:
# Load packages
import re

import nltk
import numpy as np
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

## Load datasets

In [2]:
# Load the main dataset (it provides the `speaker` column that is normalised
# further down in this notebook).
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
df = pd.read_pickle("data/pkl/extracted-quotes-2018.pkl")

In [518]:
# Load the dataset with additional info about politicians.
# `pd.read_json` is the public, documented entry point; `pd.io.json.read_json`
# reaches the same reader through an internal module path.
df_politicians = pd.read_json("data/resources/congress_biolist.json")

## Work on df_politicians table

In [673]:
# Extract the relevant info (position, state, parties) from the congresses column
def extract_congress_information(row):
    """Attach the details of a politician's most recent congress term to the row.

    Parameters
    ----------
    row : pandas.Series or dict
        One politician record. ``row["congresses"]`` is expected to be a list of
        dicts, each carrying ``congressNumber`` and (optionally) ``position``,
        ``stateName`` and ``parties``.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` with ``position``, ``stateName`` and ``parties`` taken
        from the term with the highest ``congressNumber``. The three fields are
        ``None`` when there is no term to read from, or when the selected term
        lacks the field.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # discouraged by pandas and can produce unexpected results.
    row = row.copy()

    terms = row["congresses"]
    if isinstance(terms, (list, tuple)) and len(terms) > 0:
        # sorted() is stable, so when several terms share the highest number the
        # one listed last wins. Terms without a number sort first.
        latest = sorted(
            terms, key=lambda term: term.get("congressNumber", float("-inf"))
        )[-1]
    else:
        # Missing or empty term list: nothing to extract.
        latest = {}

    for field in ("position", "stateName", "parties"):
        row[field] = latest.get(field)
    return row

# Row-wise pass: each politician record gets its position / stateName / parties columns.
df_politicians = df_politicians.apply(extract_congress_information, axis="columns")

In [674]:
# The value in column parties is a list; we keep its last entry (the latest party)
def getLastValue(aList):
    """Return the last element of ``aList``.

    Parameters
    ----------
    aList : list, tuple or any other value
        Usually the list of parties of a politician.

    Returns
    -------
    The last element for a non-empty list/tuple, ``None`` for an empty one.
    Any other value (e.g. an already-extracted party string or a missing
    value) is returned unchanged, so applying the function a second time to
    the same column is a no-op instead of slicing the last character off the
    party name.
    """
    if isinstance(aList, (list, tuple)):
        return aList[-1] if len(aList) > 0 else None
    return aList

# Collapse each politician's party list to a single value, element by element.
df_politicians["parties"] = df_politicians["parties"].map(getLastValue)

In [681]:
# Build the politician's full name ("<given> <family>") and store it lowercased
# in a new `speaker` column, so names compare regardless of capitalisation.
full_names = df_politicians["givenName"] + " " + df_politicians["familyName"]
df_politicians["speaker"] = full_names.str.lower()

# Apply the same lowercasing to the speakers of the quotes dataset.
df["speaker"] = df["speaker"].str.lower()

In [682]:
# Check for full-name duplicates.
# Proposal: delete those, as the namesakes may belong to different parties and
# we would not know which one is talking.
df_politicians['speaker'].value_counts()

neil abercrombie     1
mick mulvaney        1
stephanie murphy     1
scott murphy         1
patrick murphy       1
                    ..
ruben gallego        1
pete gallego         1
elton gallegly       1
michael gallagher    1
ryan zinke           1
Name: speaker, Length: 1151, dtype: int64

In [683]:
# Drop every politician whose full name is shared with another one.
# keep=False removes ALL namesakes: with the default keep='first', one of them
# would survive and receive all the quotes, although we cannot tell which
# namesake is actually talking.
df_politicians = df_politicians.drop_duplicates(subset=['speaker'], keep=False)

## Merge the quotes with info about speakers

In [684]:
# Attach the politician info to each quote through the shared `speaker` key.
# The outer join keeps unmatched rows from both tables.
data = df.merge(df_politicians, how='outer', on='speaker')

In [687]:
# Subsets by party.
# `.copy()` makes each subset an independent frame, so later modifications never
# act on a filtered slice of `data` (which raises SettingWithCopyWarning).
subset_democrats = data[data['parties'] == "Democrat"].copy()
subset_republicans = data[data['parties'] == "Republican"].copy()

## Naive model: select quotes that talk about the opponent party

In [688]:
# Lowercased full names of the politicians of each party, as plain Python lists.
names_democrats = df_politicians.loc[df_politicians["parties"] == "Democrat", "speaker"].tolist()
names_republicans = df_politicians.loc[df_politicians["parties"] == "Republican", "speaker"].tolist()

In [705]:
# Lowercase every quotation so it can be matched against the lowercase names
# and party keywords.
# `assign` returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning and keeps the cell idempotent.
data = data.assign(quotation=data["quotation"].str.lower())
subset_democrats = subset_democrats.assign(quotation=subset_democrats["quotation"].str.lower())
subset_republicans = subset_republicans.assign(quotation=subset_republicans["quotation"].str.lower())

In [706]:
# Discard the rows that carry no quotation, in all three frames at once.
data, subset_democrats, subset_republicans = (
    frame.dropna(subset=['quotation'])
    for frame in (data, subset_democrats, subset_republicans)
)

In [707]:
# Build, for each party, the list of search terms (politician names + party
# keywords) and the regex alternation that matches any of them.
# - Terms are passed through re.escape: `str.contains` treats its pattern as a
#   regular expression, so characters such as "." in a name must be made literal.
# - Non-string or blank names are skipped: they cannot be joined, and an empty
#   alternative would match every quote.
list_rep = [name for name in names_republicans if isinstance(name, str) and name.strip()]
list_rep = list_rep + ['republican', 'republicans']
pattern_list_rep = '|'.join(re.escape(term) for term in list_rep)

list_dem = [name for name in names_democrats if isinstance(name, str) and name.strip()]
list_dem = list_dem + ['democrat', 'democrats']
pattern_list_dem = '|'.join(re.escape(term) for term in list_dem)

In [709]:
# Quotes said by Democrats that mention a Republican name or keyword
demo_quotes_abt_rep = subset_democrats.loc[
    subset_democrats["quotation"].str.contains(pattern_list_rep, regex=True)
]

In [710]:
# Quotes said by Republicans that mention a Democrat name or keyword
rep_quotes_abt_demo = subset_republicans.loc[
    subset_republicans["quotation"].str.contains(pattern_list_dem, regex=True)
]

## Sentiment analysis of the quotes

Use of nltk package

In [716]:
# Fetch the VADER lexicon used by the sentiment analyzer (nltk reports it as
# up-to-date when it is already installed), then instantiate the analyzer.
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/darynabilodid/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [730]:
# demo_quotes_abt_rep.loc[0,'quotation']

In [720]:
# Inspect the quotes by Democrats that mention Republicans
demo_quotes_abt_rep

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,subset,...,deathYear,congresses,middleName,unaccentedMiddleName,nickName,honorificPrefix,honorificSuffix,position,stateName,parties
128,2018-02-09-161223,without assurances in the house similar to tho...,david price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2018-02-09 00:00:00,1.0,"[[David Price, 0.451], [Alma Adams, 0.2769], [...",[http://www.wfae.org/post/congressional-reps-d...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",Eugene,Eugene,,,,Representative,NC,Democrat
246,2018-02-22-103522,"republicans have just been awol,",david price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2018-02-22 15:10:15,1.0,"[[David Price, 0.7867], [None, 0.2133]]",[http://chapelboro.com/news/safety/local-elect...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",Eugene,Eugene,,,,Representative,NC,Democrat
462,2018-12-14-089596,the republican super majority in the general a...,david price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2018-12-14 22:24:23,3.0,"[[David Price, 0.5509], [None, 0.2263], [Alma ...",[https://www.heraldonline.com/news/state/north...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",Eugene,Eugene,,,,Representative,NC,Democrat
1149,2018-01-10-080507,no state has suffered more than north carolina...,david price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2018-01-10 01:43:07,21.0,"[[David Price, 0.8098], [None, 0.1838], [Dalla...",[http://www.thestate.com/latest-news/article19...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",Eugene,Eugene,,,,Representative,NC,Democrat
1341,2018-11-19-078881,"speaking of republican appropriators, they've ...",david price,"[Q1176177, Q16063598, Q20804677, Q20973688, Q3...",2018-11-19 23:34:11,3.0,"[[David Price, 0.6931], [None, 0.2306], [Presi...",[http://p.washingtontimes.com/news/2018/nov/19...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",Eugene,Eugene,,,,Representative,NC,Democrat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501939,2018-11-01-130743,we always thought it important to kind of batt...,paul hodes,[Q959262],2018-11-01 00:00:00,1.0,"[[Paul Hodes, 0.7709], [None, 0.202], [Annie K...",[http://www.nhpr.org/post/dems-midterm-resist-...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",,,,,,Representative,NH,Democrat
502378,2018-11-13-099395,the vietnamese and the republicans are... tryi...,loretta sanchez,[Q469094],2018-11-13 05:50:00,1.0,"[[Loretta Sanchez, 0.8057], [None, 0.1564], [L...",[http://frontpagemag.com/fpm/271908/crooks-and...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",,,,,,Representative,CA,Democrat
502396,2018-03-19-016320,ca political was contracted to provide modeled...,kay hagan,[Q261003],2018-03-19 21:59:40,1.0,"[[Kay Hagan, 0.6904], [None, 0.1662], [Thom Ti...",[http://www.wral.com/tillis-may-have-benefited...,E,True,...,2019.0,"[{'position': 'Senator', 'congressNumber': 111...",,,,,,Senator,NC,Democrat
502545,2018-06-18-005041,an opposition research arm of the republican n...,howard berman,[Q545221],2018-06-18 07:16:19,1.0,"[[Howard Berman, 0.5521], [None, 0.3539], [Ada...",[http://dailycaller.com/2018/06/18/democrats-a...,E,True,...,,"[{'position': 'Representative', 'congressNumbe...",Lawrence,Lawrence,,,,Representative,CA,Democrat
