In [1]:
import nltk
from nltk import word_tokenize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import sentiwordnet as swn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import random
import time



In [2]:
# Read the cleaned responses CSV and preview its first rows.
responses_csv = "../data/responses_smt_clean.csv"
responses = pd.read_csv(responses_csv)
responses.head()

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.3
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.0
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.0
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.0
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.8


In [3]:
# Keep only the rows that have a missing value in at least one column.
has_missing = responses.isna().any(axis=1)
resp_nans = responses.loc[has_missing]
resp_nans

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment


## Add Gender of Responder

In [15]:
import gender_guesser.detector as gender

In [21]:
# Guess each responder's gender from the name stored in `responder_id`,
# using a case-insensitive gender_guesser detector. The raw detector label
# is written to the new `resp_gender` column.
d = gender.Detector(case_sensitive=False)
responses['resp_gender'] = responses['responder_id'].apply(d.get_gender)
# Show a bounded preview instead of rendering the entire frame.
responses.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name,type,state,region,resp_gender
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.300000,Williams,Roger,rep,TX,Region6,male
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.000000,Williams,Roger,rep,TX,Region6,female
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,female
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,male
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.800000,Williams,Roger,rep,TX,Region6,female
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937,1.000000,Williams,Roger,rep,TX,Region6,female
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000,0.000000,Williams,Roger,rep,TX,Region6,female
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,female
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795,1.785714,Williams,Roger,rep,TX,Region6,male
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140,0.000000,Williams,Roger,rep,TX,Region6,male


In [6]:
# Distinct values currently present in the resp_gender column.
pd.unique(responses['resp_gender'])

array(['male', 'female', 'mostly_female', 'andy', 'mostly_male',
       'unknown'], dtype=object)

In [22]:
# Collapse the detector labels to single-letter codes: androgynous ('andy')
# and 'unknown' names become 'U'; 'mostly_male'/'mostly_female' are treated
# as male/female.
gender_dict = {'male':'M', 'female':'F', 'mostly_female':'F', 'andy':'U', 'mostly_male':'M',
       'unknown':'U'}
# Fall back to the current value when it is not a key of gender_dict, so that
# re-running this cell leaves already-coded 'M'/'F'/'U' values intact. A plain
# .map(gender_dict) would turn them into NaN on the second run.
responses['resp_gender'] = responses['resp_gender'].map(
    lambda label: gender_dict.get(label, label))
# Show a bounded preview instead of rendering the entire frame.
responses.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name,type,state,region,resp_gender
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.300000,Williams,Roger,rep,TX,Region6,M
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.000000,Williams,Roger,rep,TX,Region6,F
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,F
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,M
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.800000,Williams,Roger,rep,TX,Region6,F
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937,1.000000,Williams,Roger,rep,TX,Region6,F
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000,0.000000,Williams,Roger,rep,TX,Region6,F
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,F
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795,1.785714,Williams,Roger,rep,TX,Region6,M
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140,0.000000,Williams,Roger,rep,TX,Region6,M


## Add State and Region for Poster

In [4]:
# Legislator lookup table: last/first name, chamber type (e.g. sen, rep) and state.
legislators_csv = '../data/legislators.csv'
legislators = pd.read_csv(legislators_csv)
legislators.head()

Unnamed: 0,last_name,first_name,type,state
0,Brown,Sherrod,sen,OH
1,Cantwell,Maria,sen,WA
2,Cardin,Benjamin,sen,MD
3,Carper,Thomas,sen,DE
4,Casey,Robert,sen,PA


In [5]:
# Break each poster's full name at the first space: the leading token becomes
# first_name and everything after it becomes last_name, matching the column
# names used by the legislators table.
names = responses['op_name'].str.split(pat=" ", n=1, expand=True)
responses['last_name'], responses['first_name'] = names[1], names[0]

In [6]:
# Hand-curated first-name overrides, keyed by the poster's display name
# (op_name). The value replaces the first name obtained by splitting op_name.
# NOTE(review): initial-style values such as 'C.' or 'H.' presumably mirror
# how the legislators table records the first name -- confirm.
mismatch_fnames = {
    "Tom Carper": "Thomas",
    "Ben Cardin": "Benjamin",
    "Jim Himes": "James",
    "Mike Simpson": "Michael",
    "Stephen F. Lynch": "Stephen",
    "Dave Reichert": "David",
    "Al Franken": "Alan",
    "Dutch Ruppersberger": "C.",
    "Pat Tiberi": "Patrick",
    "Bob Menendez": "Robert",
    "Johnny Isakson": "John",
    "Pat Toomey": "Patrick",
    "Mark Sanford": "Marshall",
    "Jan Schakowsky": "Janice",
    "Pete Visclosky": "Peter",
    "Sean Patrick Maloney": "Sean",
    "Mike Enzi": "Michael",
    "Dan Lipinski": "Daniel",
    "Chuck Schumer": "Charles",
    "Rob Wittman": "Robert",
    "Pat Meehan": "Patrick",
    "Rob Portman": "Robert",
    "Morgan Griffith": "H.",
    "Don Beyer": "Donald",
    "Brad Schneider": "Bradley",
    "Drew Ferguson": "A.",
    "Ben Sasse": "Benjamin",
    "Mike Crapo": "Michael",
    "Jim Clyburn": "James",
    "Dave Brat": "David",
    "Tom O'Halleran": "Thomas",
    "Dan Kildee": "Daniel",
    "Ted Deutch": "Theodore",
    "Charlie Dent": "Charles",
    "John B. Larson": "John",
    "Gerry Connolly": "Gerald",
    "Bob Latta": "Robert",
    "Donald McEachin": "A.",
    "Jim Inhofe": "James",
    "Ed Royce": "Edward",
    "Matt Cartwright": "Matthew",
    "Tim Kaine": "Timothy",
    "Hank Johnson": "Henry",
    "Ed Markey": "Edward",
    "Dick Durbin": "Richard",
    "Lou Correa": "J.",
    "Dave Loebsack": "David",
    "Bernie Sanders": "Bernard",
    "Rick Nolan": "Richard",
    "Mike Conaway": "K.",
    # Bob Casey Jr. is Robert Patrick Casey Jr., Senator (D-PA).
    # Robert Casey TX: not in our db.
    "Bob Casey Jr.": "Robert",
    "Chuck Grassley": "Charles",
    "Jim Renacci": "James",
    "Hal Rogers": "Harold",
    "Mike Capuano": "Michael",
    "Beto O'Rourke": "Beto",
    "Maggie Hassan": "Margaret",
    "Anthony G. Brown": "Anthony",
    "G. K. Butterfield": "George",
    "Mike Turner": "Michael",
    "Rick Crawford": "Eric",
    "Jim Sensenbrenner": "F.",
    "Shelley Moore Capito": "Shelley",
}

In [7]:
# Hand-curated last-name overrides, keyed by the poster's display name
# (op_name). The value replaces the last name obtained by splitting op_name.
# NOTE(review): the O’Halleran and O’Rourke values use a typographic
# apostrophe (U+2019) while their keys use a plain one -- presumably to match
# the spelling in the legislators table; confirm.
mismatch_lnames = {
    "Tom Carper": "Carper",
    "Ben Cardin": "Cardin",
    "Jim Himes": "Himes",
    "Mike Simpson": "Simpson",
    "Stephen F. Lynch": "Lynch",
    "Dave Reichert": "Reichert",
    "Al Franken": "Franken",
    "Dutch Ruppersberger": "Ruppersberger",
    "Pat Tiberi": "Tiberi",
    "Bob Menendez": "Menendez",
    "Johnny Isakson": "Isakson",
    "Pat Toomey": "Toomey",
    "Mark Sanford": "Sanford",
    "Jan Schakowsky": "Schakowsky",
    "Pete Visclosky": "Visclosky",
    "Sean Patrick Maloney": "Maloney",
    "Mike Enzi": "Enzi",
    "Dan Lipinski": "Lipinski",
    "Chuck Schumer": "Schumer",
    "Rob Wittman": "Wittman",
    "Pat Meehan": "Meehan",
    "Rob Portman": "Portman",
    "Morgan Griffith": "Griffith",
    "Don Beyer": "Beyer",
    "Brad Schneider": "Schneider",
    "Drew Ferguson": "Ferguson",
    "Ben Sasse": "Sasse",
    "Mike Crapo": "Crapo",
    "Jim Clyburn": "Clyburn",
    "Dave Brat": "Brat",
    "Tom O'Halleran": "O’Halleran",
    "Dan Kildee": "Kildee",
    "Ted Deutch": "Deutch",
    "Charlie Dent": "Dent",
    "John B. Larson": "Larson",
    "Gerry Connolly": "Connolly",
    "Bob Latta": "Latta",
    "Donald McEachin": "McEachin",
    "Jim Inhofe": "Inhofe",
    "Ed Royce": "Royce",
    "Matt Cartwright": "Cartwright",
    "Tim Kaine": "Kaine",
    "Hank Johnson": "Johnson",
    "Ed Markey": "Markey",
    "Dick Durbin": "Durbin",
    "Lou Correa": "Correa",
    "Dave Loebsack": "Loebsack",
    "Bernie Sanders": "Sanders",
    "Rick Nolan": "Nolan",
    "Mike Conaway": "Conaway",
    # Bob Casey Jr. is Robert Patrick Casey Jr., Senator (D-PA).
    # Robert Casey TX: not in our db.
    "Bob Casey Jr.": "Casey",
    "Chuck Grassley": "Grassley",
    "Jim Renacci": "Renacci",
    "Hal Rogers": "Rogers",
    "Mike Capuano": "Capuano",
    "Beto O'Rourke": "O’Rourke",
    "Maggie Hassan": "Hassan",
    "Anthony G. Brown": "Brown",
    "G. K. Butterfield": "Butterfield",
    "Mike Turner": "Turner",
    "Rick Crawford": "Crawford",
    "Jim Sensenbrenner": "Sensenbrenner",
    "Shelley Moore Capito": "Capito",
}

In [8]:
# For posters whose op_name is a key of an override table, replace the
# auto-split first/last name with the hand-curated value. Every other row
# keeps the name produced by the split.
for name_col, overrides in (('first_name', mismatch_fnames),
                            ('last_name', mismatch_lnames)):
    needs_override = responses['op_name'].isin(overrides.keys())
    # The right-hand side is aligned on the index, so only masked rows are written.
    responses.loc[needs_override, name_col] = responses['op_name'].map(overrides)
# Show a bounded preview instead of rendering the entire frame.
responses.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.300000,Williams,Roger
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.000000,Williams,Roger
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.800000,Williams,Roger
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937,1.000000,Williams,Roger
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000,0.000000,Williams,Roger
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795,1.785714,Williams,Roger
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140,0.000000,Williams,Roger


In [9]:
# Attach each poster's chamber type and state from the legislators table.
# how='left' keeps every response row; type/state stay missing when no
# legislator matches on (last_name, first_name).
# validate='many_to_one' makes pandas raise a MergeError if a
# (last_name, first_name) pair occurs more than once in `legislators`, which
# would otherwise silently duplicate response rows.
responses = responses.merge(
    legislators,
    on=['last_name', 'first_name'],
    how='left',
    validate='many_to_one',
)
# Show a bounded preview instead of rendering the entire frame.
responses.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name,type,state
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.300000,Williams,Roger,rep,TX
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.000000,Williams,Roger,rep,TX
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.800000,Williams,Roger,rep,TX
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937,1.000000,Williams,Roger,rep,TX
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000,0.000000,Williams,Roger,rep,TX
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795,1.785714,Williams,Roger,rep,TX
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140,0.000000,Williams,Roger,rep,TX


In [10]:
# Manually set chamber type and state for op_id 87147344.
# NOTE(review): presumably this poster found no match in the legislators
# merge -- confirm which legislator the id refers to.
optype = {87147344: 'rep'}
state = {87147344: 'AZ'}
for target_col, patch in (('type', optype), ('state', state)):
    patched_rows = responses['op_id'].isin(patch.keys())
    responses.loc[patched_rows, target_col] = responses['op_id'].map(patch)

In [23]:
# List the rows that still have a missing value in at least one column.
# NOTE(review): the saved output of this cell is a MemoryError; re-run on a
# fresh kernel to confirm it completes.
resp_nans = responses.loc[responses.isna().any(axis="columns")]
resp_nans

MemoryError: 

### Add the region based on the state

Standard Federal Regional Boundaries

![alt text](../data/USFederalRegions.svg)
By Belg4mit - Own work, Public Domain, https://commons.wikimedia.org/w/index.php?curid=10180327

In [12]:
# State/territory postal codes grouped by region label; flattened below into
# the code -> region lookup.
_codes_by_region = {
    'Region1': ('CT', 'MA', 'ME', 'NH', 'RI', 'VT'),
    'Region2': ('NJ', 'NY', 'PR', 'VI'),
    'Region3': ('DC', 'DE', 'MD', 'PA', 'VA', 'WV'),
    'Region4': ('AL', 'FL', 'GA', 'KY', 'MS', 'NC', 'SC', 'TN'),
    'Region5': ('IL', 'IN', 'MI', 'MN', 'OH', 'WI'),
    'Region6': ('AR', 'LA', 'NM', 'OK', 'TX'),
    'Region7': ('IA', 'KS', 'MO', 'NE'),
    'Region8': ('CO', 'MT', 'ND', 'SD', 'UT', 'WY'),
    'Region9': ('AS', 'AZ', 'CA', 'GU', 'HI', 'MP', 'NV'),
    'Region10': ('AK', 'ID', 'OR', 'WA'),
}
# Sorting by code keeps the keys in alphabetical order.
regions_dict = dict(sorted(
    (code, region)
    for region, codes in _codes_by_region.items()
    for code in codes
))

In [13]:
# Look up the region label for each poster's state. A state that is not a key
# of regions_dict (or is missing) yields NaN.
responses['region'] = responses['state'].map(regions_dict)
# Show a bounded preview instead of rendering the entire frame.
responses.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name,type,state,region
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.300000,Williams,Roger,rep,TX,Region6
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.000000,Williams,Roger,rep,TX,Region6
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.800000,Williams,Roger,rep,TX,Region6
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937,1.000000,Williams,Roger,rep,TX,Region6
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000,0.000000,Williams,Roger,rep,TX,Region6
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795,1.785714,Williams,Roger,rep,TX,Region6
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140,0.000000,Williams,Roger,rep,TX,Region6


In [14]:
# Persist the frame without the index column (the header row is written by default).
responses.to_csv('../data/resp_smt_state.csv', index=False)

In [24]:
# Write the frame to CSV, omitting the index column; the header row is
# included by default.
responses.to_csv(path_or_buf='../data/resp_smt_gender_state.csv', index=False)

In [2]:
# Reload the saved gender/state-enriched responses from disk.
gender_state_csv = '../data/resp_smt_gender_state.csv'
responses = pd.read_csv(gender_state_csv)

In [3]:
# Show a bounded preview of the reloaded frame instead of rendering all of it.
responses.head(10)

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name,type,state,region,resp_gender
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican,-1,-0.7458,-1.300000,Williams,Roger,rep,TX,Region6,M
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican,1,0.3302,0.000000,Williams,Roger,rep,TX,Region6,F
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,F
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,M
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican,0,-0.0821,-0.800000,Williams,Roger,rep,TX,Region6,F
5,57265377,M,0,Candice,"Women, the ""hosts"" of the unborn babies you're...",Roger Williams,Congress_Republican,2,1.5937,1.000000,Williams,Roger,rep,TX,Region6,F
6,57265377,M,0,Cheri,I am Pro Choice and always will be,Roger Williams,Congress_Republican,0,0.0000,0.000000,Williams,Roger,rep,TX,Region6,F
7,57265377,M,0,Julie,Thank you for taking a stand!!!!,Roger Williams,Congress_Republican,0,0.3612,0.000000,Williams,Roger,rep,TX,Region6,F
8,57265377,M,0,Stephen,I seem to disagree with you again Mr.Williams....,Roger Williams,Congress_Republican,1,-0.1795,1.785714,Williams,Roger,rep,TX,Region6,M
9,57265377,M,0,Tony,Please demonstrate your support by providing p...,Roger Williams,Congress_Republican,1,1.2140,0.000000,Williams,Roger,rep,TX,Region6,M


In [4]:
# Select the rows of the reloaded frame that have at least one missing value.
resp_nans = responses.loc[lambda frame: frame.isna().any(axis=1)]
resp_nans

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,HL_sentiment,V_sentiment,TB_sentiment,last_name,first_name,type,state,region,resp_gender
