In [None]:
# Open question to revisit:
# deduplication should only be applied to the TRAIN split (not dev), right?

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from scipy.sparse import hstack, vstack, csr_matrix
import pandas as pd
import numpy as np
import copy
import time
import pickle
import os

# Show up to 100 characters per cell when DataFrames are displayed.
# The fully qualified option name is used because pandas resolves abbreviated
# names such as 'max_colwidth' by pattern matching, which can become ambiguous
# if another option with a similar name is added.
pd.set_option('display.max_colwidth', 100)

In [2]:
# Sub-folder (inside the data folder) that holds the train/dev split used here.
# The trailing "//" serves as the separator when paths are built by concatenation.
splits = "splits_1571429760//"

In [3]:
# Locations of the train and dev CSVs for the selected split.
data_folder = "data//"
train_path, dev_path = (
    "{}{}{}".format(data_folder, splits, csv_name)
    for csv_name in ("train.csv", "dev.csv")
)

In [67]:
# Artifacts for this model variant (classifier + vectorizer) share one sub-folder.
models_folder = "models//"
this_model_folder = "unigram_with_sa_and_responder//"
model_dir = models_folder + this_model_folder
pkl_filename = model_dir + "model.pkl"
vectorizer_filename = model_dir + "vectorizer.pkl"

In [20]:
# Load the training split; the first CSV column becomes the DataFrame index.
train_df = pd.read_csv(train_path, index_col=0)

In [7]:
# First of three per-response files that are later joined on 'original_idx'.
# Presumably these hold the VADER sentiment scores (file name) — confirm the schema.
first_sa = pd.read_csv('Vader_resps_sm1.csv')

In [17]:
# Remaining two parts of the per-response file set (same role as first_sa).
second_sa = pd.read_csv('Vader_resps_sm2.csv')
third_sa = pd.read_csv('Vader_resps_sm3.csv')

In [21]:
# Stack the three parts row-wise into one lookup table.
# The original row indices of each part are kept (no ignore_index), so the
# index of all_sa may contain repeats; the later merges join on a column instead.
all_sa = pd.concat([first_sa, second_sa, third_sa])

In [23]:
# Attach the all_sa columns to each training row via 'original_idx'.
# merge defaults to an inner join, so training rows without a match in all_sa
# are dropped and duplicated keys in all_sa would duplicate rows.
# NOTE(review): this overwrites train_df, so re-running the cell merges a second
# time; restart from the read_csv cell before re-running.
train_df = train_df.merge(all_sa, on='original_idx')

In [24]:
# Distribution of responder gender labels (M / F / U) in the merged training data.
train_df.resp_gender.value_counts()

M    4926003
F    4153796
U     799217
Name: resp_gender, dtype: int64

In [31]:
# Quick look at the merged training rows.
train_df.head()

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,Vader_sent,resp_gender
0,0,57265377,M,0,Jerry,Protecting birth is not the same as protecting life. You may very well pledge to the former but ...,Roger Williams,Congress_Republican,-0.8483,M
1,1,57265377,M,0,Andrea,You need to protect children and leave my body to me.,Roger Williams,Congress_Republican,0.34,F
2,2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican,0.3612,F
3,3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican,0.3612,M
4,4,57265377,M,0,Joy,"Unwanted pregnancy is a sad and unfortunate situation for anyone to find themselves in, however,...",Roger Williams,Congress_Republican,-0.666,F


In [25]:
# Hashing-based text vectorizer with library-default settings.
# It keeps no vocabulary, so the same object can be reused on any split.
vectorizer = HashingVectorizer()

In [26]:
responses = train_df.response_text
xs = []

# Vectorize in fixed-size chunks because converting the whole column to
# strings at once does not fit in memory.
CHUNK_SIZE = 1000000
# Derive the number of chunks from the data (ceiling division) instead of
# hard-coding 10, so rows beyond the first 10 million are never silently skipped
# and smaller inputs do not produce empty chunks.
n_chunks = max(1, -(-len(responses) // CHUNK_SIZE))
for i in range(1, n_chunks + 1):
    print("starting chunk {}".format(i))
    chunk = responses.iloc[(i - 1) * CHUNK_SIZE:i * CHUNK_SIZE]
    # HashingVectorizer is stateless, so transform alone is sufficient;
    # every entry is coerced to a string before hashing.
    X = vectorizer.transform(chunk.apply(lambda x: np.str_(x)))
    xs.append(X)

starting chunk 1
starting chunk 2
starting chunk 3
starting chunk 4
starting chunk 5
starting chunk 6
starting chunk 7
starting chunk 8
starting chunk 9
starting chunk 10


In [27]:
# Recombine the per-chunk sparse matrices row-wise into one feature matrix.
X = vstack(xs)

In [28]:
# Sanity check: (number of training rows, number of hashed text features).
X.shape

(9879016, 1048576)

In [49]:
def responder_gender_to_number(row, which):
    """Return 1 when ``row['resp_gender']`` equals ``which``, otherwise 0."""
    return 1 if row['resp_gender'] == which else 0
    
# 0/1 indicator: responder labelled 'M'.
# Computed as a vectorized comparison instead of a row-wise apply: it yields the
# same 0/1 integers, is far faster on millions of rows, and avoids the fragile
# args=('M') form (a bare string, not a tuple, which only works for 1-char labels).
train_df['male_ind'] = (train_df['resp_gender'] == 'M').astype('int64')

In [51]:
# 0/1 indicators for responders labelled 'F' and 'U'.
# Vectorized comparisons replace the row-wise apply: same 0/1 integers, far
# faster on millions of rows, and no reliance on args=('F') being a bare string
# rather than a tuple.
train_df['female_ind'] = (train_df['resp_gender'] == 'F').astype('int64')
train_df['unknown_ind'] = (train_df['resp_gender'] == 'U').astype('int64')

In [52]:
# Wrap each indicator column in a sparse matrix so it can be stacked next to X.
# A 1-D array becomes a single-row matrix here, hence the transpose applied later.
male_ind_sparse, female_ind_sparse, unknown_ind_sparse = (
    csr_matrix(train_df[indicator_column].values)
    for indicator_column in ('male_ind', 'female_ind', 'unknown_ind')
)

In [61]:
# Append the three responder-gender indicators as extra feature columns.
# NOTE: X is overwritten, so running this cell twice appends the columns twice.
gender_indicator_columns = [
    row_matrix.T
    for row_matrix in (male_ind_sparse, female_ind_sparse, unknown_ind_sparse)
]
X = hstack([X] + gender_indicator_columns)

In [64]:
# Append the sentiment score as one more feature column.
# NOTE: X is overwritten, so running this cell twice appends the column twice.
vader_sent_column = csr_matrix(train_df.Vader_sent.values).T
X = hstack([X, vader_sent_column])

In [66]:
# Target labels: gender of the original poster for each response row.
Y = train_df.op_gender.values

In [68]:
# Logistic regression with library-default hyperparameters; only the seed is fixed.
model = LogisticRegression(random_state=0)

In [69]:
# Fit the classifier and record how long training took.
time_start = time.time()

model.fit(X, Y)
elapsed_seconds = int(time.time() - time_start)

# Format with divmod rather than time.gmtime/strftime: "%M" wraps at 60, so a
# run longer than an hour would be under-reported (e.g. 75 minutes shown as 15).
elapsed_minutes, leftover_seconds = divmod(elapsed_seconds, 60)
timeStr = "{:02d} minutes, {:02d} seconds".format(elapsed_minutes, leftover_seconds)

print("Model trained in {}".format(timeStr))



Model trained in 39 minutes, 14 seconds


In [74]:
# makedirs(exist_ok=True) also creates the parent models folder when it is
# missing and keeps this cell safe to re-run; os.mkdir raised in both cases.
os.makedirs(models_folder + this_model_folder, exist_ok=True)

In [75]:
# Persist the fitted classifier to pkl_filename.
with open(pkl_filename, 'wb') as model_out:
    pickle.dump(model, model_out)

In [76]:
# Persist the vectorizer alongside the model so both can be reloaded together.
with open(vectorizer_filename, 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

In [77]:
# Reload the classifier from the file written above.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open(pkl_filename, 'rb') as model_in:
    model = pickle.load(model_in)

In [78]:
# Load the dev split; the first CSV column becomes the DataFrame index.
dev_df = pd.read_csv(dev_path, index_col=0)

  mask |= (ar1 == a)


In [79]:
# Featurize the dev responses with the same vectorizer used for training.
# HashingVectorizer is stateless, so there is nothing to fit; calling transform
# (not fit_transform) makes explicit that dev data never influences the features.
# Every entry is coerced to a string before hashing.
dev_x = vectorizer.transform(dev_df.response_text.apply(lambda x: np.str_(x)))

In [80]:
# Attach the all_sa columns to each dev row via 'original_idx' (inner join by default).
# NOTE(review): dev_x was built from dev_df.response_text in the previous cell,
# i.e. before this merge. The later hstack of dev_x with columns taken from the
# merged frame is only row-aligned if this merge neither drops, duplicates nor
# reorders rows — the training cells merge first and vectorize afterwards.
dev_df = dev_df.merge(all_sa, on='original_idx')

In [81]:
# 0/1 responder-gender indicators for the dev split.
# Vectorized comparisons replace the row-wise apply: same 0/1 integers, far
# faster, and no reliance on args=('M') being a bare string rather than a tuple.
dev_df['male_ind'] = (dev_df['resp_gender'] == 'M').astype('int64')
dev_df['female_ind'] = (dev_df['resp_gender'] == 'F').astype('int64')
dev_df['unknown_ind'] = (dev_df['resp_gender'] == 'U').astype('int64')

In [82]:
# Sparse single-row versions of the dev indicator columns (transposed when stacked).
dev_male_ind_sparse, dev_female_ind_sparse, dev_unknown_ind_sparse = (
    csr_matrix(dev_df[indicator_column].values)
    for indicator_column in ('male_ind', 'female_ind', 'unknown_ind')
)

In [83]:
# Build the dev feature matrix with the same column layout as the training X:
# hashed text features, then the three gender indicators, then the sentiment score.
# NOTE: dev_x is overwritten, so running this cell twice appends the columns twice.
dev_gender_columns = [
    row_matrix.T
    for row_matrix in (dev_male_ind_sparse, dev_female_ind_sparse, dev_unknown_ind_sparse)
]
dev_x = hstack([dev_x] + dev_gender_columns)
dev_sentiment_column = csr_matrix(dev_df.Vader_sent.values).T
dev_x = hstack([dev_x, dev_sentiment_column])

In [85]:
# Gold labels for the dev split, taken from the merged dev frame.
dev_y = dev_df.op_gender.values

In [86]:
# Accuracy on the dev split.
# NOTE(review): a later cell observes that only ~20% of rows are labelled W, so
# always predicting M would already score around 0.8 — judge this number against
# that baseline.
model.score(dev_x, dev_y)

0.8240827037468157

In [87]:
# Predicted label for every dev row.
preds = model.predict(dev_x)

In [88]:
# Per-class probabilities for every dev row (one column per class).
probs = model.predict_proba(dev_x)

In [89]:
# Sanity check: (number of dev rows, number of classes).
probs.shape

(2292907, 2)

In [90]:
# Store predictions next to the gold labels for error analysis.
dev_df['preds'] = preds

In [91]:
# Split the two probability columns into per-class lists.
# Assumes model.classes_ is ordered ['M', 'W'], i.e. column 0 is M and column 1
# is the other class (named prob_f here) — TODO confirm.
prob_m = list(probs[:, 0])
prob_f = list(probs[:, 1])

In [92]:
# Keep both class probabilities on the dev frame for inspection.
dev_df['prob_m'] = prob_m
dev_df['prob_f'] = prob_f

In [93]:
# Quick look at dev rows with features, predictions and probabilities attached.
dev_df.head()

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,Vader_sent,resp_gender,male_ind,female_ind,unknown_ind,preds,prob_m,prob_f
0,1965080,86681682,M,29909,James,Thanks for passing this ridiculous legislation. https://www.youtube.com/watch?v=-y4wd8roYrE,Jim Banks,Congress_Republican,0.0065,M,1,0,0,M,0.613098,0.386902
1,1965081,86681682,M,29909,Martha,Handsome young man on the left. 😂 mom,Jim Banks,Congress_Republican,0.4939,F,0,1,0,M,0.874693,0.125307
2,1965082,86681682,M,29911,Preston,You're sideways!,Jim Banks,Congress_Republican,0.0,M,1,0,0,M,0.628705,0.371295
3,1965083,86681682,M,29911,Bobby,Hi Congressman Banks,Jim Banks,Congress_Republican,0.0,M,1,0,0,M,0.984654,0.015346
4,1965084,86681682,M,29911,Landon,"Jim, I love your office. The 5th floor is for winners.",Jim Banks,Congress_Republican,0.8074,M,1,0,0,M,0.827806,0.172194


In [94]:
# Collect every dev row whose prediction disagrees with its gold label.
is_misclassified = dev_df['op_gender'] != dev_df['preds']
wrong_preds = dev_df[is_misclassified]

In [95]:
# Errors broken down by gold label: even though only around 20% of the data set
# has label W, about 3/4 of the model's mistakes fall on those rows.
wrong_preds.op_gender.value_counts()

W    302257
M    101105
Name: op_gender, dtype: int64

In [96]:
# Ratio of W predictions to M predictions (not a share of all predictions):
# the model predicts W only about 6.7% as often as it predicts M,
# which is roughly 6.3% of all predictions.
dev_df.preds.value_counts()['W'] / dev_df.preds.value_counts()['M']

0.06670503176751306

In [97]:
# Misclassified rows whose gold label is M (so the model predicted W).
label_m_pred_f = wrong_preds[wrong_preds.op_gender=='M']

In [98]:
# Misclassified rows whose gold label is W (so the model predicted M).
# Note the variable name says "f" while the label value in the data is 'W'.
label_f_pred_m = wrong_preds[wrong_preds.op_gender=='W']

In [99]:
# Observation: these rows are mostly about Planned Parenthood
# or, more generally, about women's reproductive rights.
label_m_pred_f.head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,Vader_sent,resp_gender,male_ind,female_ind,unknown_ind,preds,prob_m,prob_f
39,1965119,86681682,M,29911,Linda,If you repal the ACA and defund Planned Parenthood where will the poor women of this country obt...,Jim Banks,Congress_Republican,0.5423,F,0,1,0,W,0.412991,0.587009
49,1965129,86681682,M,29911,Debbie,Will you please find MIA Todd Young?,Jim Banks,Congress_Republican,-0.1613,F,0,1,0,W,0.013792,0.986208
72,1965152,86681682,M,29914,Amanda,"You seem confused by facts, so let me enlighten you. Pew research says that the majority of Amer...",Jim Banks,Congress_Republican,-0.3368,F,0,1,0,W,0.409832,0.590168
75,1965155,86681682,M,29914,Sharon,Abortion rates are the lowest since row vs wade. This is largely attributed to the provision of ...,Jim Banks,Congress_Republican,0.3612,F,0,1,0,W,0.45379,0.54621
76,1965156,86681682,M,29914,Lauren,How I spent my morning. Defending Planned Parenthood. Defending women's access to healthcare. [[...,Jim Banks,Congress_Republican,0.0,F,0,1,0,W,0.443739,0.556261
82,1965162,86681682,M,29914,Beverly,"The majority of Americans are prochoice, and support PP. They prevent unwanted pregnancies, and ...",Jim Banks,Congress_Republican,-0.9027,F,0,1,0,W,0.42758,0.57242
86,1965166,86681682,M,29914,Leah,"For the areas without community health centers, Planned Parenthood fills in the gaps. I stand wi...",Jim Banks,Congress_Republican,-0.5759,F,0,1,0,W,0.413486,0.586514
87,1965167,86681682,M,29914,Desirea,I SUPPORT PLANNED PARENTHOOD,Jim Banks,Congress_Republican,0.4019,U,0,0,1,W,0.374897,0.625103
93,1965173,86681682,M,29914,William,"Congressman Banks, I have a bill that could be proposed before the House that would help prevent...",Jim Banks,Congress_Republican,0.9214,M,1,0,0,W,0.374035,0.625965
94,1965174,86681682,M,29914,Tandy,PP provides SAFE CHOICES and so much more .. [[PHOTO]],Jim Banks,Congress_Republican,0.5622,U,0,0,1,W,0.496846,0.503154


In [100]:
# Observation: no obvious shared topic here — these errors look more random.
label_f_pred_m.head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,Vader_sent,resp_gender,male_ind,female_ind,unknown_ind,preds,prob_m,prob_f
96309,2252167,47977187,W,43581,Pete,What a joke,Catherine Cortez Masto,Congress_Democratic,0.296,M,1,0,0,M,0.773734,0.226266
96310,2252168,47977187,W,43581,Ann,SEN Cortez Masto: I would appreciate if you would add to your website a way to write our commen...,Catherine Cortez Masto,Congress_Democratic,0.6808,F,0,1,0,M,0.740188,0.259812
96311,2252169,47977187,W,43581,Marc,i voted for you please dont us down .,Catherine Cortez Masto,Congress_Democratic,0.3182,M,1,0,0,M,0.687293,0.312707
96312,2252170,47977187,W,43581,David,Persista!,Catherine Cortez Masto,Congress_Democratic,0.0,M,1,0,0,M,0.745369,0.254631
96314,2252172,47977187,W,43581,Colleen,Thank you,Catherine Cortez Masto,Congress_Democratic,0.3612,F,0,1,0,M,0.638145,0.361855
96315,2252173,47977187,W,43581,Joseph,As We The People. We Must Not In Anyway Mistreat A Stranger From Another Country. Those Foreigne...,Catherine Cortez Masto,Congress_Democratic,-0.9331,M,1,0,0,M,0.718859,0.281141
96316,2252174,47977187,W,43581,Ted,please keep fighting for us. we need you now more than ever!,Catherine Cortez Masto,Congress_Democratic,-0.126,M,1,0,0,M,0.58419,0.41581
96317,2252175,47977187,W,43582,Samantha,Thank you!,Catherine Cortez Masto,Congress_Democratic,0.4199,F,0,1,0,M,0.637812,0.362188
96318,2252176,47977187,W,43582,Joshua,Catherine Cortez Masto on the plus side... y'all can do him what obama did to General Petraeus.....,Catherine Cortez Masto,Congress_Democratic,-0.5423,M,1,0,0,M,0.868839,0.131161
96319,2252177,47977187,W,43582,Tony,Sessions is an outright racist,Catherine Cortez Masto,Congress_Democratic,-0.6124,M,1,0,0,M,0.562177,0.437823
