In [1]:
# questions I have as I think about this more:
# when we do the whole deduplication, we should only be doing that on TRAIN, right?

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import hstack, vstack
import pandas as pd
import numpy as np
import copy
import time
import pickle

# Show up to 100 characters per cell so response_text stays readable in displayed frames.
# The fully qualified option name is used on purpose: pandas resolves abbreviated
# names by pattern matching, which can break if a similarly named option is added.
pd.set_option('display.max_colwidth', 100)

In [2]:
# Sub-directory (appended to the data folder) that holds this run's train/dev CSVs.
splits = "splits_1571429760//"

In [3]:
# Root data directory, plus the train/dev CSV locations for the selected split.
data_folder = "data//"
train_path = f"{data_folder}{splits}train.csv"
dev_path = f"{data_folder}{splits}dev.csv"

In [51]:
# Destinations for the pickled classifier and the pickled vectorizer.
pkl_filename = "models//first_model.pkl"
vectorizer_filename = "models//first_vectorizer.pkl"

In [5]:
# Load the training split; the first CSV column becomes the row index.
train_df = pd.read_csv(train_path, index_col=0)

  mask |= (ar1 == a)


In [10]:
# Hashing-based text featurizer with default settings.
# NOTE(review): scikit-learn documents HashingVectorizer as stateless (no learned
# vocabulary), which is what makes chunk-by-chunk vectorization possible.
vectorizer = HashingVectorizer()

In [8]:
responses = train_df.response_text
xs = []

# Vectorize in fixed-size chunks because converting the whole column to
# strings and hashing it in a single call does not fit in memory.
# The chunk boundaries are derived from the data length, so no rows are
# dropped if the training set grows past 10 chunks and no empty chunks are
# processed if it shrinks.
CHUNK_SIZE = 1000000
for i, start in enumerate(range(0, len(responses), CHUNK_SIZE), start=1):
    print("starting chunk {}".format(i))
    chunk = responses.iloc[start:start + CHUNK_SIZE]
    # transform (not fit_transform): the hashing vectorizer has nothing to fit,
    # and each chunk must be mapped into the same feature space.
    # apply(str) turns non-string cells (e.g. NaN) into text the vectorizer accepts.
    xs.append(vectorizer.transform(chunk.apply(str)))

starting chunk 1
starting chunk 2
starting chunk 3
starting chunk 4
starting chunk 5
starting chunk 6
starting chunk 7
starting chunk 8
starting chunk 9
starting chunk 10


In [9]:
# Recombine the chunks: stack the per-chunk sparse matrices row-wise, in chunk
# order, into a single training matrix.
X = vstack(xs)

In [10]:
# Sanity check: one row per training response, one column per hashed feature.
X.shape

(9879016, 1048576)

In [11]:
# Training labels: the original poster's gender for each response row.
Y = train_df["op_gender"].values

In [12]:
# Logistic-regression classifier with default hyperparameters; random_state is
# pinned so repeated runs are comparable.
model = LogisticRegression(random_state=0)

In [15]:
# Time the fit so the training cost is recorded next to the model.
time_start = time.time()

model.fit(X, Y)

# Split the elapsed wall-clock time with divmod instead of gmtime/strftime:
# "%M" wraps at 60, so a fit lasting an hour or more would be under-reported.
# Minutes are therefore a running total and are not capped at 59.
elapsed_seconds = int(time.time() - time_start)
minutes, seconds = divmod(elapsed_seconds, 60)
timeStr = "{:02d} minutes, {:02d} seconds".format(minutes, seconds)

print("Model trained in {}".format(timeStr))



Model trained in 31 minutes, 08 seconds


In [26]:
# Persist the fitted classifier so later sessions can reload it instead of refitting.
with open(pkl_filename, mode='wb') as model_out:
    pickle.dump(model, model_out)

In [53]:
# Persist the vectorizer alongside the model so both can be restored together.
with open(vectorizer_filename, mode='wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

In [7]:
# Restore the previously pickled classifier.
# Only unpickle files produced by this project: pickle.load can run arbitrary code.
with open(pkl_filename, mode='rb') as model_in:
    model = pickle.load(model_in)

In [11]:
# Load the dev split; the first CSV column becomes the row index.
dev_df = pd.read_csv(dev_path, index_col=0)

  mask |= (ar1 == a)


In [12]:
# Featurize the dev responses with transform, never fit_transform: held-out data
# must not be used for fitting. With a stateless hashing vectorizer the features
# are the same either way, but transform stays correct if the vectorizer is ever
# swapped for one that learns a vocabulary.
# apply(str) turns non-string cells (e.g. NaN) into text the vectorizer accepts.
dev_x = vectorizer.transform(dev_df.response_text.apply(str))

In [13]:
# Dev labels: the original poster's gender for each response row.
dev_y = dev_df["op_gender"].values

In [14]:
# Accuracy of the model on the dev split.
# NOTE(review): a later cell notes that roughly 20% of rows are labeled W, so always
# predicting M would already score about 0.80 — compare against that baseline.
model.score(dev_x, dev_y)

0.824028187798284

In [15]:
# Hard label prediction for every dev response.
preds = model.predict(dev_x)

In [18]:
# Per-class probabilities for every dev response (one column per class).
probs = model.predict_proba(dev_x)

In [20]:
# Sanity check: one row per dev response, one column per class.
probs.shape

(2292907, 2)

In [22]:
# Attach the predictions to the dev frame so errors can be read next to the text.
# Note: this adds a column to dev_df in place.
dev_df['preds'] = preds

In [25]:
# Split the probability matrix into one array per class.
# The column for each label is looked up in model.classes_ rather than assuming
# M is column 0 and W is column 1, and the array is sliced directly instead of
# looping over millions of rows in Python.
class_order = list(model.classes_)
prob_m = probs[:, class_order.index('M')]
prob_f = probs[:, class_order.index('W')]

In [26]:
# Store the per-class probabilities next to the predictions (adds columns in place).
dev_df['prob_m'] = prob_m
dev_df['prob_f'] = prob_f

In [28]:
# Spot-check the preds / prob_m / prob_f columns on the first few rows.
dev_df.head()

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,preds,prob_m,prob_f
0,1965080,86681682,M,29909,James,Thanks for passing this ridiculous legislation...,Jim Banks,Congress_Republican,M,0.608282,0.391718
1,1965081,86681682,M,29909,Martha,Handsome young man on the left. 😂 mom,Jim Banks,Congress_Republican,M,0.875024,0.124976
2,1965082,86681682,M,29911,Preston,You're sideways!,Jim Banks,Congress_Republican,M,0.625179,0.374821
3,1965083,86681682,M,29911,Bobby,Hi Congressman Banks,Jim Banks,Congress_Republican,M,0.984224,0.015776
4,1965084,86681682,M,29911,Landon,"Jim, I love your office. The 5th floor is for ...",Jim Banks,Congress_Republican,M,0.824908,0.175092


In [29]:
#### keep only the rows whose predicted label disagrees with the true op_gender
is_wrong = dev_df['op_gender'].ne(dev_df['preds'])
wrong_preds = dev_df[is_wrong]

In [32]:
# errors broken down by true label: although only around 20% of the data set
# is labeled W, about 3/4 of the model's mistakes fall on those rows
wrong_preds['op_gender'].value_counts()

W    302271
M    101216
Name: op_gender, dtype: int64

In [36]:
# the model predicts W only about 6.7% as often as it predicts M
# (this is a W:M ratio; as a share of all predictions W is roughly 6.3%)
pred_counts = dev_df['preds'].value_counts()  # counted once, reused for both labels
pred_counts['W'] / pred_counts['M']

0.0667531703813018

In [38]:
# mistakes where the true label is M (so the model predicted W)
label_m_pred_f = wrong_preds[wrong_preds['op_gender'] == 'M']

In [43]:
# mistakes where the true label is W (so the model predicted M)
label_f_pred_m = wrong_preds[wrong_preds['op_gender'] == 'W']

In [41]:
# wow, fascinating - these (true label M, predicted W) are mostly about Planned Parenthood
# or, more generally, about women's reproductive rights
label_m_pred_f.head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,preds,prob_m,prob_f
39,1965119,86681682,M,29911,Linda,If you repal the ACA and defund Planned Parent...,Jim Banks,Congress_Republican,W,0.415952,0.584048
49,1965129,86681682,M,29911,Debbie,Will you please find MIA Todd Young?,Jim Banks,Congress_Republican,W,0.0138,0.9862
72,1965152,86681682,M,29914,Amanda,"You seem confused by facts, so let me enlighte...",Jim Banks,Congress_Republican,W,0.408738,0.591262
75,1965155,86681682,M,29914,Sharon,Abortion rates are the lowest since row vs wad...,Jim Banks,Congress_Republican,W,0.458014,0.541986
76,1965156,86681682,M,29914,Lauren,How I spent my morning. Defending Planned Pare...,Jim Banks,Congress_Republican,W,0.444759,0.555241
82,1965162,86681682,M,29914,Beverly,"The majority of Americans are prochoice, and s...",Jim Banks,Congress_Republican,W,0.425613,0.574387
86,1965166,86681682,M,29914,Leah,For the areas without community health centers...,Jim Banks,Congress_Republican,W,0.41175,0.58825
87,1965167,86681682,M,29914,Desirea,I SUPPORT PLANNED PARENTHOOD,Jim Banks,Congress_Republican,W,0.397278,0.602722
91,1965171,86681682,M,29914,William,Today many Hoosiers are more concerned about t...,Jim Banks,Congress_Republican,W,0.499514,0.500486
93,1965173,86681682,M,29914,William,"Congressman Banks, I have a bill that could be...",Jim Banks,Congress_Republican,W,0.371448,0.628552


In [44]:
# these (true label W, predicted M) seem more random
label_f_pred_m.head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,preds,prob_m,prob_f
96309,2252167,47977187,W,43581,Pete,What a joke,Catherine Cortez Masto,Congress_Democratic,M,0.770286,0.229714
96310,2252168,47977187,W,43581,Ann,SEN Cortez Masto: I would appreciate if you w...,Catherine Cortez Masto,Congress_Democratic,M,0.740732,0.259268
96311,2252169,47977187,W,43581,Marc,i voted for you please dont us down .,Catherine Cortez Masto,Congress_Democratic,M,0.682052,0.317948
96312,2252170,47977187,W,43581,David,Persista!,Catherine Cortez Masto,Congress_Democratic,M,0.741214,0.258786
96314,2252172,47977187,W,43581,Colleen,Thank you,Catherine Cortez Masto,Congress_Democratic,M,0.636884,0.363116
96315,2252173,47977187,W,43581,Joseph,As We The People. We Must Not In Anyway Mistre...,Catherine Cortez Masto,Congress_Democratic,M,0.709795,0.290205
96316,2252174,47977187,W,43581,Ted,please keep fighting for us. we need you now m...,Catherine Cortez Masto,Congress_Democratic,M,0.576119,0.423881
96317,2252175,47977187,W,43582,Samantha,Thank you!,Catherine Cortez Masto,Congress_Democratic,M,0.636884,0.363116
96318,2252176,47977187,W,43582,Joshua,Catherine Cortez Masto on the plus side... y'a...,Catherine Cortez Masto,Congress_Democratic,M,0.865525,0.134475
96319,2252177,47977187,W,43582,Tony,Sessions is an outright racist,Catherine Cortez Masto,Congress_Democratic,M,0.559563,0.440437


In [54]:
## most confident "W" mistakes on M-labeled rows: when a comment mentions a
## female congressperson, the model is very confident the label should be female
label_m_pred_f.sort_values('prob_f', ascending=False).head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,preds,prob_m,prob_f
900841,5149895,46506154,M,161080,Peter,Patty murray. Moron,Marco Rubio,Congress_Republican,W,3.070231e-08,1.0
1310057,7728351,99469231,M,296100,Katy,Patty,John McCain,Congress_Republican,W,5.887948e-08,1.0
986684,5235738,46506154,M,161309,Rita,"You go Marsha Marsha Marsha, PURE KNOWLEDGE!!",Marco Rubio,Congress_Republican,W,1.853294e-07,1.0
349253,2621876,46566554,M,67098,John,YOU SEND COPY TO Congresswoman Vicky Hartzler,Sam Graves,Congress_Republican,W,2.02756e-07,1.0
900859,5149913,46506154,M,161080,John,Patty Murray looks possessed!,Marco Rubio,Congress_Republican,W,2.043567e-07,1.0
1651787,10132646,51495713,M,370671,Wayne,MIA.,Richard Shelby,Congress_Republican,W,2.382162e-07,1.0
1589124,10036994,49028900,M,365372,Robert,Support Stop Arming Terrorist Act Congresswoman Tulsi Gabbard,Steve King,Congress_Republican,W,3.224174e-07,1.0
1308319,7726613,99469231,M,296100,Mark,Shaheen Khal,John McCain,Congress_Republican,W,3.738764e-07,1.0
944650,5193704,46506154,M,161264,Suzette,I trust my Congresswoman Marsha Blackburn to do the right thing on this.,Marco Rubio,Congress_Republican,W,5.342833e-07,0.999999
900951,5150005,46506154,M,161080,Verna,Patty Murray needs to go along with Maria Cantwell,Marco Rubio,Congress_Republican,W,5.811695e-07,0.999999


In [55]:
# same thing the other way around! The most confident "M" mistakes on W-labeled
# rows are responses that name male congresspeople
label_f_pred_m.sort_values('prob_m', ascending=False).head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,preds,prob_m,prob_f
2004432,11053824,86569077,W,390077,Linda,Voted against VAWA 02/28/13 Aderholt Amash Bachmann Barton Bentivolio Bilirakis Bishop (UT) Blac...,Kirsten Gillibrand,Congress_Democratic,M,1.0,1.379446e-08
1981579,11030971,86569077,W,389860,Rob,amar Alexander (R-TN) Kelly Ayotte (R-NH) John Barrasso (R-WY) Max Baucus (D-MT) Roy Blunt (R-MO...,Kirsten Gillibrand,Congress_Democratic,M,1.0,6.874396e-08
2003909,11053301,86569077,W,390072,Matt,Rand paul,Kirsten Gillibrand,Congress_Democratic,M,1.0,2.583584e-07
1957936,11007328,86569077,W,389621,Rob,Rand paul,Kirsten Gillibrand,Congress_Democratic,M,1.0,2.583584e-07
2119143,11260123,77234050,W,399691,Brad,Amash Bachmann Barton Bass Bentivolio Bridenstine Brooks (AL) Broun (GA) Burgess Chabot...,Cathy McMorris Rodgers,Congress_Republican,M,1.0,3.44058e-07
2001015,11050407,86569077,W,390043,Cheryl,"Here is the list of senators threatening to filibuster: Rand Paul, (R-Ky), Mitch McConnell (R-Ky...",Kirsten Gillibrand,Congress_Democratic,M,0.999998,1.563278e-06
2133995,11274975,77234050,W,399828,Shawn,Compiled below is the list of every current member of Congress who voted to extend the USA PATRI...,Cathy McMorris Rodgers,Congress_Republican,M,0.999998,1.795796e-06
389197,3139288,23984618,W,83580,Ali,Rand paul 2016,Ann Wagner,Congress_Republican,M,0.999997,3.490953e-06
1938986,10988378,86569077,W,389432,Robert,"Oh look, it's Rand Paul, Hi Rand!",Kirsten Gillibrand,Congress_Democratic,M,0.999996,4.067776e-06
2002576,11051968,86569077,W,390054,Keith,GO RAND PAUL,Kirsten Gillibrand,Congress_Democratic,M,0.999996,4.430485e-06


In [65]:
### how many distinct posts/posters are represented in the wrong set, compared to the whole thing?
### 28,522 posts received at least one wrong prediction
wrong_preds['post_id'].value_counts()

389341    10714
399296     8149
399295     2093
296324     2032
389337     1912
          ...  
256150        1
467504        1
402305        1
395797        1
292078        1
Name: post_id, Length: 28522, dtype: int64

In [57]:
# number of wrong predictions per congressperson
wrong_preds['op_name'].value_counts()

Kirsten Gillibrand        130715
Cathy McMorris Rodgers     83279
Marco Rubio                29859
Ann Wagner                 25707
John McCain                12613
                           ...  
Salud Carbajal                67
Sander Levin                  62
Trent Kelly                   52
Cedric Richmond               47
Adriano Espaillat             12
Name: op_name, Length: 61, dtype: int64

In [58]:
# the full dev set also contains 61 congresspeople in total,
# which means we got at least one prediction wrong for every one of them
dev_df['op_name'].value_counts()

Marco Rubio               632376
John McCain               301337
Kirsten Gillibrand        150081
Cathy McMorris Rodgers     94169
Paul Ryan                  93554
                           ...  
Salud Carbajal               963
Jim Banks                    774
Cedric Richmond              694
Sander Levin                 498
Adriano Espaillat            138
Name: op_name, Length: 61, dtype: int64

In [67]:
# for the top 5 CONGRESSPEOPLE by number of wrong predictions:
# for several of them we got most of our preds wrong,
# whereas for the others we only got a few wrong
for problem_poster_id in [86569077, 77234050, 46506154, 23984618, 99469231]:
    # count matching rows straight from the boolean masks
    n_wrong = int((wrong_preds['op_id'] == problem_poster_id).sum())
    n_total = int((dev_df['op_id'] == problem_poster_id).sum())
    proportion_wrong = n_wrong / n_total
    print("proportion wrong for {}: {}".format(problem_poster_id, proportion_wrong))

proportion wrong for 86569077: 0.870963013306148
proportion wrong for 77234050: 0.8843568477949219
proportion wrong for 46506154: 0.047217161941629664
proportion wrong for 23984618: 0.9305364511691885
proportion wrong for 99469231: 0.04185679156558936


In [68]:
# for the top 5 POSTS by number of wrong predictions,
# we generally got a high proportion of that post's responses wrong
for problem_post_id in [389341, 399296, 399295, 296324, 389337]:
    # count matching rows straight from the boolean masks
    n_wrong = int((wrong_preds['post_id'] == problem_post_id).sum())
    n_total = int((dev_df['post_id'] == problem_post_id).sum())
    proportion_wrong = n_wrong / n_total
    print("proportion wrong for {}: {}".format(problem_post_id, proportion_wrong))

proportion wrong for 389341: 0.9582327162150076
proportion wrong for 399296: 0.8380296174413822
proportion wrong for 399295: 0.8009950248756219
proportion wrong for 296324: 0.3650080833483025
proportion wrong for 389337: 0.9100428367444074


In [63]:
# Load the original posts so the text of a problematic post can be looked up by post_id.
posts_path = f"{data_folder}facebook_congress_posts.csv"
posts_df = pd.read_csv(posts_path)

In [74]:
# Print the id and text of each of the five posts with the most wrong predictions.
for problem_post_id in [389341, 399296, 399295, 296324, 389337]:
    given_post = posts_df[posts_df['post_id'] == problem_post_id]
    # take the first matching row's fields
    print(given_post['post_id'].iloc[0])
    print(given_post['post_text'].iloc[0])
    print()

389341
A number of you have reached out to me about your concerns with Betsy DeVos's nomination to be Education Secretary. I want to let you know that I share your concerns. I will be voting against her confirmation and I will urge my fellow Senators to do the same. Students, parents and teachers deserve an Education Secretary whose commitment to public education and safe schools will not waver. If public education fails, America fails, and I do not believe Mrs. DeVos shares my commitment to a strong public education system.

399296
This week marks the 5th anniversary of #Obamacare being signed into law. Whether it's turned your tax filing into a nightmare, you're facing skyrocketing premiums, or your employer has reduced your work hours, I want to hear about it.   Please share your story with me so that I can better understand the challenges you're facing: http://mcmorris.house.gov/your-story/

399295
After five years of #Obamacare, too many Americans are suffering. They deserve to be

In [78]:
# responses to post 389341 that the model most confidently labeled M
post_responses = dev_df[dev_df['post_id'] == 389341]
post_responses.sort_values('prob_m', ascending=False).head(20)

Unnamed: 0,original_idx,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category,preds,prob_m,prob_f
1923633,10973025,86569077,W,389341,Harry,Where does Chuck Schumer stand??,Kirsten Gillibrand,Congress_Democratic,M,0.999958,4.2e-05
1916326,10965718,86569077,W,389341,Sheldon,Schumer's puppet,Kirsten Gillibrand,Congress_Democratic,M,0.999904,9.6e-05
1923276,10972668,86569077,W,389341,Kristina,*Chuck:(,Kirsten Gillibrand,Congress_Democratic,M,0.999868,0.000132
1919076,10968468,86569077,W,389341,Dan,Is congressman Schumer also going to vote no?,Kirsten Gillibrand,Congress_Democratic,M,0.999843,0.000157
1915774,10965166,86569077,W,389341,Florencia,"Thank you, Senator! I hope Senator Chuck Schumer (Chuck Schumer D-NY) shares your views and also...",Kirsten Gillibrand,Congress_Democratic,M,0.999791,0.000209
1918220,10967612,86569077,W,389341,Monique,Just called Schumer. 👍,Kirsten Gillibrand,Congress_Democratic,M,0.999556,0.000444
1924580,10973972,86569077,W,389341,Rick,Did Chuck Schumer tell you to say that??,Kirsten Gillibrand,Congress_Democratic,M,0.999547,0.000453
1920355,10969747,86569077,W,389341,Linda,Counting on you and Chuck Schumer to hold them accountable.,Kirsten Gillibrand,Congress_Democratic,M,0.999478,0.000522
1920323,10969715,86569077,W,389341,Mike,Please convince Senator Burr and others...,Kirsten Gillibrand,Congress_Democratic,M,0.999434,0.000566
1925226,10974618,86569077,W,389341,Mary,Thank you Senator! Now what about Chuck Schumer?,Kirsten Gillibrand,Congress_Democratic,M,0.999432,0.000568
