In [149]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd
import numpy as np
import random
import copy
import math
import time
import os

In [2]:
# Paths of the two input CSV files, relative to the working directory.
posts = "data//facebook_congress_posts.csv"
responses = "data//facebook_congress_responses.csv"

In [48]:
# Load the posts table from the CSV file named by `posts`.
posts_df = pd.read_csv(posts)

In [4]:
# Load the responses table from the CSV file named by `responses`.
responses_df = pd.read_csv(responses)

In [49]:
# Preview the first rows of the posts table.
posts_df.head()

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type
0,57265377,M,0,"Yesterday, my colleagues and I voted to protec...",video
1,57265377,M,1,Roses are red...and so is Texas. Let's keep it...,video
2,57265377,M,2,#TBT to this classic video. #DonkeyWhisperer,video
3,57265377,M,3,Since President Donald J. Trump was sworn in o...,video
4,57265377,M,4,Remembering our 40th president today. LIKE to ...,video


In [7]:
# Preview the first rows of the responses table.
responses_df.head()

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican


In [8]:
## Fill in the poster names that were missing initially (found in the EDA notebook).
## Each (op_id, op_name) pair overwrites `op_name` on every response row for that poster.
missing_op_names = (
    (42721680, 'Scott Tipton'),
    (44922372, 'Martin Heinrich'),
    (54138093, 'Mac Thornberry'),
    (96418867, 'Darren Soto'),
)
for poster_id, poster_name in missing_op_names:
    responses_df.loc[responses_df.op_id == poster_id, 'op_name'] = poster_name

Split version 1: split on politician id only, so that every response to a given politician lands in the same subset, because what we're trying to predict is a property of the individual politician.

In [112]:
# Unique poster ids in order of first appearance in the responses table.
all_unique_posters = responses_df.op_id.drop_duplicates().values
# Shuffle a copy in place with a fixed seed (7) so the split is reproducible;
# the unshuffled array is kept because its length drives the split sizes.
all_unique_posters2 = all_unique_posters.copy()
random.Random(7).shuffle(all_unique_posters2)
# Peek at the first five shuffled ids.
all_unique_posters2[:5]

array([84331141, 23790289,  4499588, 74629159, 33547363])

In [113]:
# Index where the training slice ends: floor of 70% of the unique poster count.
seventy_percent_split = math.floor(.7 * len(all_unique_posters))

In [114]:
# Index where the second slice ends: the 70% cutoff plus floor(15% of posters).
# Despite the name, this is the cumulative (~85%) boundary, not a 15% count.
fifteen_percent_split = seventy_percent_split + math.floor(.15*len(all_unique_posters))

In [115]:
# Show the value of the cumulative (70% + 15%) boundary index.
fifteen_percent_split

341

In [116]:
# Show the value of the 70% boundary index.
seventy_percent_split

281

In [117]:
# Cut the shuffled poster ids at the two boundaries:
# [0, 70%) -> train, [70%, ~85%) -> test, [~85%, end) -> dev.
train_idxs, test_idxs, dev_idxs = np.split(
    all_unique_posters2,
    [seventy_percent_split, fifteen_percent_split],
)

In [128]:
def determine_gender_split_of_data_subsets(idx_list, list_title, source_df=None):
    """Print the share of posters in ``idx_list`` whose ``op_gender`` is ``'M'``.

    Parameters
    ----------
    idx_list : list-like of ``op_id`` values identifying the posters to inspect.
    list_title : str
        Label interpolated into the printed line (e.g. ``"Train"``).
    source_df : pandas.DataFrame, optional
        Frame with ``op_id`` and ``op_gender`` columns used to look up each
        poster's gender. Defaults to the notebook-level ``posts_df``.

    Notes
    -----
    Each poster is counted once, using the gender on its first row in the
    lookup frame. Ids that do not occur in the lookup frame are ignored. When
    no id matches at all, the proportion is printed as ``nan`` instead of
    raising ``ZeroDivisionError``.
    """
    if source_df is None:
        source_df = posts_df

    # Collapse to one row per poster *before* filtering so we never build the
    # full posters-x-posts join just to throw most of it away.
    gender_by_poster = source_df[['op_id', 'op_gender']].drop_duplicates(subset='op_id')
    matched = gender_by_poster[gender_by_poster['op_id'].isin(idx_list)]

    if len(matched) == 0:
        percent_male = float('nan')
    else:
        percent_male = (matched['op_gender'] == 'M').sum() / len(matched)
    print("Proportion male in {} set: {:.3f}".format(list_title, percent_male))

In [129]:
# Report the male share of posters for each subset, in train / test / dev order.
for subset_ids, subset_label in (
    (train_idxs, "Train"),
    (test_idxs, "Test"),
    (dev_idxs, "Dev"),
):
    determine_gender_split_of_data_subsets(subset_ids, subset_label)

Proportion male in Train set: 0.776
Proportion male in Test set: 0.717
Proportion male in Dev set: 0.738


In [189]:
# All response rows whose poster belongs to the training id set.
train_rows = responses_df.loc[responses_df['op_id'].isin(train_idxs)]

In [190]:
# All response rows whose poster belongs to the test id set.
test_rows = responses_df.loc[responses_df['op_id'].isin(test_idxs)]

In [191]:
# All response rows whose poster belongs to the dev id set.
dev_rows = responses_df.loc[responses_df['op_id'].isin(dev_idxs)]

In [192]:
def determine_relative_dataset_sizes(train_rows, test_rows, dev_rows):
    """Print each subset's share of the combined row count.

    Each argument only needs to support ``len()``. Three lines are printed,
    in train / test / dev order, with the proportion to three decimals.
    """
    # Pair each output template with its subset's row count.
    report = [
        ("Proportion train: {:.3f}", len(train_rows)),
        ("Proportion test: {:.3f}", len(test_rows)),
        ("Proportion dev: {:.3f}", len(dev_rows)),
    ]
    full_size = sum(count for _, count in report)
    for template, count in report:
        print(template.format(count / full_size))
    
# Report what share of all response rows falls into each subset.
determine_relative_dataset_sizes(train_rows, test_rows, dev_rows)

Proportion train: 0.712
Proportion test: 0.122
Proportion dev: 0.165


In [194]:
# Tag the output folder with the current Unix time in whole seconds,
# so the folder name differs from one run to the next.
epoch = int(time.time())
subfolder = "data//splits_{}".format(epoch)

In [195]:
# Create the output folder for this run. `makedirs(..., exist_ok=True)` keeps the
# cell re-runnable (no FileExistsError when the folder is already there) and
# also creates the parent folder if it is missing.
os.makedirs(subfolder, exist_ok=True)

In [196]:
# Output file paths for the three subsets, all inside this run's subfolder.
train_path = subfolder + "//train.csv"
test_path = subfolder + "//test.csv"
dev_path = subfolder + "//dev.csv"

In [197]:
def write_to_csv(dataset, path):
    """Write ``dataset`` to ``path`` as CSV, keeping its original row labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to persist. Its index is moved into a leading column named
        ``original_idx``; every other column keeps its own name.
    path : str
        Destination CSV file.

    Notes
    -----
    Only the column produced by ``reset_index`` is renamed. The previous
    hard-coded eight-name list relabelled columns by position and raised
    ``ValueError`` for any frame without exactly seven columns. For the
    responses schema used in this notebook the output is unchanged.

    As before, ``to_csv`` also writes the fresh 0..n-1 index as a first,
    unnamed column; this is kept so existing readers of these files still
    see the same layout.
    """
    rows = dataset.reset_index()
    # The first column is whatever reset_index produced from the old index.
    rows = rows.rename(columns={rows.columns[0]: 'original_idx'})
    rows.to_csv(path)

In [198]:
# Persist the three subsets (dev, train, test) to their CSV files.
for subset_rows, subset_path in (
    (dev_rows, dev_path),
    (train_rows, train_path),
    (test_rows, test_path),
):
    write_to_csv(subset_rows, subset_path)