In [1]:
import pickle
import numpy as np
from tensorflow.contrib import predictor
from nltk import word_tokenize
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.linear_model import LogisticRegression

This is the notebook for the voting system done by the three LSTMs, each trained on a different given dataset.

All datasets are loaded, evaluated by all models, and then results are saved to another CSV file. Those examples for which all models agree correctly are saved to a new dataset to be used in a separate LSTM.

# mark votes

In [5]:
train = pd.read_csv('df/train_df.csv')
test = pd.read_csv('df/test_df.csv')
byarticle = pd.read_csv('df/byarticle_df.csv')

In [8]:
max_sent_length = 100
word_embed_size = 50
embed_vocab_size = len(word2idx.keys())
word2idx = pickle.load(open("word-embeddings/embed_total_idx.pkl", "rb" ))

def process(sent):
    # Takes text, transforms to padded, uniform length vectors of word indices in embedding.
    
    vector = np.zeros([max_sent_length])
    
    if type(sent) == str:
            sent = word_tokenize(sent)
    elif type(sent) == np.str_:
            sent = word_tokenize(sent)
    else:
        sent = []
            
    for word_idx in range(max_sent_length):
        try:
            word = str(sent[word_idx]).lower()
            vector[word_idx] = word2idx[word]
        except KeyError:
            vector[word_idx] = embed_vocab_size
        except IndexError:
            vector[word_idx] = embed_vocab_size

    return [vector]

In [4]:
train_pred = predictor.from_saved_model("/project/cramerus/LSTM-text-train")
test_pred = predictor.from_saved_model("/project/cramerus/LSTM-text-test")
byarticle_pred = predictor.from_saved_model("/project/cramerus/LSTM-text-byarticle")

INFO:tensorflow:Restoring parameters from /project/cramerus/LSTM-text-train/variables/variables
INFO:tensorflow:Restoring parameters from /project/cramerus/LSTM-text-test/variables/variables
INFO:tensorflow:Restoring parameters from /project/cramerus/LSTM-text-byarticle/variables/variables


In [17]:
def add_pred_cols(df):
    
    df['train_pred'] = np.nan
    df['train_pred'] = df['train_pred'].astype(object)
    df['test_pred'] = np.nan
    df['test_pred'] = df['test_pred'].astype(object)
    df['byarticle_pred'] = np.nan
    df['byarticle_pred'] = df['byarticle_pred'].astype(object)

    for idx, row in tqdm_notebook(df.iterrows(), total = df.shape[0]):
        if pd.isnull(row['text']):
            continue
        else:
            df.at[idx, 'train_pred'] = train_pred({"Sentence": process(row['text'])})['Prediction'][0]
            df.at[idx, 'test_pred'] = test_pred({"Sentence": process(row['text'])})['Prediction'][0]
            df.at[idx, 'byarticle_pred'] = byarticle_pred({"Sentence": process(row['text'])})['Prediction'][0]
            
    return df        

In [20]:
byarticle = add_pred_cols(byarticle)

HBox(children=(IntProgress(value=0, max=645), HTML(value='')))




In [21]:
test = add_pred_cols(test)

HBox(children=(IntProgress(value=0, max=150000), HTML(value='')))




In [23]:
byarticle.to_csv('df/byarticle_df_vote.csv', index=False)

In [24]:
test.to_csv('df/test_df_vote.csv', index=False)

In [25]:
train = add_pred_cols(train)

HBox(children=(IntProgress(value=0, max=600000), HTML(value='')))

In [26]:
train.to_csv('df/train_df_vote.csv', index=False)

In [27]:
train.head()

Unnamed: 0,title,text,label,bias,train_pred,test_pred,byarticle_pred
0,After DeVos Announced Plans To Reexamine Title...,When explaining her decision to reevaluate Tit...,True,right,0,1,1
1,University To Award Trayvon Martin With Posthu...,A Florida university will honor Trayvon Martin...,True,right,1,0,1
2,Texas State University suspends Greek life aft...,Texas State University has suspended all Greek...,False,right-center,0,1,1
3,Jewish Organization's Huge Day Of Unity On Tue...,Against the backdrop of an increasingly polari...,True,right,1,1,1
4,"BREAKING: Trump Reaches Agreement To Keep 1,00...",President-elect Donald Trump has reached an ag...,True,right,1,1,0


# explore votes

In [3]:
train = pd.read_csv('df/train_df_vote.csv').dropna()
test = pd.read_csv('df/test_df_vote.csv').dropna()
byarticle = pd.read_csv('df/byarticle_df_vote.csv').dropna()

In [5]:
train['train_pred'] = train['train_pred'].astype(int)
train['test_pred'] = train['test_pred'].astype(int)
train['byarticle_pred'] = train['byarticle_pred'].astype(int)
train['label'] = train['label'].astype(int)

test['train_pred'] = test['train_pred'].astype(int)
test['test_pred'] = test['test_pred'].astype(int)
test['byarticle_pred'] = test['byarticle_pred'].astype(int)
test['label'] = test['label'].astype(int)

byarticle['train_pred'] = byarticle['train_pred'].astype(int)
byarticle['test_pred'] = byarticle['test_pred'].astype(int)
byarticle['byarticle_pred'] = byarticle['byarticle_pred'].astype(int)
byarticle['label'] = byarticle['label'].astype(int)

In [56]:
combined = pd.concat([byarticle[byarticle['label']==True].sample(200),
                      byarticle[byarticle['label']==False].sample(200),
                      test[test['label']==True].sample(200),
                      test[test['label']==False].sample(200),
                      train[train['label']==True].sample(200),
                      train[train['label']==False].sample(200)])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [69]:
clf = LogisticRegression()
clf.fit(combined[['train_pred', 'test_pred', 'byarticle_pred']], combined['label'])

print('byarticle score: ')
print(clf.score(byarticle[['train_pred', 'test_pred', 'byarticle_pred']], byarticle['label']))
print('train score: ')
print(clf.score(train[['train_pred', 'test_pred', 'byarticle_pred']], train['label']))
print('test score: ')
print(clf.score(test[['train_pred', 'test_pred', 'byarticle_pred']], test['label']))



byarticle score: 
0.6201550387596899
train score: 
0.6852039451688399
test score: 
0.633235304796137


In [68]:
comb_df = pd.concat([byarticle[byarticle['label']==True].sample(200),
                      byarticle[byarticle['label']==False].sample(200),
                      test[test['label']==True].sample(200),
                      test[test['label']==False].sample(200),
                      train[train['label']==True].sample(200),
                      train[train['label']==False].sample(200)
                    ])

clf = LogisticRegression()
clf.fit(comb_df[['train_pred', 'test_pred', 'byarticle_pred']], comb_df['label'])

print('training score: ')
print(clf.score(comb_df[['train_pred', 'test_pred', 'byarticle_pred']], comb_df['label']))
print('byarticle score: ')
print(clf.score(byarticle[['train_pred', 'test_pred', 'byarticle_pred']], byarticle['label']))
print('train score: ')
print(clf.score(train[['train_pred', 'test_pred', 'byarticle_pred']], train['label']))
print('test score: ')
print(clf.score(test[['train_pred', 'test_pred', 'byarticle_pred']], test['label']))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


training score: 
0.6483333333333333
byarticle score: 
0.6201550387596899
train score: 
0.6852039451688399
test score: 
0.633235304796137


In [83]:
train_agree = train[train['label'] == train['train_pred']][train['label'] == train['test_pred']][train['label'] == train['byarticle_pred']].drop('bias', axis=1)
test_agree = test[test['label'] == test['train_pred']][test['label'] == test['test_pred']][test['label'] == test['byarticle_pred']].drop('bias', axis=1)
byarticle_agree = byarticle[byarticle['label'] == byarticle['train_pred']][byarticle['label'] == byarticle['test_pred']][byarticle['label'] == byarticle['byarticle_pred']]

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [97]:
count = 0
for idx, row in train_agree.iterrows():
    print(str(row['label']) + '\t' + row['text'][:100])
    count += 1
    if count > 50:
        break

1	Against the backdrop of an increasingly polarized political landscape, one organization is issuing a
0	LINCOLN, Neb. (AP) _ The winning numbers in Tuesday evening’s drawing of the “2 By 2” game were: Red
0	The Centers for Medicare and Medicaid Services said Friday after markets closed that it expects cost
0	Wildlife removal experts discovered a large venomous rattlesnake underneath a home in Louisiana. Gul
0	Just touch the stamp with your finger, and the heat transforms the image of the blacked-out sun into
1	Empires die. Some abruptly, by cataclysmic defeat and destruction; most in stages, over years, decad
0	LAS CRUCES, N.M. — Authorities have identified an elderly couple who were found shot to death inside
1	Google Inc's posted a better-than-expected quarterly profit for the first time in the last six quart
0	CONCORD, N.H. (AP) _ The winning numbers in Saturday evening's drawing of the "Megabucks Plus" game 
1	In this segment, they discuss a nonfamous fellow, DuPont employee Willi

In [98]:
train_agree.to_csv('df/train_agree_df.csv', index=False)
test_agree.to_csv('df/test_agree_df.csv', index=False)
byarticle_agree.to_csv('df/byarticle_agree_df.csv', index=False)

In [6]:
train_agree = train[train['label'] == train['train_pred']][train['label'] == train['test_pred']].drop('bias', axis=1)
test_agree = test[test['label'] == test['train_pred']][test['label'] == test['test_pred']].drop('bias', axis=1)

  """Entry point for launching an IPython kernel.
  


In [2]:
train_agree = pd.read_csv('data/agree/train_agree_df.csv')
test_agree = pd.read_csv('data/agree/test_agree_df.csv')
byarticle_agree = pd.read_csv('data/agree/byarticle_agree_df.csv')

In [4]:
print(len(train_agree))
print(sum(train_agree.label))

133868
46449


In [5]:
print(len(test_agree))
print(sum(test_agree.label))

28048
13856


In [6]:
print(len(byarticle_agree))
print(sum(byarticle_agree.label))

130
86


In [7]:
133868+28048+130

162046

In [8]:
46449+13856+86

60391

In [9]:
60391/162046

0.3726781284326673