In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
from tqdm import tqdm
import ast
import csv
import glob
import pickle

csv.field_size_limit(100000000)

131072

# Read in csv datasets

In [3]:
files = glob.glob('data/train_df_spacy*.csv')

In [5]:
# attempting on a .1 random sampled set

test_df = pd.read_csv('data/test_df_spacy.csv').sample(frac=.1, random_state=123)

frames = []
for file in files:
    frames.append(pd.read_csv(file).sample(frac=.1, random_state=123))
train_df = pd.concat(frames)

In [13]:
test_df.head()

Unnamed: 0,title,label,bias,title_lemmas,title_pos,title_tags,title_deps
0,SAN FRANCISCO / Head of Juvenile Probation Dep...,False,left-center,"('san', 'francisco', '/', 'head', 'of', 'juven...","('PROPN', 'PROPN', 'SYM', 'PROPN', 'ADP', 'PRO...","('NNP', 'NNP', 'SYM', 'NNP', 'IN', 'NNP', 'NNP...","('compound', 'nsubj', 'punct', 'appos', 'prep'..."
1,"University leaders ban pro-life flag display, ...",True,right,"('university', 'leader', 'ban', 'pro', '-', 'l...","('NOUN', 'NOUN', 'VERB', 'ADJ', 'PUNCT', 'NOUN...","('NN', 'NNS', 'VBP', 'JJ', ',', 'NN', 'NN', 'N...","('compound', 'nsubj', 'ROOT', 'amod', 'punct',..."
2,"DONALD TRUMP, GET YOUR TINY PIGGY PERVERT HAND...",True,left,"('donald', 'trump', ',', 'get', 'your', 'tiny'...","('NUM', 'NOUN', 'PUNCT', 'VERB', 'ADJ', 'ADJ',...","('CD', 'NN', ',', 'VB', 'PRP$', 'JJ', 'NNP', '...","('compound', 'nsubj', 'punct', 'ROOT', 'poss',..."
3,DIRTY: Hillary Clinton Implies Trump is a Nazi...,True,right,"('dirty', ':', 'hillary', 'clinton', 'implies'...","('NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PROPN', '...","('NNS', ':', 'NNP', 'NNP', 'NNP', 'NNP', 'VBZ'...","('ROOT', 'punct', 'compound', 'nsubj', 'compou..."
4,A Bipartisan Work Plan,False,right-center,"('a', 'bipartisan', 'work', 'plan')","('DET', 'PROPN', 'NOUN', 'NOUN')","('DT', 'NNP', 'NN', 'NN')","('det', 'compound', 'compound', 'ROOT')"


In [5]:
train_df.head()

Unnamed: 0,title,label,bias,title_lemmas,title_pos,title_tags,title_deps
0,After DeVos Announced Plans To Reexamine Title...,True,right,"('after', 'devos', 'announced', 'plans', 'to',...","('ADP', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'PRO...","('IN', 'NNP', 'NNP', 'NNPS', 'IN', 'NNP', 'NNP...","('prep', 'pobj', 'compound', 'ROOT', 'prep', '..."
1,University To Award Trayvon Martin With Posthu...,True,right,"('university', 'to', 'award', 'trayvon', 'mart...","('PROPN', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'A...","('NNP', 'IN', 'NNP', 'NNP', 'NNP', 'IN', 'NNP'...","('compound', 'prep', 'pobj', 'compound', 'ROOT..."
2,Texas State University suspends Greek life aft...,False,right-center,"('texas', 'state', 'university', 'suspend', 'g...","('PROPN', 'PROPN', 'PROPN', 'VERB', 'ADJ', 'NO...","('NNP', 'NNP', 'NNP', 'VBZ', 'JJ', 'NN', 'IN',...","('compound', 'compound', 'nsubj', 'ROOT', 'amo..."
3,Jewish Organization's Huge Day Of Unity On Tue...,True,right,"('jewish', 'organization', ""'s"", 'huge', 'day'...","('PROPN', 'PROPN', 'PART', 'PROPN', 'PROPN', '...","('NNP', 'NNP', 'POS', 'NNP', 'NNP', 'IN', 'NNP...","('amod', 'poss', 'case', 'compound', 'ROOT', '..."
4,"BREAKING: Trump Reaches Agreement To Keep 1,00...",True,right,"('break', ':', 'trump', 'reaches', 'agreement'...","('VERB', 'PUNCT', 'PROPN', 'PROPN', 'NOUN', 'P...","('VBG', ':', 'NNP', 'NNP', 'NN', 'TO', 'VB', '...","('ROOT', 'punct', 'compound', 'compound', 'ROO..."


# setup for initial model experimentation

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

In [4]:
def dummy(doc):
    # use as param in vectorizers
    return ast.literal_eval(doc)

In [5]:
count_uni = CountVectorizer(stop_words = 'english',
                    analyzer='word',
                    tokenizer=dummy,
                    preprocessor=None,
                    max_features = 5000)
tfidf_uni = TfidfVectorizer(stop_words = 'english',
                    analyzer='word',
                    tokenizer=dummy,
                    preprocessor=None,
                    max_features = 5000)
count_bi = CountVectorizer(stop_words = 'english',
                    analyzer='word',
                    tokenizer=dummy,
                    preprocessor=None,
                    ngram_range=(1,2),
                    max_features = 10000)
tfidf_bi = TfidfVectorizer(stop_words = 'english',
                    analyzer='word',
                    tokenizer=dummy,
                    preprocessor=None,
                    ngram_range=(1,2),
                    max_features = 10000)

In [6]:
dt = DecisionTreeClassifier()
lr = LogisticRegression(verbose=True)
svm = LinearSVC(verbose=True)
mlp = MLPClassifier(verbose=True)

In [7]:
def model(model, vectorizer, x_train, y_train, x_test, y_test):
    # train a model (can be various kinds) & evaluate
    
    x_train_use = vectorizer.fit_transform(x_train) # if using dask, add x_train.compute()
    x_test_use = vectorizer.transform(x_test)
    
    model = model.fit(x_train_use, y_train)
    y_pred = model.predict(x_test_use)
    
    acc = sk.metrics.accuracy_score(y_test, y_pred)
    print("accuracy: {0:.2f}%".format(acc*100))
    
    mapping = list(zip(vectorizer.get_feature_names(), model.coef_[0]))
    mapping.sort(key=lambda x: abs(x[1]))
    mapping.reverse()
    
    return mapping

In [8]:
y_train = train_df['label']
y_test = test_df['label']

In [9]:
x_train = train_df['title_lemmas']
x_test = test_df['title_lemmas']

# logistic regression + count vectors: pre-vocab cut

In [10]:
lr_count_uni = model(lr, count_uni, x_train, y_train, x_test, y_test)

[LibLinear]accuracy: 57.35%


In [11]:
lr_count_uni

[('abq', -5.232931584343739),
 ('nm', -5.085423494676702),
 ('unm', -4.854699152587665),
 ('new mexico', -4.8337045556664275),
 ('lobo', -4.518975078048932),
 ('lobos', -4.488506540816385),
 ('truthdigger', 4.412166277205591),
 ('lottery', -4.364574812142593),
 ('apd', -4.164392767614455),
 ('topes', -4.115008008766362),
 ('aps', -4.11276172181072),
 ('tope', -3.9388203686084706),
 ('aggie', -3.8593462847602185),
 ('nmsu', -3.8003278759889914),
 ('editorial', -3.7917793447341865),
 ('baptists', -3.7879718771974216),
 ('sbc', -3.7369465487718463),
 ('albuquerque', -3.732258927299884),
 ('fe', -3.7307242356079127),
 ('calwatchdog', -3.6985887257982255),
 ('rr', -3.6250714767328827),
 ('cruces', -3.588170579619929),
 ('pnm', -3.5699972619071056),
 ('wp', -3.554707325789891),
 ('abby', -3.5521510042370275),
 ('forex', -3.5249344785421237),
 ('goings', -3.4640105704343522),
 ('lanl', -3.43915501498149),
 ('slumped', 3.4299379400481658),
 ('sandia', -3.3915608068208445),
 ('jumped', 3.363330

In [12]:
# test on own train set just to see
test_on_train = model(lr, count_uni, x_train, y_train, x_train, y_train)

[LibLinear]accuracy: 77.71%


In [14]:
# test on other sample of training set
test_train = pd.read_csv('data/train_df_spacy1.csv').sample(frac=.1, random_state=1)
y_test_train = test_train['label']
x_test_train = test_train['text_lemmas']
test_on_other_train = model(lr, count_uni, x_train, y_train, x_test_train, y_test_train)

[LibLinear]accuracy: 75.49%


# pre-vocab-cut with TFIDF instead of count

In [19]:
lr_tfidf_uni = model(lr, tfidf_uni, x_train, y_train, x_test, y_test)

[LibLinear]accuracy: 58.46%


In [20]:
lr_tfidf_uni

[('globalpost', -15.209367484290894),
 ('advertisement', -13.980293089637476),
 ('—', 11.26303306996885),
 ('albuquerque', -10.963246458902262),
 (' ', 9.344577484072838),
 ('say', -9.152463455789329),
 ('mr.', 6.271199251298934),
 ('mexico', -6.0622505930501696),
 ('nyse', 5.950101456753023),
 ('baptist', -5.74739112840326),
 ('\u200a', 5.693008391410383),
 ('%', 5.684710325103299),
 ('principles', -5.229156455902334),
 ('fe', -5.091643546829893),
 ('/', 4.928437576947869),
 ('blade', -4.918727977089896),
 ('fusion', -4.855452429678226),
 ('ap', -4.8115759118369),
 ('fool', 4.714885364305213),
 ('--', -4.624555091041627),
 ('standards', -4.488040821508517),
 ('motley', 4.454509496770389),
 ('2017', -4.4181035605464976),
 ('…', 4.405356234723576),
 ('california', -4.256611394040326),
 ('fox', 4.209820802144774),
 ('ethics', -4.2093115396258),
 ('reuters', -4.051947493167888),
 ('hillary', 3.9392980580087786),
 ('season', -3.936277075979115),
 ('click', 3.870803994261502),
 ("buy'sell",

# remove tokens overrepresented in the dataset from vocabulary (they are likely publisher-related)

See 4.1 in the paper.

In [15]:
def examine_tokens(df, token):
    num = 0
    for index, row in df.iterrows():
        if token in row['text_lemmas']:
            num += 1
            if num <= 20:
                print(row['text'])
    return num

In [16]:
def token_disparity(x_train, x_test, tokens):
    for token in tokens:
        token = str(token[0])
        train_count = 0
        for train_item in x_train:
            train_count += train_item.count(token)
        test_count = 0
        for test_item in x_test:
            test_count += test_item.count(token)
        train_norm = train_count/len(x_train)
        test_norm = test_count/len(x_test)
        if test_norm == 0:
            val = 'does not appear in test'
            print(token)
        else:
            val = train_norm/test_norm
            if val > 5:
                print(token)

In [18]:
token_disparity(x_train, x_test, lr_count_uni)

abq
new mexico
lobo
lobos
truthdigger
lottery
apd
topes
tope
aggie
nmsu
editorial
baptists
sbc
albuquerque
calwatchdog
cruces
pnm
forex
goings
lanl
slumped
sandia
jumped
carlsbad
farmington
cbf
mojo
gsk
brodner
wipp
taos
chatter
navajo
cnm
bernalillo
rancho
diocese
mulesoft
roswell
bosque
i-25
i-40
paseo
fmr
councilor
earnings
playoff
mudslide
cps
pearce
sandoval
baptist
cueva
shares
therapeutics
cibola
tourney
™
5a
fiesta
popped
mwc
bcso
factcheck
stocks
udall
investors
rapture
exhibition
archdiocese
tariff
scheer
grain
priest
mesa
futures
traders
clovis
skier
j.c.
6a
snapshot
cardinals
aussie
resorts
viking
opener
ministry
pharmaceuticals
slaying
dividend
ltd.
dividends
climbs
clergy
evening’
selloff
p.m.
brands
avalanche
motors
jaguar
armour
lifts
plc
corp.
fitbit
procter
inc.
sciences
nvidia
401(k
ftse
recalls
embezzlement
volleyball
quarterly
hsbc
chipotle
pretribulation
midday’
5’
pfizer
volume
valeant
nokia
amazon.com
nasdaq
pct
biotech
highs
gopro
gallery
aramco
cash’
pandora
l

KeyboardInterrupt: 

In [25]:
token_disparity(x_train, x_test, mapping)

thomson
councilor
jackpot
romero
seminary
slideshow
lobos
navajo
lobo
touchdown
rebound
innings
scoring
gardner
playoff
composite
investing
quarterback
sophomore
coach
millions
championship
powerball
qualcomm
tournament
gadget
tesla
ncaa
wynn
payout
midday
retailer
chipotle
alphabet
hockey
alamos
teammate
holdings
halftime


# upgraded vectorization - own counter

In [20]:
from collections import Counter
import re

In [21]:
def dummy_two(doc):
    # use as param in vectorizers
    return doc

In [22]:
def countvocab(text_lemmas):
    
    vocab = Counter()
    word = re.compile('[a-z]*\Z')
    
    except_file = open('token_exceptions.txt')
    exceptions = except_file.read().split('\n')
    except_file.close()
    stopwords_file = open('sklearn_stopwords.txt')
    stopwords = stopwords_file.read().split('\n')
    stopwords_file.close()
    
    for text in text_lemmas:
        for token in ast.literal_eval(text):
            if word.match(token) == None:
                continue
            if token in exceptions:
                continue
            if token in stopwords:
                continue
            vocab[token] += 1
    
    return [couple[0] for couple in vocab.most_common(4000)]

In [23]:
vocab = countvocab(x_train)

In [24]:
vocab_file = open('vocab.txt', 'w')
for word in vocab:
    vocab_file.write(word+'\n')
vocab_file.close()

In [25]:
count_uni = CountVectorizer(vocabulary = vocab,
                    analyzer='word',
                    tokenizer=dummy_two,
                    preprocessor=None,
                    lowercase = False)

In [26]:
train_fvs = count_uni.transform(x_train)

In [27]:
test_fvs = count_uni.transform(x_test)

In [28]:
def count_fvs(text):
    
    text = ast.literal_eval(text)
    
    vector = [0]*len(vocab)
    
    for idx in range(len(vocab)):
        vector[idx] = text.count(vocab[idx])
    
    return vector

In [29]:
train_fvs = x_train.apply(count_fvs)
test_fvs = x_test.apply(count_fvs)

In [7]:
# Unigram logistic regression with updated vocabulary

model = lr.fit(train_fvs, y_train)
y_pred = model.predict(test_fvs)
    
acc = sk.metrics.accuracy_score(y_test, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))
    
mapping = list(zip(vocab, model.coef_[0]))
mapping.sort(key=lambda x: abs(x[1]))
mapping.reverse()

In [None]:
mapping

In [99]:
pickle.dump(model, open('hyperpartisan-model.sav', 'wb'))

# Evaluation

In [8]:
# Read datasets, read hyperpartisan model, give percentages.