In [2]:
import pandas as pd
import nltk
import spacy
from glob import glob
import sklearn as sk
import ast
import csv
import re
from collections import Counter
from collections import defaultdict

csv.field_size_limit(100000000)

100000000

# Feature Extraction

These are the functions used to extract the linguistic features from the spaCy tags previously created.

There are also functions used to extract counts of words from specific lexica that have been referenced.

In [3]:
# POS tag counts

def count_adj(pos):
    # takes unevaluated list of POS tags for a single text
    pos = ast.literal_eval(pos)
    if len(pos) == 0:
        return 0.0
    return pos.count('ADJ')/len(pos)

def count_adv(pos):
    # takes unevaluated list of POS tags for a single text
    pos = ast.literal_eval(pos)
    if len(pos) == 0:
        return 0.0
    return pos.count('ADV')/len(pos)

def count_pron(pos):
    # takes unevaluated list of POS tags for a single text
    pos = ast.literal_eval(pos)
    if len(pos) == 0:
        return 0.0
    return pos.count('PRON')/len(pos)

def count_noun(pos):
    # takes unevaluated list of POS tags for a single text
    pos = ast.literal_eval(pos)
    if len(pos) == 0:
        return 0.0
    return pos.count('NOUN')/len(pos)

def count_propn(pos):
    # takes unevaluated list of POS tags for a single text
    pos = ast.literal_eval(pos)
    if len(pos) == 0:
        return 0.0
    return pos.count('PROPN')/len(pos)

In [4]:
# MPQA lexicons: intensifiers, subjectivity clues (strong, weak)

with open('lexica/intensifiers.tff') as f:
    intensifiers = [line[29:-15].split(' ')[0] for line in f.readlines()]
    
with open('lexica/subjclueslen1-HLTEMNLP05.tff') as f:
    clues = [(line[5:].split(' ')[0], line[5:].split(' ')[2][6:]) for line in f.readlines()]
    strongsubj = []
    weaksubj = []
    for strength, word in clues:
        if strength == 'weaksubj':
            weaksubj.append(word)
        else:
            strongsubj.append(word)
            
def count_strongsubj(lemmas):
    lemmas = ast.literal_eval(lemmas)
    if len(lemmas) == 0:
        return 0.0
    clues = 0
    for word in strongsubj:
        if word in lemmas:
            clues += 1
    return clues/len(set(lemmas))

def count_weaksubj(lemmas):
    lemmas = ast.literal_eval(lemmas)
    if len(lemmas) == 0:
        return 0.0
    clues = 0
    for word in weaksubj:
        if word in lemmas:
            clues += 1
    return clues/len(set(lemmas))

In [5]:
# count passives, modality

def count_aux(dep):
    dep = ast.literal_eval(dep)
    if len(dep) == 0:
        return 0.0
    return (dep.count('aux') + dep.count('auxpass')) / len(dep)

def count_pass(dep):
    dep = ast.literal_eval(dep)
    if len(dep) == 0:
        return 0.0
    return (dep.count('nsubjpass') + dep.count('csubjpass')) / len(dep)

In [6]:
# readability features

def avg_sent_length(text):
    if type(text) != str:
        return 0.0
    words = nltk.word_tokenize(text)
    sents = nltk.sent_tokenize(text)
    if len(sents) == 0:
        return 0.0
    return len(words)/len(sents)

def avg_word_length(text):
    if type(text) != str:
        return 0.0
    words = nltk.word_tokenize(text)
    if len(words) == 0:
        return 0.0
    return len(text)/len(words)

def exclamation_marks(text):
    if type(text) != str:
        return 0.0
    if len(text) == 0:
        return 0.0
    return text.count('!')/len(text)

def question_marks(text):
    if type(text) != str:
        return 0.0
    if len(text) == 0:
        return 0.0
    return text.count('?')/len(text)

def multiple_punct(text):
    if type(text) != str:
        return 0.0
    if len(text) == 0:
        return 0.0
    count = 0
    count += text.count('!!')
    count += text.count('??')
    count += text.count('?!')
    count += text.count('!?')
    return count/len(text)

In [7]:
# EmoLex

emotions = defaultdict(lambda: [])

with open('lexica/NRC-Sentiment-Emotion-Lexicons/NRC-Affect-Intensity-Lexicon/NRC-AffectIntensity-Lexicon.txt') as f:
    pairs = [(line[:-1].split('\t')[0], line[:-1].split('\t')[2]) for line in f.readlines()[1:]]
    for word, emotion in pairs:
        emotions[emotion].append(word)

def count_anger(lemmas):
    lemmas = ast.literal_eval(lemmas)
    if len(lemmas) == 0:
        return 0.0
    clues = 0
    for word in emotions['anger']:
        if word in lemmas:
            clues += 1
    return clues/len(set(lemmas))

def count_fear(lemmas):
    lemmas = ast.literal_eval(lemmas)
    if len(lemmas) == 0:
        return 0.0
    clues = 0
    for word in emotions['fear']:
        if word in lemmas:
            clues += 1
    return clues/len(set(lemmas))

def count_sadness(lemmas):
    lemmas = ast.literal_eval(lemmas)
    if len(lemmas) == 0:
        return 0.0
    clues = 0
    for word in emotions['sadness']:
        if word in lemmas:
            clues += 1
    return clues/len(set(lemmas))

def count_joy(lemmas):
    lemmas = ast.literal_eval(lemmas)
    if len(lemmas) == 0:
        return 0.0
    clues = 0
    for word in emotions['joy']:
        if word in lemmas:
            clues += 1
    return clues/len(set(lemmas))

# Hapax Legomena

In [97]:
def countvocab(df):
    
    vocab = Counter()
    word = re.compile('[a-z]*\Z')
    
    #except_file = open('token_exceptions.txt')
    #exceptions = except_file.read().split('\n')
    #except_file.close()
    #stopwords_file = open('sklearn_stopwords.txt')
    #stopwords = stopwords_file.read().split('\n')
    #stopwords_file.close()
    
    for text in df['text_lemmas']:
        for token in ast.literal_eval(text):
            if word.match(token) == None:
                continue
            #if token in exceptions:
            #    continue
            #if token in stopwords:
            #    continue
            vocab[token] += 1
            
    hapax_legomena = [word for word in vocab if vocab[word]==1]
            
    true = 0
    true_total = 0
    false = 0
    false_total = 0
    
    for row, item in df.iterrows():
        if item['label'] == True:
            true_total += 1
        else:
            false_total += 1
            
        if len(list(set(hapax_legomena).intersection(ast.literal_eval(item['text_lemmas'])))) != 0:
            if item['label'] == True:
                true += 1
            else:
                false += 1
    
    print('Proportion of biased articles with hapax legomena: ' + str(float(true) / true_total))
    print('Proportion of unbiased articles with hapax legomena: ' + str(float(false) / false_total))
    
    return hapax_legomena

In [98]:
countvocab(train_df)

Proportion of biased articles with hapax legomena: 0.4539581043727312
Proportion of unbiased articles with hapax legomena: 0.3920528475628065


['canadean',
 'lbix',
 'happywater',
 'lapic',
 'waterbox',
 'undermotivated',
 'condesa',
 'mayekiso',
 'tiwi',
 'redesignate',
 'tudeh',
 'nonpeak',
 'prologistix',
 'dibella',
 'kassicieh',
 'aisles',
 'locater',
 'overprocess',
 'hilar',
 'kittens',
 'dcfc',
 'blung',
 'dyrice',
 'vanishedwill',
 'minkowing',
 'hoedown',
 'manian',
 'lanni',
 'gooden',
 'gorse',
 'senara',
 'poldark',
 'smit',
 'shoeprint',
 'porthcurno',
 'harborfront',
 'godrevy',
 'bodelva',
 'pentewan',
 'overgrowth',
 'marazion',
 'bossiney',
 'barnoon',
 'riseat',
 'contenton',
 'tocover',
 'assetsof',
 'leadthe',
 'onfacebook',
 'gamoloco',
 'obstat',
 'montreat',
 'churchworks',
 'youthministry',
 'chandon',
 'sweatily',
 'bumfordshire',
 'visitingdisney',
 'parentcomcast',
 'activization',
 'reasonability',
 'nivida',
 'sinawhen',
 'ethelr',
 'svein',
 'forkbeard',
 'dansk',
 'overbr',
 'inutterably',
 'soberminded',
 'sunnylands',
 'harping',
 'destructur',
 'pagnoulle',
 'significantincrease',
 'medicine

# Testing

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier

In [9]:
feature_cols = ['ADJ_count', 'ADV_count', 'PRON_count', 'NOUN_count', 'PROPN_count', 
                'strongsubj_count', 'weaksubj_count', 'aux_count', 'pass_count',
               'sent_length', 'word_length', 'excl_count', 'quest_count', 'extra_punct_count',
               'anger_count', 'fear_count', 'sadness_count', 'joy_count']

## Small article dataset

In [10]:
df = pd.read_csv('byarticle_df_spacy.csv', encoding='utf8', engine='python')

In [37]:
# balance classes in byarticle dataset (238, 242 -> 480)
biased = df.loc[df.label == True]
unbiased = df.loc[df.label == False]
df = pd.concat([biased, unbiased.sample(n=242).reset_index(drop=True)])

In [38]:
feature_df = pd.DataFrame()
feature_df['label'] = df.label

In [39]:
feature_df['ADJ_count'] = df['text_pos'].apply(count_adj)
feature_df['ADV_count'] = df['text_pos'].apply(count_adv)
feature_df['PRON_count'] = df['text_pos'].apply(count_pron)
feature_df['NOUN_count'] = df['text_pos'].apply(count_noun)
feature_df['PROPN_count'] = df['text_pos'].apply(count_propn)

feature_df['strongsubj_count'] = df['text_lemmas'].apply(count_strongsubj)
feature_df['weaksubj_count'] = df['text_lemmas'].apply(count_weaksubj)

feature_df['aux_count'] = df['text_deps'].apply(count_aux)
feature_df['pass_count'] = df['text_deps'].apply(count_pass)

feature_df['sent_length'] = df['text'].apply(avg_sent_length)
feature_df['word_length'] = df['text'].apply(avg_word_length)
feature_df['excl_count'] = df['text'].apply(exclamation_marks)
feature_df['quest_count'] = df['text'].apply(question_marks)
feature_df['extra_punct_count'] = df['text'].apply(multiple_punct)

feature_df['anger_count'] = df['text_lemmas'].apply(count_anger)
feature_df['fear_count'] = df['text_lemmas'].apply(count_fear)
feature_df['sadness_count'] = df['text_lemmas'].apply(count_sadness)
feature_df['joy_count'] = df['text_lemmas'].apply(count_joy)

In [40]:
feature_df.head()

Unnamed: 0,label,ADJ_count,ADV_count,PRON_count,NOUN_count,PROPN_count,strongsubj_count,weaksubj_count,aux_count,pass_count,sent_length,word_length,excl_count,quest_count,extra_punct_count,anger_count,fear_count,sadness_count,joy_count
0,True,0.081754,0.040877,0.021327,0.199052,0.05628,0.085561,0.187166,0.057464,0.005924,27.442623,5.193548,0.0,0.000115,0.0,0.02139,0.048128,0.034759,0.053476
1,True,0.104247,0.088803,0.015444,0.185328,0.084942,0.129496,0.172662,0.046332,0.0,24.0,5.299242,0.0,0.0,0.0,0.05036,0.079137,0.071942,0.05036
2,True,0.063492,0.063492,0.021164,0.201058,0.095238,0.116505,0.116505,0.047619,0.010582,17.727273,5.041026,0.0,0.003052,0.0,0.038835,0.058252,0.058252,0.038835
3,True,0.110145,0.06087,0.014493,0.194203,0.063768,0.185185,0.197531,0.034783,0.005797,14.666667,4.965909,0.000572,0.000572,0.0,0.055556,0.049383,0.04321,0.055556
5,True,0.101499,0.075993,0.036813,0.197213,0.023928,0.113235,0.216176,0.043913,0.008151,30.520325,4.9374,0.0,0.00027,0.0,0.013235,0.017647,0.019118,0.045588


In [41]:
# you want all rows, and the feature_cols' columns
X = feature_df.loc[:, feature_cols]
X.shape

(480, 18)

In [42]:
y = feature_df.label
y.shape

(480,)

from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)

In [43]:
lr_small = RidgeClassifier()

In [44]:
lr_small.fit(X, y)
y_pred = lr_small.predict(X)
acc = sk.metrics.accuracy_score(y, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 72.92%


In [50]:
y_pred = lr_small.predict(X_train)
acc = sk.metrics.accuracy_score(y_train, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 61.57%


In [51]:
y_pred = lr_small.predict(X_test)
acc = sk.metrics.accuracy_score(y_test, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 53.67%


In [61]:
mapping = list(zip(feature_cols, lr_small.coef_[0]))
mapping

[('ADJ_count', 1.5973991360031634),
 ('ADV_count', 1.3348986491919477),
 ('PRON_count', 0.6973130886604662),
 ('NOUN_count', -1.23016321062661),
 ('PROPN_count', -1.1620468124807801),
 ('strongsubj_count', 2.978597137992384),
 ('weaksubj_count', 2.645595357455885),
 ('aux_count', -0.09251754547012407),
 ('pass_count', -0.27660180128839884),
 ('sent_length', -0.007577823158137686),
 ('word_length', 0.1117325295379342),
 ('excl_count', 0.00815072501107081),
 ('quest_count', 0.04067398305819997),
 ('extra_punct_count', -0.003773850110652416),
 ('anger_count', 0.6622751920714581),
 ('fear_count', 0.5013505249165046),
 ('sadness_count', 0.2743968475857176),
 ('joy_count', 0.2566404470378008)]

## Complete dataset

In [45]:
train_files = glob('data/train_df_spacy*.csv')

test_df = pd.read_csv('data/test_df_spacy.csv').sample(frac=.01, random_state=123)

frames = []
for file in train_files:
    frames.append(pd.read_csv(file).sample(frac=.01, random_state=123))
train_df = pd.concat(frames)

In [46]:
train_feature_df = pd.DataFrame()
train_feature_df['label'] = train_df.label
test_feature_df = pd.DataFrame()
test_feature_df['label'] = test_df.label

In [47]:
train_feature_df['ADJ_count'] = train_df['text_pos'].apply(count_adj)
train_feature_df['ADV_count'] = train_df['text_pos'].apply(count_adv)
train_feature_df['PRON_count'] = train_df['text_pos'].apply(count_pron)
train_feature_df['NOUN_count'] = train_df['text_pos'].apply(count_noun)
train_feature_df['PROPN_count'] = train_df['text_pos'].apply(count_propn)

train_feature_df['strongsubj_count'] = train_df['text_lemmas'].apply(count_strongsubj)
train_feature_df['weaksubj_count'] = train_df['text_lemmas'].apply(count_weaksubj)

train_feature_df['aux_count'] = train_df['text_deps'].apply(count_aux)
train_feature_df['pass_count'] = train_df['text_deps'].apply(count_pass)

train_feature_df['sent_length'] = train_df['text'].apply(avg_sent_length)
train_feature_df['word_length'] = train_df['text'].apply(avg_word_length)
train_feature_df['excl_count'] = train_df['text'].apply(exclamation_marks)
train_feature_df['quest_count'] = train_df['text'].apply(question_marks)
train_feature_df['extra_punct_count'] = train_df['text'].apply(multiple_punct)

train_feature_df['anger_count'] = train_df['text_lemmas'].apply(count_anger)
train_feature_df['fear_count'] = train_df['text_lemmas'].apply(count_fear)
train_feature_df['sadness_count'] = train_df['text_lemmas'].apply(count_sadness)
train_feature_df['joy_count'] = train_df['text_lemmas'].apply(count_joy)

test_feature_df['ADJ_count'] = test_df['text_pos'].apply(count_adj)
test_feature_df['ADV_count'] = test_df['text_pos'].apply(count_adv)
test_feature_df['PRON_count'] = test_df['text_pos'].apply(count_pron)
test_feature_df['NOUN_count'] = test_df['text_pos'].apply(count_noun)
test_feature_df['PROPN_count'] = test_df['text_pos'].apply(count_propn)

test_feature_df['strongsubj_count'] = test_df['text_lemmas'].apply(count_strongsubj)
test_feature_df['weaksubj_count'] = test_df['text_lemmas'].apply(count_weaksubj)

test_feature_df['aux_count'] = test_df['text_deps'].apply(count_aux)
test_feature_df['pass_count'] = test_df['text_deps'].apply(count_pass)

test_feature_df['sent_length'] = test_df['text'].apply(avg_sent_length)
test_feature_df['word_length'] = test_df['text'].apply(avg_word_length)
test_feature_df['excl_count'] = test_df['text'].apply(exclamation_marks)
test_feature_df['quest_count'] = test_df['text'].apply(question_marks)
test_feature_df['extra_punct_count'] = test_df['text'].apply(multiple_punct)

test_feature_df['anger_count'] = test_df['text_lemmas'].apply(count_anger)
test_feature_df['fear_count'] = test_df['text_lemmas'].apply(count_fear)
test_feature_df['sadness_count'] = test_df['text_lemmas'].apply(count_sadness)
test_feature_df['joy_count'] = test_df['text_lemmas'].apply(count_joy)

In [48]:
train_feature_df.head()

Unnamed: 0,label,ADJ_count,ADV_count,PRON_count,NOUN_count,PROPN_count,strongsubj_count,weaksubj_count,aux_count,pass_count,sent_length,word_length,excl_count,quest_count,extra_punct_count,anger_count,fear_count,sadness_count,joy_count
42083,True,0.079596,0.057175,0.03139,0.202915,0.085202,0.087819,0.172805,0.045964,0.007848,28.419355,5.140749,0.0,0.0,0.0,0.016997,0.031161,0.014164,0.042493
71825,True,0.115226,0.062757,0.049383,0.197531,0.012346,0.151261,0.210084,0.061728,0.012346,22.511628,5.076446,0.0,0.001221,0.0,0.02521,0.028011,0.030812,0.081232
99535,False,0.04898,0.065306,0.028571,0.216327,0.093878,0.073529,0.191176,0.040816,0.004082,27.111111,5.114754,0.0,0.0,0.0,0.051471,0.066176,0.051471,0.036765
47879,True,0.08871,0.024194,0.018433,0.185484,0.130184,0.114613,0.197708,0.048387,0.012673,47.722222,5.484284,0.0,0.0,0.0,0.04298,0.068768,0.025788,0.048711
36734,False,0.082324,0.053269,0.050847,0.152542,0.099274,0.1,0.152632,0.05569,0.012107,25.6875,4.909976,0.0,0.0,0.0,0.036842,0.052632,0.031579,0.078947


In [49]:
X_train = train_feature_df.loc[:, feature_cols]
X_test = test_feature_df.loc[:, feature_cols]
y_train = train_feature_df.label
y_test = test_feature_df.label

In [55]:
lr = RidgeClassifier()

In [56]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
acc = sk.metrics.accuracy_score(y_test, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 53.13%


In [57]:
y_pred = lr.predict(X_train)
acc = sk.metrics.accuracy_score(y_train, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 66.27%


In [58]:
y_pred = lr.predict(X)
acc = sk.metrics.accuracy_score(y, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 69.38%


In [59]:
mapping = list(zip(feature_cols, lr.coef_[0]))
mapping

[('ADJ_count', 2.037294218853471),
 ('ADV_count', 4.756361079894811),
 ('PRON_count', -2.866549812845596),
 ('NOUN_count', -0.32467116357211917),
 ('PROPN_count', -1.0361594868163944),
 ('strongsubj_count', 3.280423701948726),
 ('weaksubj_count', 2.852551020529775),
 ('aux_count', -2.1035910075925166),
 ('pass_count', -2.8342403779351155),
 ('sent_length', -0.00049036613253624),
 ('word_length', -0.00886369926566209),
 ('excl_count', 0.2644870115728918),
 ('quest_count', 0.4030525881400419),
 ('extra_punct_count', 0.04551642843694889),
 ('anger_count', 0.06463109944606628),
 ('fear_count', -2.09121604597752),
 ('sadness_count', -0.9375010537154642),
 ('joy_count', -4.387512056020496)]

In [60]:
from sklearn.model_selection import cross_val_score

In [103]:
X_total = pd.concat([X_train.sample(frac=.25, random_state=123), X_test])
y_total = [0]*len(X_test) + [1]*len(X_test)

In [105]:
scores = cross_val_score(lr, X_total, y_total, cv=15)

In [106]:
scores

array([0.655, 0.605, 0.63 , 0.67 , 0.655, 0.62 , 0.6  , 0.63 , 0.63 ,
       0.585, 0.66 , 0.655, 0.605, 0.64 , 0.61 ])

In [107]:
sum(scores)/len(scores)

0.63

In [84]:
lr.fit(X_test, y_test)
y_pred = lr.predict(X_train)
acc = sk.metrics.accuracy_score(y_train, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 53.57%


In [85]:
y_pred = lr.predict(X_test)
acc = sk.metrics.accuracy_score(y_test, y_pred)
print("accuracy: {0:.2f}%".format(acc*100))

accuracy: 58.33%
