### SemEval2019 Hyperpartisan News Detection
#### Using GloVe as document representation

In [1]:
from lxml.etree import iterparse
import xml

import os
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

from utils import *

In [2]:
def readFiles(textFile, labelFile, max_words=1000):
    '''
    Read the articles and their ground-truth labels from the two XML files.

    textFile:  path of the XML file holding the <article> elements
    labelFile: path of the XML file holding the ground truth
    max_words: number of whitespace-separated tokens kept from each article
               body after cleaning (default 1000, the original behaviour)

    Returns (X, y) as numpy arrays. Every entry of X is
    "<cleaned title>. <cleaned, truncated text>". y is whatever
    GroundTruthHandler collected into the list passed to it
    (presumably one label per article, in file order -- TODO confirm in utils).
    '''
    # `import xml` alone does not load the `xml.sax` submodule; import it
    # explicitly so this function does not depend on another module having
    # imported it as a side effect.
    import xml.sax

    X, y = [], []

    # Use a separate name for the handle so the `labelFile` argument is not rebound.
    with open(labelFile) as label_handle:
        xml.sax.parse(label_handle, GroundTruthHandler(y))

    # iterparse yields elements as they are completed, so the article file
    # is processed as a stream.
    for event, elem in iterparse(textFile):
        if elem.tag == "article":
            title = cleanQuotations(elem.attrib['title'])
            text = cleanQuotations("".join(elem.itertext()))
            text = cleanText(fixup(text))
            # keep only the first `max_words` tokens of the body
            text = ' '.join(text.split()[:max_words])
            X.append(title + ". " + text)
            # drop the element's content once it has been consumed
            elem.clear()

    return np.asarray(X), np.asarray(y)

In [4]:
def read_glove(path, dim):
    '''
    Read the GloVe vectors of dimension `dim` stored under `path`.

    path: directory containing 'glove.6B.<dim>d.txt' (with or without a
          trailing separator)
    dim:  vector dimension used to build the file name

    Returns a dict mapping every word in the file to its vector
    (a 1-D numpy array).
    '''
    glove_file = os.path.join(path, 'glove.6B.' + str(dim) + 'd.txt')
    # quoting=3 (QUOTE_NONE): quote characters are ordinary tokens in the file.
    # keep_default_na=False: otherwise vocabulary words such as "null", "nan"
    # or "NA" are parsed as missing values and lost as dictionary keys.
    df = pd.read_csv(glove_file, sep=" ", quoting=3, header=None, index_col=0,
                     keep_default_na=False)
    # Pair each index entry with its row directly instead of transposing the
    # whole frame.
    return dict(zip(df.index, df.to_numpy()))

In [3]:
# locations of the dataset and of the pretrained word vectors
dataPath = 'data/'
pretrained_wv_path = "pretrained_wv/"

textFile = os.path.join(dataPath, 'articles-training-byarticle.xml')
labelFile = os.path.join(dataPath, "ground-truth-training-byarticle.xml")

# load the articles with their labels, then the 300-dimensional GloVe vectors
texts, labels = readFiles(textFile, labelFile)
glove = read_glove(pretrained_wv_path, 300)

# split the samples with the same seed to compare results with other methods
id1, id2 = fixedTestSplit(labels)

In [5]:
def gloveVectorize(glove, text):
    '''
    Represent every document as the average of the GloVe vectors of its tokens.

    Each document is tokenized with customTokenize(..., rm_stopwords=True);
    tokens that have no entry in `glove` are ignored.

    glove: dict mapping word -> 1-D vector (all vectors of the same length)
    text:  sequence of document strings

    Returns an array of shape (len(text), dim). A document without any
    in-vocabulary token gets an all-zero row instead of NaN.
    Raises ValueError if `glove` is empty.
    '''
    if not glove:
        raise ValueError("glove must contain at least one word vector")
    # Take the dimension from any vector rather than relying on a specific word.
    dim = len(next(iter(glove.values())))

    X = np.zeros((len(text), dim))
    for text_id, t in enumerate(text):
        # tokenize, remove stopwords and keep only words known to glove
        words = [w for w in customTokenize(t, rm_stopwords=True) if w in glove]
        if not words:
            # nothing to average: keep the zero row, avoid a 0/0 division
            continue
        total = np.zeros(dim)
        for word in words:
            total += glove[word]
        X[text_id, :] = total / len(words)
    return X

In [7]:
# averaged-GloVe representation, one row per article in `texts`
glove_texts = gloveVectorize(glove, texts)
# id1 / id2 are the selectors returned by fixedTestSplit; id1 picks the
# training rows and id2 the test rows
train_x = glove_texts[id1]
test_x = glove_texts[id2]

In [8]:
# 10-fold cross-validated accuracy of a standardised SVC for several values of C
C = [0.5, 0.6, 0.7, 0.9,1,1.1, 1.2, 5,10]
for c in C:
    kernel_svm = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter=1000)),
    ])
    cv_scores = cross_val_score(kernel_svm, train_x, labels[id1], cv=10)
    print("[KernelSVM] C=%f | acc=%f" % (c, cv_scores.mean()))

[KernelSVM] C=0.500000 | acc=0.764067
[KernelSVM] C=0.600000 | acc=0.776472
[KernelSVM] C=0.700000 | acc=0.776567
[KernelSVM] C=0.900000 | acc=0.779793
[KernelSVM] C=1.000000 | acc=0.773442
[KernelSVM] C=1.100000 | acc=0.776472
[KernelSVM] C=1.200000 | acc=0.776472
[KernelSVM] C=5.000000 | acc=0.751176
[KernelSVM] C=10.000000 | acc=0.751075


In [156]:
# 10-fold cross-validated accuracy of logistic regression for several values of C
C = [0.05, 0.1, 0.5, 0.8, 0.9, 1, 2, 3, 5]
for c in C:
    lr = LogisticRegression(solver='lbfgs', C=c, max_iter=1000)
    cv_scores = cross_val_score(lr, train_x, labels[id1], cv=10)
    print("[LogisticR] C=%f | acc=%f" % (c, cv_scores.mean()))

[LogisticR] C=0.050000 | acc=0.645760
[LogisticR] C=0.100000 | acc=0.693038
[LogisticR] C=0.500000 | acc=0.748714
[LogisticR] C=0.800000 | acc=0.757805
[LogisticR] C=0.900000 | acc=0.764061
[LogisticR] C=1.000000 | acc=0.764061
[LogisticR] C=2.000000 | acc=0.751750
[LogisticR] C=3.000000 | acc=0.748525
[LogisticR] C=5.000000 | acc=0.745400


In [9]:
# final classifier: standardised SVC with C=0.9
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svc", SVC(C=0.9, gamma="auto", max_iter=5000)),
])
# alternative that was tried:
# model = LogisticRegression(solver = 'lbfgs', C = 1, max_iter=1000)

# labels of the two splits
train_y = labels[id1]
test_y = labels[id2]

# fit on the training split only, then predict both splits
model.fit(train_x, train_y)
trn_pred = model.predict(train_x)
tst_pred = model.predict(test_x)

print('Train accuracy: ', accuracy_score(train_y, trn_pred))
print('Test accuracy: ', accuracy_score(test_y, tst_pred))
# precision and recall are reported for the class labelled 'true'
print('Test precision: ', precision_score(test_y, tst_pred, pos_label='true'))
print('Test recall: ', recall_score(test_y, tst_pred, pos_label='true'))
confusion_matrix(test_y, tst_pred)

Train accuracy:  0.9130434782608695
Test accuracy:  0.7956656346749226
Test precision:  0.7676767676767676
Test recall:  0.6386554621848739


array([[181,  23],
       [ 43,  76]], dtype=int64)

In [15]:
# fit the model to all samples
model.fit(glove_texts, labels)

# make sure the output directories exist so a fresh checkout can run this cell
os.makedirs('trained_clsf', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

# save the model; the context manager closes the file handle after writing
with open('trained_clsf/svm_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

# save the predictions (tst_pred was computed before the refit above)
np.save("predictions/glove_svm_pred", tst_pred)