### SemEval2019 Hyperpartisan News Detection
#### Make a baseline system that uses only the test set

There are only 645 samples, of which half would be used for training, and half for testing

What can we try with ~300 samples?
1. feature dimension shouldn't really go beyond 300
    - need a dense and compact representation (word2vec/GloVe)
2. complexity of classifier shouldn't be too high
    - linear SVM
    - LR
    - ensemble
3. pre-trained LM

In [11]:
import os
import pickle
import xml.etree.ElementTree as etree
import xml.sax

import nltk
import numpy as np
import pandas as pd
from lxml.etree import iterparse

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score


from utils import *

In [37]:
class GroundTruthHandler(xml.sax.ContentHandler):
    """SAX handler that collects the ``hyperpartisan`` attribute of every <article>.

    Labels are appended to the caller-supplied list in document order, so the
    caller owns the result and can read it after parsing finishes.
    """

    def __init__(self, gt):
        """Store the output list.

        Args:
            gt: list that receives one ``hyperpartisan`` attribute value
                (as the raw string found in the XML) per <article> element.
        """
        super().__init__()
        self.gt = gt

    def startElement(self, name, attrs):
        """Record the label of each <article> element; ignore every other tag.

        Raises:
            KeyError: if an <article> element has no ``hyperpartisan`` attribute.
        """
        if name == "article":
            # Only the label is needed; the article id was previously read
            # into an unused local and has been dropped.
            self.gt.append(attrs.getValue("hyperpartisan"))
            
def readFiles(textFile, labelFile):
    """Load article texts and their hyperpartisan labels from two XML files.

    Args:
        textFile: path to the articles XML; every <article> element must carry
            a ``title`` attribute.
        labelFile: path to the ground-truth XML parsed by ``GroundTruthHandler``.

    Returns:
        (X, y): two numpy arrays. ``X[i]`` is "<title>. <text>" with the text
        truncated to its first 1000 whitespace-separated tokens; ``y[i]`` is
        the raw ``hyperpartisan`` attribute string.

    Raises:
        ValueError: if the number of articles and the number of labels differ.

    NOTE(review): texts and labels are paired purely by position, so this
    assumes both files list the articles in the same order -- confirm.
    """
    X, y = [], []

    # Use a distinct name for the handle so the path parameter is not shadowed.
    with open(labelFile) as labelHandle:
        xml.sax.parse(labelHandle, GroundTruthHandler(y))

    for event, elem in iterparse(textFile):
        if elem.tag == "article":
            title = elem.attrib['title']
            text = "".join(elem.itertext())
            # cleanQuotations / fixup / cleanText are project helpers from
            # `utils`; presumably they normalise the raw text -- see utils.
            title = cleanQuotations(title)
            text = cleanQuotations(text)
            text = cleanText(fixup(text))
            # Keep only the first 1000 whitespace-separated tokens of the body.
            text = ' '.join(text.split()[:1000])
            X.append(title + ". " + text)
            # Release the parsed element's content once it has been consumed.
            elem.clear()

    # Positional pairing is only meaningful when both files have the same
    # number of articles; fail loudly instead of silently misaligning X and y.
    if len(X) != len(y):
        raise ValueError(
            "article/label count mismatch: %d texts vs %d labels" % (len(X), len(y))
        )

    return np.asarray(X), np.asarray(y)

In [38]:
# Read the labels and texts of the by-article training set.
# Paths are relative to the notebook's working directory.
# `texts` and `labels` are the two arrays returned by readFiles and are
# reused by the cells below.
textFile = '../data/articles-training-byarticle.xml'
labelFile = "../data/ground-truth-training-byarticle.xml"
texts, labels = readFiles(textFile, labelFile)

In [39]:
# fixedTestSplit is a project helper from `utils`. Below, id1 selects the
# training rows and id2 the held-out test rows.
# NOTE(review): presumably the split is deterministic ("fixed") -- confirm in utils.
id1, id2 = fixedTestSplit(labels)

In [40]:
def gloveVectorize(glove, text, tokenize=None):
    """Embed each document as the mean of its in-vocabulary word vectors.

    Args:
        glove: mapping from word to its embedding vector. It must contain the
            key "the", which is used to infer the embedding dimension.
        text: sequence of documents (strings).
        tokenize: optional callable turning a document into a list of tokens.
            Defaults to the project's ``customTokenize``.

    Returns:
        numpy array of shape ``(len(text), dim)``. A document with no
        in-vocabulary token keeps an all-zero row.
    """
    if tokenize is None:
        tokenize = customTokenize

    # Infer the dimension from the embeddings instead of hard-coding 300, so
    # the function works with any embedding size.
    dim = len(glove["the"])
    X = np.zeros((len(text), dim))
    for text_id, t in enumerate(text):
        known = [w for w in tokenize(t) if w in glove]
        if not known:
            # Nothing to average; leave the row at zero (avoids dividing by 0).
            continue
        total = np.zeros(dim)
        for word in known:
            total += glove[word]
        # Divide once after summing rather than rewriting the row per word.
        X[text_id, :] = total / len(known)
    return X

In [41]:
# Embed every article with GloVe vectors. read_glove is a project helper from
# `utils`; presumably its argument is the embedding dimension -- confirm.
glove_texts = gloveVectorize(read_glove(300), texts)
# Display the (n_documents, embedding_dim) shape as a sanity check.
glove_texts.shape

(645, 300)

In [43]:
# Select the feature rows for each side of the split: id1 -> train, id2 -> test.
train_x = glove_texts[id1]
test_x = glove_texts[id2]

#### Model1: SVM

In [44]:
# Grid over regularisation strength and penalty type for a linear SVM on
# standardised features; report the mean 10-fold CV accuracy on the train split.
# NOTE(review): the recorded output below lists C values (0.005 ... 1.0) that
# do not match this grid -- re-run the cell to refresh it.
C = [0.01, 0.1, 1, 10]
for c in C:
    for p in ('l1', 'l2'):
        linear_svc = LinearSVC(C=c, penalty=p, dual=False, tol=1e-4, max_iter=10000)
        svm = Pipeline([("scaler", StandardScaler()), ("svc", linear_svc)])
        fold_acc = cross_val_score(svm, train_x, labels[id1], cv=10)
        print("[Linear] C=%f P=%s | acc=%f" % (c, p, np.mean(fold_acc)))

[Linear] C=0.005000 P=l1 | acc=0.645659
[Linear] C=0.005000 P=l2 | acc=0.683171
[Linear] C=0.010000 P=l1 | acc=0.736504
[Linear] C=0.010000 P=l2 | acc=0.683071
[Linear] C=0.100000 P=l1 | acc=0.711309
[Linear] C=0.100000 P=l2 | acc=0.645653
[Linear] C=0.500000 P=l1 | acc=0.670280
[Linear] C=0.500000 P=l2 | acc=0.642528
[Linear] C=1.000000 P=l1 | acc=0.651726
[Linear] C=1.000000 P=l2 | acc=0.642528


In [50]:
# Sweep C for the default-kernel SVC on standardised features and report the
# mean 10-fold CV accuracy on the train split.
C = [0.7, 0.8, 0.9, 1, 1.1, 2, 3]
for c in C:
    svc = SVC(C=c, gamma="auto", max_iter=5000)
    kernel_svm = Pipeline([("scaler", StandardScaler()), ("svc", svc)])
    fold_acc = cross_val_score(kernel_svm, train_x, labels[id1], cv=10)
    print("[Kernel] C=%f | acc=%f" % (c, np.mean(fold_acc)))

[Kernel] C=0.700000 | acc=0.770506
[Kernel] C=0.800000 | acc=0.776662
[Kernel] C=0.900000 | acc=0.776662
[Kernel] C=1.000000 | acc=0.773442
[Kernel] C=1.100000 | acc=0.779597
[Kernel] C=2.000000 | acc=0.770122
[Kernel] C=3.000000 | acc=0.769837


In [46]:
# Sweep the inverse regularisation strength C for logistic regression and
# report the mean 10-fold CV accuracy on the train split.
# The pipeline has a single step: features go in unscaled (no scaler / PCA).
C = [0.01, 0.1, 1, 10, 100]
for c in C:
    log_reg = LogisticRegression(solver='lbfgs', C=c, max_iter=1000)
    lr_clf = Pipeline([("lr", log_reg)])
    fold_acc = cross_val_score(lr_clf, train_x, labels[id1], cv=10)
    print("C=%f | acc=%f" % (c, np.mean(fold_acc)))
    

C=0.010000 | acc=0.630425
C=0.100000 | acc=0.693038
C=1.000000 | acc=0.764061
C=10.000000 | acc=0.717464
C=100.000000 | acc=0.680236


### Best cross-validation accuracy: 77.96% with kernel SVM, C=1.1, glove=300

In [51]:
# Refit the best cross-validated configuration (scaled features + SVC, C=1.1)
# on the training split and score it on the held-out test split.
best_svc = SVC(C=1.1, gamma="auto", max_iter=5000)
model = Pipeline([("scaler", StandardScaler()), ("svc", best_svc)])
model.fit(train_x, labels[id1])

tst_pred = model.predict(test_x)
test_acc = accuracy_score(labels[id2], tst_pred)
print('Test accuracy: ', test_acc)
# Last expression: the confusion matrix (true labels first) is the cell output.
confusion_matrix(labels[id2], tst_pred)

Test accuracy:  0.7894736842105263


array([[179,  25],
       [ 43,  76]], dtype=int64)

In [53]:
# Fit the model to all samples (train and test splits together).
model.fit(glove_texts, labels)
# Save the model. The context manager closes the file handle, which the
# previous bare open() call left open.
with open('./svm_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)