In [8]:
import html
import os
import pickle
import re
import xml
import xml.sax

import nltk
import numpy as np
import pandas as pd
from lxml import etree
from lxml.etree import iterparse

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

from utils import fixedTestSplit, cleanQuotations, cleanText, read_glove, customTokenize

In [2]:
def fixup(x):
    '''
    Clean up leftover HTML codes, escape sequences and tokenizer artifacts
    in the string ``x`` and collapse repeated spaces.

    The literal substitutions below are applied one after another (so the
    order is significant), then ``html.unescape`` resolves any remaining
    HTML character references, and finally every run of two or more spaces
    is reduced to a single space.

    Returns the cleaned string.
    '''
    # (old, new) pairs; each one operates on the output of the previous one.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),      # literal backslash + "n" becomes a real newline
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),       # backslash-escaped double quote
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),     # pad any remaining backslash with spaces
    )
    for old, new in substitutions:
        x = x.replace(old, new)

    unescaped = html.unescape(x)
    return re.sub(r'  +', ' ', unescaped)


In [44]:
class GroundTruthHandler(xml.sax.ContentHandler):
    """SAX handler that collects the label and source site of every <article>.

    For each ``article`` start tag the handler appends, in document order:

    * the value of the ``hyperpartisan`` attribute to ``gt``;
    * the first three ``/``-separated pieces of the ``url`` attribute
      re-joined with ``/`` (e.g. ``https://example.com``) to ``source``.

    Parameters
    ----------
    gt : list
        Output list, mutated in place, receiving the label strings.
    source : list
        Output list, mutated in place, receiving the truncated URLs.
    """

    def __init__(self, gt, source):
        super().__init__()
        self.gt = gt
        self.source = source

    def startElement(self, name, attrs):
        """Record label and source for ``article`` elements; ignore all others.

        Raises ``KeyError`` (from ``attrs.getValue``) when ``url`` or
        ``hyperpartisan`` is missing.
        """
        if name != "article":
            return

        # Read both attributes before touching either list so that a missing
        # attribute cannot leave `gt` and `source` with different lengths.
        url = attrs.getValue("url")
        label = attrs.getValue("hyperpartisan")

        self.source.append('/'.join(url.split('/')[:3]))
        self.gt.append(label)
            
def readFiles(textFile, labelFile):
    """Load article texts, labels and source sites from two XML files.

    Parameters
    ----------
    textFile : str
        Path to the XML file holding the ``<article>`` elements with their
        ``title`` attribute and text content.
    labelFile : str
        Path to the ground-truth XML file, parsed with ``GroundTruthHandler``.

    Returns
    -------
    (X, y, sources) : tuple of numpy arrays
        ``X`` holds one string per article (``title + ". " + text``, with the
        text limited to its first 1000 whitespace-separated tokens), ``y`` the
        label strings and ``sources`` the truncated URLs collected by the
        handler. Articles and labels are paired purely by document order;
        this function does not check that the two files line up.
    """
    X, y = [], []
    sources = []

    # Open in binary mode so the XML parser decodes using the document's own
    # declared encoding instead of the platform's locale default, and use a
    # distinct name so the `labelFile` path is not shadowed by the handle.
    with open(labelFile, "rb") as labelStream:
        xml.sax.parse(labelStream, GroundTruthHandler(y, sources))

    for _event, elem in iterparse(textFile):
        if elem.tag == "article":
            title = cleanQuotations(elem.attrib['title'])
            text = cleanQuotations("".join(elem.itertext()))
            text = cleanText(fixup(text))
            # Keep only the first 1000 whitespace-separated tokens.
            text = ' '.join(text.split()[:1000])
            X.append(title + ". " + text)
            # Release the element's children once the article is processed.
            elem.clear()

    return np.asarray(X), np.asarray(y), np.asarray(sources)

In [45]:
# read in labels and texts
# The data directory can be overridden with the SEMEVAL_DATA_DIR environment
# variable; the default is the original location, so existing runs behave
# the same.
dataPath = os.environ.get('SEMEVAL_DATA_DIR', 'C:/Users/sharo/Documents/SemEval2019/data/')
trainFiles = [os.path.join(dataPath, 'articles-training-bypublisher.xml'),
              os.path.join(dataPath, 'articles-validation-bypublisher.xml')]
textFile = os.path.join(dataPath, 'articles-training-byarticle.xml')
labelFile = os.path.join(dataPath, "ground-truth-training-byarticle.xml")
texts, labels, sources = readFiles(textFile, labelFile)
# id1 / id2 index the train and test portions used by the cells below.
# NOTE(review): the split logic lives in utils.fixedTestSplit — confirm there.
id1, id2 = fixedTestSplit(labels)


In [23]:
def gloveVectorize(glove, text):
    """Represent each text as the mean of the GloVe vectors of its known words.

    Parameters
    ----------
    glove : mapping of str -> vector
        Word-embedding lookup; must contain the key ``"the"``, which is used
        to determine the embedding dimension.
    text : sequence of str
        Documents to vectorize.

    Returns
    -------
    numpy.ndarray of shape ``(len(text), dim)``
        Row ``i`` is the average embedding of the tokens of ``text[i]`` that
        are present in ``glove``. A document with no known token yields an
        all-zero row (instead of NaNs from a division by zero).
    """
    dim = len(glove["the"])
    X = np.zeros((len(text), dim))
    for text_id, t in enumerate(text):
        # NOTE(review): meaning of the second customTokenize argument is
        # defined in utils — confirm there.
        words = [w for w in customTokenize(t, True) if w in glove]
        if not words:
            # Nothing to average; keep the zero row rather than dividing by 0.
            continue
        # Accumulate in a float64 buffer sized from the actual embedding
        # dimension rather than a hard-coded 300.
        tmp = np.zeros(dim)
        for word in words:
            tmp += glove[word]
        X[text_id, :] = tmp / len(words)
    return X

In [6]:
# Load the 300-dimensional GloVe vectors. The directory can be overridden with
# the SEMEVAL_WV_DIR environment variable; os.path.join(..., "") guarantees a
# trailing separator and leaves the original default path unchanged.
glove = read_glove(
    os.path.join(os.environ.get("SEMEVAL_WV_DIR", "C:/Users/sharo/Documents/SemEval2019/pretrained_wv/"), ""),
    300,
)

In [24]:
# Embed every article with averaged GloVe vectors, then carve out the
# train (id1) and test (id2) rows.
glove_texts = gloveVectorize(glove, texts)
train_x, test_x = glove_texts[id1], glove_texts[id2]

In [25]:
# 10-fold cross-validation on the training split for a range of SVC
# regularisation strengths (features are standardised inside the pipeline).
C = [0.5, 0.6, 0.7, 0.9, 1, 1.1, 1.2, 5, 10]
for c in C:
    kernel_svm = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter=1000)),
    ])
    fold_scores = cross_val_score(kernel_svm, train_x, labels[id1], cv=10)
    print("[KernelSVM] C=%f | acc=%f" % (c, np.mean(fold_scores)))

[KernelSVM] C=0.500000 | acc=0.764067
[KernelSVM] C=0.600000 | acc=0.776472
[KernelSVM] C=0.700000 | acc=0.776567
[KernelSVM] C=0.900000 | acc=0.779793
[KernelSVM] C=1.000000 | acc=0.773442
[KernelSVM] C=1.100000 | acc=0.776472
[KernelSVM] C=1.200000 | acc=0.776472
[KernelSVM] C=5.000000 | acc=0.751176
[KernelSVM] C=10.000000 | acc=0.751075


In [26]:
# 10-fold cross-validation on the training split for a range of logistic
# regression regularisation strengths.
C = [0.05, 0.1, 0.5, 0.8, 0.9, 1, 2, 3, 5]
for c in C:
    lr = LogisticRegression(solver='lbfgs', C=c, max_iter=1000)
    fold_scores = cross_val_score(lr, train_x, labels[id1], cv=10)
    print("[LogisticR] C=%f | acc=%f" % (c, np.mean(fold_scores)))

[LogisticR] C=0.050000 | acc=0.645760
[LogisticR] C=0.100000 | acc=0.693038
[LogisticR] C=0.500000 | acc=0.748714
[LogisticR] C=0.800000 | acc=0.757805
[LogisticR] C=0.900000 | acc=0.764061
[LogisticR] C=1.000000 | acc=0.764061
[LogisticR] C=2.000000 | acc=0.751750
[LogisticR] C=3.000000 | acc=0.748525
[LogisticR] C=5.000000 | acc=0.745400


In [152]:
# Fit the standardise + SVC pipeline with C=0.9 on the training split and
# report accuracy on both splits plus precision/recall for the 'true' class.
y_train, y_test = labels[id1], labels[id2]

model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svc", SVC(C=0.9, gamma="auto", max_iter=5000)),
])
model.fit(train_x, y_train)

trn_pred, tst_pred = model.predict(train_x), model.predict(test_x)

print('Train accuracy: ', accuracy_score(y_train, trn_pred))
print('Test accuracy: ', accuracy_score(y_test, tst_pred))
print('Test precision: ', precision_score(y_test, tst_pred, pos_label='true'))
print('Test recall: ', recall_score(y_test, tst_pred, pos_label='true'))
# Last expression of the cell: displayed as the cell output.
confusion_matrix(y_test, tst_pred)


Train accuracy:  0.9130434782608695
Test accuracy:  0.7956656346749226
Test precision:  0.7676767676767676
Test recall:  0.6386554621848739


array([[181,  23],
       [ 43,  76]], dtype=int64)

In [154]:
# Persist the current test-split predictions (`tst_pred`) to disk; np.save
# appends the ".npy" extension to the given file name.
np.save("glove_pred_svm", tst_pred)

In [103]:
# Fit logistic regression with C=1 on the training split and report accuracy
# on both splits plus precision/recall for the 'true' class.
y_train, y_test = labels[id1], labels[id2]

model = LogisticRegression(solver='lbfgs', C=1, max_iter=1000)
model.fit(train_x, y_train)

trn_pred, tst_pred = model.predict(train_x), model.predict(test_x)

print('Train accuracy: ', accuracy_score(y_train, trn_pred))
print('Test accuracy: ', accuracy_score(y_test, tst_pred))
print('Test precision: ', precision_score(y_test, tst_pred, pos_label='true'))
print('Test recall: ', recall_score(y_test, tst_pred, pos_label='true'))
# Last expression of the cell: displayed as the cell output.
confusion_matrix(y_test, tst_pred)


Train accuracy:  0.8043478260869565
Test accuracy:  0.7244582043343654
Test precision:  0.6442307692307693
Test recall:  0.5630252100840336


array([[167,  37],
       [ 52,  67]], dtype=int64)

In [15]:
# fit the model to all samples (train and test rows of the GloVe features)
# NOTE(review): `model` is whichever estimator the most recently executed cell
# assigned. The output file name says "svm" — confirm the SVM pipeline cell
# was the last one run before this cell. TODO verify.
model.fit(glove_texts, labels)
# save the model; the context manager guarantees the file is flushed and closed
with open('./svm_glove_original.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [29]:
from gensim.models.keyedvectors import KeyedVectors

# Load the pretrained GoogleNews word2vec vectors (binary format). The
# directory can be overridden with the SEMEVAL_WV_DIR environment variable;
# the default is the original location.
word_vectors = KeyedVectors.load_word2vec_format(
    os.path.join(
        os.environ.get("SEMEVAL_WV_DIR", "C:/Users/sharo/Documents/SemEval2019/pretrained_wv/"),
        "GoogleNews-vectors-negative300.bin",
    ),
    binary=True,
)



In [60]:
def wvVectorize(wv, text):
    """Represent each text as the mean word2vec vector of its known words.

    Parameters
    ----------
    wv : gensim KeyedVectors-like object
        Must provide ``word_vec(word)`` and a ``vocab`` container, and must
        know the word ``'the'`` (used to determine the embedding dimension).
    text : sequence of str
        Documents to vectorize.

    Returns
    -------
    numpy.ndarray of shape ``(len(text), dim)``
        Row ``i`` is the average embedding of the tokens of ``text[i]`` found
        in ``wv.vocab``. A document with no known token yields an all-zero
        row (instead of NaNs from a division by zero).
    """
    # Use the `wv` argument throughout rather than the notebook-level
    # `word_vectors` global, so the function honours what the caller passes.
    dim = len(wv.word_vec('the'))
    X = np.zeros((len(text), dim))
    for text_id, t in enumerate(text):
        words = [w for w in customTokenize(t) if w in wv.vocab]
        if not words:
            # Nothing to average; keep the zero row rather than dividing by 0.
            continue
        # Accumulate in a float64 buffer sized from the actual embedding
        # dimension rather than a hard-coded 300.
        tmp = np.zeros(dim)
        for word in words:
            tmp += wv.word_vec(word)
        X[text_id, :] = tmp / len(words)
    return X

In [61]:
# Embed every article with averaged word2vec vectors, then carve out the
# train (id1) and test (id2) rows. This rebinds train_x / test_x, so the
# cells below operate on word2vec features.
wv_texts = wvVectorize(word_vectors, texts)
train_x, test_x = wv_texts[id1], wv_texts[id2]

In [64]:
# 10-fold cross-validation on the training split for a range of SVC
# regularisation strengths (features are standardised inside the pipeline).
C = [0.1, 0.3, 0.5, 0.7, 1, 5]
for c in C:
    kernel_svm = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter=1000)),
    ])
    fold_scores = cross_val_score(kernel_svm, train_x, labels[id1], cv=10)
    print("[KernelSVM] C=%f | acc=%f" % (c, np.mean(fold_scores)))

[KernelSVM] C=0.100000 | acc=0.652212
[KernelSVM] C=0.300000 | acc=0.773353
[KernelSVM] C=0.500000 | acc=0.779698
[KernelSVM] C=0.700000 | acc=0.776283
[KernelSVM] C=1.000000 | acc=0.776182
[KernelSVM] C=5.000000 | acc=0.766990


In [66]:
# 10-fold cross-validation on the training split for a range of logistic
# regression regularisation strengths.
C = [0.05, 0.1, 0.5, 1, 5, 10]
for c in C:
    lr = LogisticRegression(solver='lbfgs', C=c, max_iter=1000)
    fold_scores = cross_val_score(lr, train_x, labels[id1], cv=10)
    print("[LogisticR] C=%f | acc=%f" % (c, np.mean(fold_scores)))

[LogisticR] C=0.050000 | acc=0.630425
[LogisticR] C=0.100000 | acc=0.630425
[LogisticR] C=0.500000 | acc=0.689913
[LogisticR] C=1.000000 | acc=0.723909
[LogisticR] C=5.000000 | acc=0.764156
[LogisticR] C=10.000000 | acc=0.760930


In [67]:
# Fit the standardise + SVC pipeline with C=0.5 on the training split and
# report accuracy on both splits plus precision/recall for the 'true' class.
y_train, y_test = labels[id1], labels[id2]

model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svc", SVC(C=0.5, gamma="auto", max_iter=5000)),
])
model.fit(train_x, y_train)

trn_pred, tst_pred = model.predict(train_x), model.predict(test_x)

print('Train accuracy: ', accuracy_score(y_train, trn_pred))
print('Test accuracy: ', accuracy_score(y_test, tst_pred))
print('Test precision: ', precision_score(y_test, tst_pred, pos_label='true'))
print('Test recall: ', recall_score(y_test, tst_pred, pos_label='true'))
# Last expression of the cell: displayed as the cell output.
confusion_matrix(y_test, tst_pred)

Train accuracy:  0.8633540372670807
Test accuracy:  0.7585139318885449
Test precision:  0.7204301075268817
Test recall:  0.5630252100840336


array([[178,  26],
       [ 52,  67]], dtype=int64)

In [148]:
# Error analysis on the test split: collect the texts the current model got
# wrong (false negatives followed by false positives) and the ones it got right.
# All positions below are relative to the test split and are mapped back to
# rows of `texts` through `id2`.
gt = labels[id2]
pred = tst_pred

# False negatives: labelled 'true' but predicted 'false'; false positives: the reverse.
fnIdx = np.flatnonzero((gt == 'true') & (pred == 'false'))
fpIdx = np.flatnonzero((gt == 'false') & (pred == 'true'))

# Positions where prediction and label agree (kept as a plain list of ints).
corr = np.flatnonzero(gt == pred).tolist()
wrong = texts[id2[np.concatenate((fnIdx, fpIdx))]]
correct = texts[id2[corr]]