### SemEval2019 Hyperpartisan News Detection
#### Using Doc2Vec as document representation

In [1]:
from lxml.etree import iterparse
import xml

import os
import numpy as np
import pickle

from gensim import models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import SaveLoad, simple_preprocess

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

from utils import *



In [8]:
class IterCorpus():
    '''
    Stream (article id, cleaned text) pairs from an articles XML file.

    The file is parsed incrementally with iterparse, and every article is
    released as soon as it has been yielded, so memory use does not grow
    with the size of the corpus.
    '''
    def __init__(self, file):
        # source handed to iterparse each time the corpus is iterated
        self.file = file

    def __iter__(self):
        '''
        Yield one (articleId, text) tuple per <article> element.

        articleId is the element's 'id' attribute; text is the value returned
        by textCleaning(title, body), where body is the concatenated text of
        the element and all of its descendants.
        '''
        for event, elem in iterparse(self.file):
            if elem.tag == "article":
                articleId = elem.attrib['id']
                title = elem.attrib['title']
                text = "".join(elem.itertext())
                text = textCleaning(title, text)
                yield(articleId, text)
                # Free the finished article. Only <article> elements are
                # cleared: clearing their children as they end would remove
                # the text that itertext() collects above.
                elem.clear()
                # Also drop the already-processed siblings still referenced
                # by the parent, otherwise the emptied elements pile up.
                parent = elem.getparent()
                if parent is not None:
                    while elem.getprevious() is not None:
                        del parent[0]
                
                
class TaggedDoc(object):
    '''
    Iterable of TaggedDocument objects for the doc2vec model, built from one
    or several article files.
    '''
    def __init__(self, file):
        # a single path is wrapped in a list; anything else is kept as given
        self.file = [file] if isinstance(file, str) else file

    def __iter__(self):
        for path in self.file:
            # suffix that tells the source files apart in the document tags;
            # 'byarticle' takes precedence over 'validation'
            if 'byarticle' in path:
                suffix = str(2)
            elif 'validation' in path:
                suffix = str(1)
            else:
                suffix = str(0)
            for article_id, body in IterCorpus(path):
                tag = article_id + '_' + suffix
                yield TaggedDocument(simple_preprocess(body), [tag])
            
            
def extract_doc_rep(textFile, model):
    '''
    Infer a doc2vec representation for every document in textFile.

    textFile: path (or list of paths) of the article file(s); it is passed
              unchanged to TaggedDoc.
    model:    trained doc2vec model; its infer_vector method is called once
              per document (epochs=100, alpha=0.025).

    Returns a float64 array of shape (vector size, number of documents) whose
    column i is the inferred vector of the i-th document in corpus order.
    The array is sized from the data instead of a fixed (400, 645) buffer, so
    any corpus length and any vector size are supported.
    '''
    inferred = [
        model.infer_vector(doc.words, epochs=100, alpha=0.025)
        for doc in TaggedDoc(textFile)
    ]
    if not inferred:
        # no documents: keep the (vector size, n_docs) layout with n_docs == 0
        return np.zeros((model.vector_size, 0))
    # rows are documents here; transpose to the one-column-per-document layout
    # callers expect, and keep the float64 dtype of the former zeros buffer
    return np.asarray(inferred, dtype=np.float64).T
                        
def readLabels(labelFile):
    '''
    Read the ground-truth labels stored in the XML file labelFile.

    The file is parsed with xml.sax using a GroundTruthHandler that receives
    the list y; whatever the handler stores in y is returned as a numpy array.
    (Presumably one label per article, in file order - confirm against
    GroundTruthHandler in utils.)
    '''
    # A bare `import xml` does not load the xml.sax submodule, so import it
    # explicitly instead of relying on another module having done so.
    import xml.sax

    y = []
    # Binary mode lets the XML parser choose the encoding from the document
    # itself rather than from the platform's default text encoding.
    with open(labelFile, 'rb') as stream:
        xml.sax.parse(stream, GroundTruthHandler(y))

    return np.asarray(y)

In [4]:
# Locations used throughout the notebook: dataPath holds the article and
# ground-truth XML files, modelPath the trained doc2vec model.
dataPath = 'data/'
modelPath = "tmp/"
# by-article training set: the articles and their ground-truth labels
textFile = dataPath + 'articles-training-byarticle.xml'
labelFile = dataPath + "ground-truth-training-byarticle.xml"
labels = readLabels(labelFile)

# split the samples with the same seed to compare results with other methods
# id1 indexes the training samples and id2 the held-out test samples in the
# cells below
id1, id2 = fixedTestSplit(labels)

In [6]:
# load the trained doc2vec model from modelPath
# NOTE(review): the "400_300" suffix presumably encodes training settings
# (e.g. vector size / epochs) - confirm against the training script
model = Doc2Vec.load(modelPath + "doc2vec_400_300")
# word-vector part of the model, used for the similarity queries below
word_vectors = model.wv

In [8]:
# sanity check of the embedding: print the nearest neighbours of a few words
for word in ("amazon", "trump"):
    print(word_vectors.similar_by_word(word))

[('apple', 0.7739475965499878), ('google', 0.7013559341430664), ('tesla', 0.6806751489639282), ('microsoft', 0.6742485761642456), ('netflix', 0.6667925715446472), ('boeing', 0.652466356754303), ('china', 0.6521828174591064), ('it', 0.6418113112449646), ('intel', 0.6391427516937256), ('he', 0.634005069732666)]
[('obama', 0.8785072565078735), ('bush', 0.8242968320846558), ('clinton', 0.7938897609710693), ('romney', 0.785071849822998), ('he', 0.766777515411377), ('sanders', 0.7613655924797058), ('pence', 0.7346415519714355), ('putin', 0.7339059114456177), ('the', 0.7321258783340454), ('she', 0.7216925024986267)]


  if np.issubdtype(vec.dtype, np.int):


In [11]:
# infer one vector per article, then transpose so that rows are documents
# and columns are embedding dimensions
vectors = extract_doc_rep(textFile, model).T

In [19]:
# rows selected by the fixed split: id1 -> training set, id2 -> test set
trainX, testX = vectors[id1, :], vectors[id2, :]

In [20]:
# 10-fold cross-validated accuracy of a standardised kernel SVM for several
# values of the regularisation parameter C
C = [0.5, 1, 3, 5, 10]
for c in C:
    svm = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter=5000)),
    ])
    acc = cross_val_score(svm, trainX, labels[id1], cv=10).mean()
    print("[KernelSVM] C=%f | acc=%f" % (c, acc))

[KernelSVM] C=0.500000 | acc=0.639706
[KernelSVM] C=1.000000 | acc=0.739168
[KernelSVM] C=3.000000 | acc=0.776378
[KernelSVM] C=5.000000 | acc=0.779503
[KernelSVM] C=10.000000 | acc=0.779503


In [16]:
# 10-fold cross-validated accuracy of logistic regression (no feature
# scaling) for several values of the inverse regularisation strength C
C = [0.001, 0.05, 0.07, 0.1, 0.5, 1]
for c in C:
    lr = LogisticRegression(solver='lbfgs', C=c, max_iter=5000)
    acc = cross_val_score(lr, trainX, labels[id1], cv=10).mean()
    print("[LogisticR] C=%f | acc=%f" % (c, acc))

[LogisticR] C=0.001000 | acc=0.729875
[LogisticR] C=0.050000 | acc=0.723341
[LogisticR] C=0.070000 | acc=0.723247
[LogisticR] C=0.100000 | acc=0.717186
[LogisticR] C=0.500000 | acc=0.707994
[LogisticR] C=1.000000 | acc=0.708095


In [21]:
# use the classifier that has highest cv accuracy as the final model
# NOTE(review): this rebinds `model`, which until now held the loaded Doc2Vec
# model; re-running the vector-extraction cell after this one would pass the
# classifier to extract_doc_rep.
model = Pipeline([
            ("scaler", StandardScaler()),
            ("svc", SVC(C=5, gamma="auto", max_iter = 5000))
        ])
# alternative final model (logistic regression), kept for reference:
#model = LogisticRegression(solver = 'lbfgs', C = 0.05, max_iter=1000)
# fit on the training split only, then predict on both splits
model.fit(trainX, labels[id1])
trn_pred = model.predict(trainX)
tst_pred = model.predict(testX)
print('Train accuracy: ', accuracy_score(labels[id1], trn_pred))
print('Test accuracy: ', accuracy_score(labels[id2], tst_pred))
# precision and recall are reported for the class labelled 'true'
print('Test precision: ', precision_score(labels[id2], tst_pred, pos_label='true'))
print('Test recall: ', recall_score(labels[id2], tst_pred, pos_label='true'))
# last expression of the cell: the confusion matrix on the test split is
# shown as the cell output
confusion_matrix(labels[id2], tst_pred)

Train accuracy:  1.0
Test accuracy:  0.7894736842105263
Test precision:  0.7802197802197802
Test recall:  0.5966386554621849


array([[184,  20],
       [ 48,  71]], dtype=int64)

In [24]:
# fit the model to all samples
model.fit(vectors, labels)
# save the model; the with-block guarantees the file is flushed and closed
# (the handle was previously left open)
with open('trained_clsf/svm_doc2vec.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
# save the predictions: tst_pred holds the test-split predictions computed
# before the refit on all samples above
np.save("predictions/doc2vec_svm_pred", tst_pred)

#### Optional: stacking the extracted features onto the doc2vec vectors

In [9]:
# load the previously extracted features and append them as extra columns to
# the doc2vec vectors
features = np.load("features.npy")
final = np.hstack((vectors, features))
# same fixed split as before: id1 -> training rows, id2 -> test rows
trainX, testX = final[id1], final[id2]

In [23]:
# 10-fold cross-validated accuracy of a standardised kernel SVM on the
# stacked features for several values of C
C = [0.1, 0.5, 1, 3, 5, 10]
for c in C:
    svm = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter=5000)),
    ])
    acc = cross_val_score(svm, trainX, labels[id1], cv=10).mean()
    print("[KernelSVM] C=%f | acc=%f" % (c, acc))

[KernelSVM] C=0.100000 | acc=0.630425
[KernelSVM] C=0.500000 | acc=0.729875
[KernelSVM] C=1.000000 | acc=0.764067
[KernelSVM] C=3.000000 | acc=0.763688
[KernelSVM] C=5.000000 | acc=0.763688
[KernelSVM] C=10.000000 | acc=0.763688


In [25]:
# 10-fold cross-validated accuracy of logistic regression on the stacked
# features for several values of C
C = [0.001, 0.005, 0.05, 0.1]
for c in C:
    lr = LogisticRegression(solver='lbfgs', C=c, max_iter=5000)
    acc = cross_val_score(lr, trainX, labels[id1], cv=10).mean()
    print("[LogisticR] C=%f | acc=%f" % (c, acc))

[LogisticR] C=0.001000 | acc=0.729875
[LogisticR] C=0.005000 | acc=0.748448
[LogisticR] C=0.050000 | acc=0.723341
[LogisticR] C=0.100000 | acc=0.717186


In [26]:
# Final model for the stacked features: logistic regression with C = 0.005,
# the best-scoring logistic-regression setting in the grid above.
# The Pipeline/SVC that was built here was overwritten before being fitted,
# so that dead assignment has been removed; the results are unchanged.
# NOTE(review): the kernel SVM reached a higher cv accuracy on these features
# (0.764 at C=1 vs 0.748), and the grid search used max_iter=5000 rather than
# 1000 - confirm that logistic regression is the intended final model.
model = LogisticRegression(solver = 'lbfgs', C = 0.005, max_iter=1000)
# fit on the training split only, then predict on both splits
model.fit(trainX, labels[id1])
trn_pred = model.predict(trainX)
tst_pred = model.predict(testX)
print('Train accuracy: ', accuracy_score(labels[id1], trn_pred))
print('Test accuracy: ', accuracy_score(labels[id2], tst_pred))
# precision and recall are reported for the class labelled 'true'
print('Test precision: ', precision_score(labels[id2], tst_pred, pos_label='true'))
print('Test recall: ', recall_score(labels[id2], tst_pred, pos_label='true'))
# last expression of the cell: confusion matrix on the test split
confusion_matrix(labels[id2], tst_pred)

Train accuracy:  0.9316770186335404
Test accuracy:  0.7770897832817337
Test precision:  0.7422680412371134
Test recall:  0.6050420168067226


array([[179,  25],
       [ 47,  72]], dtype=int64)