### SemEval2019 Hyperpartisan News Detection
#### Using TF-IDF as the document representation

There are 645 labeled samples; half are used for training and the other half for testing.

In [1]:
import os
import pickle
import xml
import xml.sax

import nltk
import numpy as np
from lxml.etree import iterparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

from utils import *

In [2]:
class IterCorpus():
    """Re-iterable stream of cleaned article texts parsed from XML files.

    Every pass over the object re-parses each entry of ``files`` with
    ``iterparse`` and yields one cleaned text per ``<article>`` element, so the
    corpus is never materialised as a list.
    """

    def __init__(self, files):
        """Remember the XML sources to stream from.

        Args:
            files: iterable of sources accepted by ``iterparse`` (this notebook
                passes file paths).
        """
        self.files = files

    def __iter__(self):
        """Yield ``textCleaning(title, text)`` for each article, file by file.

        Raises:
            KeyError: if an ``<article>`` element has no ``title`` attribute.
        """
        for file in self.files:
            for event, elem in iterparse(file):
                if elem.tag != "article":
                    continue

                title = elem.attrib['title']
                text = "".join(elem.itertext())
                cleaned = textCleaning(title, text)

                # Drop the article's own content now that it has been consumed.
                elem.clear()
                # clear() leaves the emptied element attached to its parent, so
                # without this the tree keeps one node per article already
                # seen. Detach everything that precedes the current article.
                parent = elem.getparent()
                if parent is not None:
                    while elem.getprevious() is not None:
                        del parent[0]

                yield cleaned
                
def readFiles(textFile, labelFile):
    """Load cleaned article texts and their labels into numpy arrays.

    Args:
        textFile: XML source with ``<article>`` elements, as accepted by
            ``iterparse``.
        labelFile: path of the ground-truth XML file; it is parsed with
            ``xml.sax`` using ``GroundTruthHandler``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: ``X`` holds one
        ``textCleaning(title, text)`` result per article in document order,
        ``y`` holds whatever ``GroundTruthHandler`` collected.

    Raises:
        KeyError: if an ``<article>`` element has no ``title`` attribute.
    """
    X, y = [], []

    # GroundTruthHandler comes from utils; it is handed `y` and presumably
    # appends one label per article -- TODO confirm against utils.
    # A separate name is used for the handle so the `labelFile` parameter is
    # not rebound to the open file.
    with open(labelFile) as labelHandle:
        xml.sax.parse(labelHandle, GroundTruthHandler(y))

    for event, elem in iterparse(textFile):
        if elem.tag != "article":
            continue

        title = elem.attrib['title']
        text = "".join(elem.itertext())
        X.append(textCleaning(title, text))

        # Free the parsed article. clear() alone leaves the emptied element
        # attached to its parent, so also detach the preceding siblings to keep
        # the in-memory tree from growing with the file.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]

    return np.asarray(X), np.asarray(y)

In [3]:
# All corpus files live under this directory (trailing slash is relied upon
# by the string concatenations below).
dataPath = 'data/'

# By-publisher corpora (training + validation splits).
trainFiles = [
    dataPath + fileName
    for fileName in ('articles-training-bypublisher.xml',
                     'articles-validation-bypublisher.xml')
]

# By-article corpus: article texts and their ground-truth labels.
textFile = dataPath + 'articles-training-byarticle.xml'
labelFile = dataPath + "ground-truth-training-byarticle.xml"
texts, labels = readFiles(textFile, labelFile)

# fixedTestSplit comes from utils; it is meant to use the same seed on every
# run so results stay comparable with the other methods.
id1, id2 = fixedTestSplit(labels)

In [4]:
def normalizer(tokens):
    """Collapse non-alphabetic tokens into placeholder tokens.

    * Purely alphabetic tokens are passed through unchanged.
    * Tokens whose first character is a digit become ``"#NUMBER"``.
    * Everything else (including an empty token) becomes ``"#OTHERS"``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A lazy generator yielding one (possibly replaced) token per input
        token, in the same order.
    """
    def switcher(token):
        if token.isalpha():
            return token
        # Slice instead of indexing: token[0] would raise IndexError on an
        # empty token, whereas ""[:1].isdigit() is simply False.
        if token[:1].isdigit():
            return "#NUMBER"
        return "#OTHERS"

    return (switcher(token) for token in tokens)

class NumberNormalizingVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokenizer output is passed through ``normalizer``."""

    def build_tokenizer(self):
        """Return a callable that tokenizes a document and normalizes the tokens.

        The parent class's tokenizer does the splitting; its tokens are then
        run through ``normalizer`` and collected into a list.
        """
        baseTokenizer = super().build_tokenizer()

        def tokenizeAndNormalize(doc):
            rawTokens = baseTokenizer(doc)
            return list(normalizer(rawTokens))

        return tokenizeAndNormalize

# Stream the by-publisher articles instead of loading them all into memory.
corpus = IterCorpus(trainFiles)
vectorizer = NumberNormalizingVectorizer(min_df=5, max_features=50000, max_df=0.7)

# fit to the training corpus
vectorizer.fit(corpus)

# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("unigram.pickle", "wb") as vectorizerFile:
    pickle.dump(vectorizer, vectorizerFile)

In [16]:
# Reload the fitted vectorizer written by the fitting cell.
# NOTE: pickle.load executes code from the file -- only load pickles that this
# notebook (or another trusted source) produced.
with open("unigram.pickle", "rb") as vectorizerFile:
    vectorizer = pickle.load(vectorizerFile)

# check features
# Newer scikit-learn only provides get_feature_names_out(); older releases only
# provide get_feature_names(). Use whichever the installed version has.
featureNamesFn = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Preview the first entries only: the vocabulary holds up to 50000 features
# and dumping all of them floods the notebook output.
list(featureNamesFn()[:100])

['#OTHERS',
 'aa',
 'aaa',
 'aap',
 'aapl',
 'aaron',
 'aaronson',
 'aarp',
 'ab',
 'aba',
 'ababa',
 'aback',
 'abad',
 'abadi',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abandons',
 'abate',
 'abated',
 'abatement',
 'abating',
 'abb',
 'abba',
 'abbas',
 'abbasi',
 'abbe',
 'abbey',
 'abbie',
 'abbot',
 'abbott',
 'abbottabad',
 'abbreviated',
 'abbreviation',
 'abbvie',
 'abby',
 'abc',
 'abcnews',
 'abd',
 'abdallah',
 'abdel',
 'abdelaziz',
 'abdeslam',
 'abdi',
 'abdicate',
 'abdicated',
 'abdicating',
 'abdication',
 'abdomen',
 'abdominal',
 'abduct',
 'abducted',
 'abducting',
 'abduction',
 'abductions',
 'abdul',
 'abdulaziz',
 'abdullah',
 'abdulmutallab',
 'abdulrahman',
 'abdurrahman',
 'abe',
 'abed',
 'abedin',
 'abel',
 'abenomics',
 'abercrombie',
 'aberdeen',
 'abernathy',
 'aberrant',
 'aberration',
 'aberrations',
 'abet',
 'abets',
 'abetted',
 'abetting',
 'abeyance',
 'abeyta',
 'abhisit',
 'abhor',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'a

In [17]:
# TF-IDF representation of every by-article text, using the fitted vectorizer.
vectors = vectorizer.transform(texts)

# Row subsets for the two halves of the fixed split (id1 -> train, id2 -> test).
trainX, testX = vectors[id1], vectors[id2]

In [18]:
# Sweep the LinearSVC regularisation strength and report the mean accuracy of
# 10-fold cross-validation on the training half.
C = [0.1, 0.5, 1, 2, 3, 5, 6, 8, 10]
for c in C:
    svm = LinearSVC(C=c, max_iter=1000)
    foldAccuracies = cross_val_score(svm, trainX, labels[id1], cv=10)
    print("[LinearSVM] C=%f | acc=%f" % (c, np.mean(foldAccuracies)))

[LinearSVM] C=0.100000 | acc=0.705260
[LinearSVM] C=0.500000 | acc=0.770128
[LinearSVM] C=1.000000 | acc=0.773158
[LinearSVM] C=2.000000 | acc=0.772962
[LinearSVM] C=3.000000 | acc=0.775987
[LinearSVM] C=5.000000 | acc=0.782237
[LinearSVM] C=6.000000 | acc=0.782237
[LinearSVM] C=8.000000 | acc=0.785362
[LinearSVM] C=10.000000 | acc=0.785362


In [8]:
# Same sweep for logistic regression: mean accuracy of 10-fold
# cross-validation on the training half for each value of C.
C = [0.5, 1, 5, 7, 10, 50, 60]
for c in C:
    lr = LogisticRegression(solver='lbfgs', C=c, max_iter=1000)
    foldAccuracies = cross_val_score(lr, trainX, labels[id1], cv=10)
    print("[LogisticR] C=%f | acc=%f" % (c, np.mean(foldAccuracies)))

[LogisticR] C=0.500000 | acc=0.658651
[LogisticR] C=1.000000 | acc=0.705260
[LogisticR] C=5.000000 | acc=0.773158
[LogisticR] C=7.000000 | acc=0.770128
[LogisticR] C=10.000000 | acc=0.776188
[LogisticR] C=50.000000 | acc=0.776384
[LogisticR] C=60.000000 | acc=0.776384


In [20]:
# Final model: intended to be the classifier with the highest CV accuracy.
# NOTE(review): the LinearSVC sweep output recorded in this notebook peaks at
# C=8 and C=10 (acc=0.785362), while C=2 scored 0.772962 -- confirm that C=2 is
# really the intended setting.
# Alternative that was tried: LogisticRegression(solver='lbfgs', C=50, max_iter=1000)
model = LinearSVC(C=2, max_iter=1000)

trainY = labels[id1]
testY = labels[id2]

model.fit(trainX, trainY)
trn_pred = model.predict(trainX)
tst_pred = model.predict(testX)

print('Train accuracy: ', accuracy_score(trainY, trn_pred))
print('Test accuracy: ', accuracy_score(testY, tst_pred))
# Precision and recall are reported for the class labelled 'true'.
print('Test precision: ', precision_score(testY, tst_pred, pos_label='true'))
print('Test recall: ', recall_score(testY, tst_pred, pos_label='true'))

# Last expression of the cell, so the matrix is rendered as the cell output.
confusion_matrix(testY, tst_pred)

Train accuracy:  1.0
Test accuracy:  0.7801857585139319
Test precision:  0.7727272727272727
Test recall:  0.5714285714285714


array([[184,  20],
       [ 51,  68]], dtype=int64)

In [18]:
# fit the model to all samples
model.fit(vectorizer.transform(texts), labels)

# save the model
# Create the output directory if needed and close the file deterministically.
os.makedirs('trained_clsf', exist_ok=True)
with open('trained_clsf/linearSVM_tfidf.sav', 'wb') as modelFile:
    pickle.dump(model, modelFile)

# save the predictions
# tst_pred is not recomputed here: it holds the test-half predictions made
# by the evaluation cell, i.e. before the refit on all samples above.
os.makedirs('predictions', exist_ok=True)
np.save('predictions/tfidf_svm_pred', tst_pred)