### Build a classifier that identifies which articles in the training set are similar to the test set

In [232]:
import xml.sax
import numpy as np
import xml.etree.ElementTree as ET
import random
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from utils import customTokenize, cleanText, read_glove

In [251]:
def readFiles(textFile, labelFile, num=645, train=False):
    """Read article texts and hyperpartisan labels from a text/label file pair.

    The label file is an XML document whose ``article`` elements carry ``id``
    and ``hyperpartisan`` attributes.  The text file holds one article per
    line with ``::``-separated fields: field 0 is compared against the
    article id, and fields 1 and 2 are joined with a space and passed through
    ``cleanText``.

    The n-th line of the text file is assumed to describe the n-th ``article``
    element of the label file; in sampling mode this is verified for every
    sampled line.

    Args:
        textFile: Path to the ``::``-separated article text file.
        labelFile: Path to the ground-truth XML file.
        num: Number of articles to sample when ``train`` is true.
        train: When true, draw a reproducible random sample (seed 1) of
            ``num`` articles instead of reading every line.

    Returns:
        A ``(X, y)`` pair of NumPy arrays: the cleaned texts and the matching
        ``hyperpartisan`` attribute strings.  In sampling mode both arrays are
        ordered by position in the file, so ``X[i]`` belongs to ``y[i]``.

    Raises:
        ValueError: If ``num`` exceeds the number of labelled articles, if a
            sampled line's id differs from the label entry at that position,
            or if the text file ends before all sampled lines were read.
    """
    articleId, y = [], []
    with open(labelFile, encoding="utf-8") as f:
        root = ET.parse(f).getroot()
        for article in root.iter('article'):
            articleId.append(article.attrib['id'])
            y.append(article.attrib['hyperpartisan'])

    def to_text(fields):
        # Fields 1 and 2 form the article text; field 0 is the id.
        return cleanText(fields[1] + ' ' + fields[2])

    if not train:
        with open(textFile, encoding='utf-8') as f:
            X = [to_text(line.split('::')) for line in f]
        return np.asarray(X), np.asarray(y)

    # Seed the module-level generator (as before) so the sample is repeatable.
    random.seed(1)
    # Sample from however many labels exist rather than a fixed 600000, and
    # sort so the label order matches the order lines are met in the file.
    indices = sorted(random.sample(range(len(y)), num))
    y = np.asarray(y)[indices]
    articleId = np.asarray(articleId)[indices]

    # Line number -> expected id: O(1) membership test and per-line check.
    expected_id = dict(zip(indices, articleId))
    last_wanted = indices[-1] if indices else -1

    X = []
    with open(textFile, encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if idx > last_wanted:
                # Every sampled line has been read; skip the rest of the file.
                break
            if idx not in expected_id:
                continue
            fields = line.split('::')
            if fields[0] != expected_id[idx]:
                raise ValueError(
                    'line %d of %s has id %r but the label file has %r at '
                    'that position' % (idx, textFile, fields[0], expected_id[idx])
                )
            X.append(to_text(fields))

    if len(X) != len(indices):
        raise ValueError(
            '%s ended before all %d sampled lines were read'
            % (textFile, len(indices))
        )

    return np.asarray(X), np.asarray(y)

In [253]:
# Load the by-article corpus: cleaned texts plus their hyperpartisan labels.
art_texts, art_labels = readFiles(
    '../data/articles-training-byarticle.txt',
    '../data/ground-truth-training-byarticle.xml',
)

# One stratified 50/50 shuffle split (fixed seed): the first half supplies the
# texts used below, the indices of the second half are kept as a held-out set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1)
split_idx = next(sss.split(np.zeros(len(art_labels)), art_labels))
art_text = art_texts[split_idx[0]]
held_out_idx = split_idx[1]

In [257]:
# Draw a random sample of by-publisher articles, as many as there are
# by-article texts, so the two groups end up the same size.
pub_label_file = '../data/ground-truth-training-bypublisher.xml'
pub_text, pub_labels = readFiles(
    '../data/articles-training-bypublisher.txt',
    pub_label_file,
    num=len(art_text),
    train=True,
)

In [262]:
# Build the data set for the similarity ("quality") classifier:
# by-article texts are the positive class (1), sampled by-publisher texts the
# negative class (0).
trn_texts = np.concatenate((art_text, pub_text))
positive_labels = [1] * len(art_text)
negative_labels = [0] * len(pub_text)
trn_labels = np.concatenate((positive_labels, negative_labels))

In [263]:
# Stratified 80/20 shuffle split (fixed seed) into training and development
# sets, keeping the two classes balanced in both parts.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
split_idx = next(sss.split(np.zeros(len(trn_labels)), trn_labels))
trn_rows, dev_rows = split_idx
trn_text, trn_label = trn_texts[trn_rows], trn_labels[trn_rows]
dev_text, dev_label = trn_texts[dev_rows], trn_labels[dev_rows]

In [275]:
# Unigram count features: NLTK's English stop words are removed and terms
# occurring in fewer than two training documents are dropped (min_df=2).
english_stop_words = nltk.corpus.stopwords.words('english')
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=english_stop_words,
    min_df=2,
)
# The vocabulary is learned from the training texts only and reused for dev.
trn = vectorizer.fit_transform(trn_text)
dev = vectorizer.transform(dev_text)

In [276]:
# Sweep the inverse regularisation strength C of a logistic regression and
# print train / dev accuracy for every value.
cc = [0.001, 0.01, 0.1, 1, 10]
for c in cc:
    model = LogisticRegression(solver='lbfgs', C=c)
    model.fit(trn, trn_label)

    trn_pred = model.predict(trn)
    dev_pred = model.predict(dev)
    # Alternative dev prediction: positive whenever the decision value is
    # above -2.  It is not used for the accuracy printed below.
    dev_pred2 = model.decision_function(dev) > -2

    result_trn = accuracy_score(trn_label, trn_pred)
    result_dev = accuracy_score(dev_label, dev_pred)
    print(c, 'Accuracy: ', result_trn, result_dev)

0.001 Accuracy:  0.9281553398058252 0.8294573643410853
0.01 Accuracy:  0.9844660194174757 0.8449612403100775
0.1 Accuracy:  1.0 0.8217054263565892
1 Accuracy:  1.0 0.8372093023255814
10 Accuracy:  1.0 0.8372093023255814


In [296]:
from sklearn import svm
from sklearn import preprocessing

# Scale every feature by its maximum absolute value; the scaler is fitted on
# the training counts only and then applied to both splits.
scaler = preprocessing.MaxAbsScaler().fit(trn)
trn_trans = scaler.transform(trn)
dev_trans = scaler.transform(dev)

# Same kind of sweep as for logistic regression, this time over the C of an
# SVC.  Note that `model` and `dev_pred` are rebound by this loop.
cc = [0.01, 0.1, 1, 10, 100]
for c in cc:
    model = svm.SVC(C=c, gamma='scale')
    model.fit(trn_trans, trn_label)

    trn_pred = model.predict(trn_trans)
    dev_pred = model.predict(dev_trans)

    result_trn = accuracy_score(trn_label, trn_pred)
    result_dev = accuracy_score(dev_label, dev_pred)
    print(c, 'Accuracy: ', result_trn, result_dev)

0.01 Accuracy:  0.5009708737864078 0.49612403100775193
0.1 Accuracy:  0.5009708737864078 0.49612403100775193
1 Accuracy:  0.5572815533980583 0.5581395348837209
10 Accuracy:  0.9689320388349515 0.6821705426356589
100 Accuracy:  1.0 0.7131782945736435


In [266]:
# Confusion matrix on the dev set for the thresholded predictions
# (decision value > -2) produced in the logistic-regression sweep.
# NOTE(review): dev_pred2 is whatever the last iteration of that sweep left
# behind - confirm which C value it corresponds to before relying on it.
confusion_matrix(dev_label, dev_pred2)

array([[50, 14],
       [ 8, 57]], dtype=int64)

In [267]:
# Refit on the training and development data combined, using C=0.01 (the
# value with the best dev accuracy in the logistic-regression sweep).
final_trn = np.concatenate((trn_text, dev_text))
final_lab = np.concatenate((trn_label, dev_label))
# The vectorizer is refitted too, so its vocabulary now also covers dev texts.
trn = vectorizer.fit_transform(final_trn)
model = LogisticRegression(C=0.01, solver='lbfgs')
model.fit(trn, final_lab)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [271]:
# Re-vectorize the dev texts with the refitted vocabulary and score them.
# NOTE(review): dev_text is part of final_trn, the data the model was just
# fitted on, so these figures show fit to seen data, not generalisation.
dev = vectorizer.transform(dev_text)
pred = model.predict(dev)
# Looser decision threshold (-2); computed but not reported below.
pred2 = model.decision_function(dev) > -2
print(accuracy_score(dev_label, pred))
print(confusion_matrix(dev_label, pred))

0.9922480620155039
[[64  0]
 [ 1 64]]


In [219]:
def predict(textFile):
    """Score every line of ``textFile`` with the fitted ``model``.

    Each line is split on ``::``; fields 1 and 2 are joined with a space,
    cleaned with ``cleanText``, vectorized with the module-level
    ``vectorizer`` and passed to ``model.decision_function``.

    Args:
        textFile: Path to a ``::``-separated article text file.

    Returns:
        A NumPy array built from the per-line ``decision_function`` outputs,
        in file order (one entry per line).
    """
    scores = []
    with open(textFile, encoding='utf-8') as f:
        for line in f:
            fields = line.split('::')
            cleaned = cleanText(fields[1] + ' ' + fields[2])
            # transform() expects an iterable of documents, hence the
            # one-element list.
            features = vectorizer.transform([cleaned])
            scores.append(model.decision_function(features))
    return np.asarray(scores)

In [220]:
# Score every article of the by-publisher training file with the final model.
preds = predict('../data/articles-training-bypublisher.txt')

In [247]:
# Keep the row indices of all articles whose decision score is positive.
# No sorting or fixed percentage is applied; the cut-off is the score 0.
ids = np.where(preds > 0)[0]

In [248]:
# Report how many articles were kept, then persist their row indices.
print(len(ids))
# `ids` is the index array selected from the prediction scores; the earlier
# name `remained_id` is not defined anywhere in this file and raised a
# NameError on a clean run.  np.save appends '.npy', so this writes
# './remain.npy'.
np.save('./remain', ids)

82480
