### Semi-Supervised Learning

#### First we test how label propagation works on labeled data, treating half or the labels as unknown

In [174]:
import xml.sax
import numpy as np
import xml.etree.ElementTree as ET
import random
import nltk

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from utils import customTokenize, cleanText, read_glove
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation

In [87]:
def readFiles(textFile, labelFile, num=645, train=False):
    '''
    Read articles and labels. 
    When train is True, randomly sample num articles from the whole training set
    '''
    X, y = [], []
    articleId = []
    
    with open(labelFile, encoding="utf-8") as f:
        tree = ET.parse(f)
        root = tree.getroot()
        for article in root.iter('article'):
            articleId.append(article.attrib['id'])
            if article.attrib['hyperpartisan'] == 'true':
                y.append(1)
            else:
                y.append(0)
    
    if train:
        random.seed(1)
        indices = random.sample(range(600000), num)
        y = np.asarray(y)[indices]
        articleId = np.asarray(articleId)[indices]

    with open (textFile, encoding = 'utf-8') as f:
        for idx, line in enumerate(f):
            if train:
                if idx in indices:
                    tmp = line.split('::')
                    assert(tmp[0] in articleId)
                    text = tmp[1] + ' ' + tmp[2]
                    X.append(cleanText(text))
            else:
                tmp = line.split('::')
                text = tmp[1] + ' ' + tmp[2]
                X.append(cleanText(text))
    
    return np.asarray(X), np.asarray(y)

In [128]:
# process testset
art_texts, art_labels = readFiles('../data/articles-training-byarticle.txt', '../data/ground-truth-training-byarticle.xml')

# split into training and held-out test set with balanced class
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state = 1)
split_idx = list(sss.split(np.zeros(len(art_labels)), art_labels))[0]
art_text = art_texts[split_idx[0]]
art_label = art_labels[split_idx[0]]
held_out_text  = art_texts[split_idx[1]]
held_out_label = art_labels[split_idx[1]]

#### Representation of text
1. BOW / ngram
2. GLOVE

In [166]:
bow = CountVectorizer(ngram_range=(1, 1), stop_words = nltk.corpus.stopwords.words('english'), min_df=2)
bigram = CountVectorizer(ngram_range=(1, 2), stop_words = nltk.corpus.stopwords.words('english'), min_df=3)
glove = read_glove(300)

rep = {'BOW': bow, 'Bi-gram': bigram, 'glove': glove}

In [None]:
def glove_fit(texts, word_num, glove):
    X = np.zeros( (word_num, 300, len(texts)) )
    for text_id, text in enumerate(texts):
        words = customTokenize(text)
        words = [w for w in words if w in glove.keys()]
        for idx, word in enumerate(words):
            if idx < word_num:
                X[idx, :, text_id] = glove[word]
    X = np.mean(X, 0).T
    return X

In [180]:
trn = np.concatenate([art_text, held_out_text])
trn_labels = np.concatenate([art_label, [-1]*len(held_out_text)])

In [181]:
'''
LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_neighbors=7, tol=0.001)

* alpha: clamping
* kernel: 'rbf' (gamma) or 'knn' (n_neighbors)
'''

import warnings
warnings.filterwarnings('ignore')

kernel = {'rbf':[0.1, 0.3, 0.5, 0.7], 'knn':[5,7]}
alpha = [0.2,0.5,0.8]

for r, re in rep.items():
    if r !='glove':
        trn_texts = re.fit_transform(trn).toarray()
    else:
        trn_texts = glove_fit(trn, 300, re)
        
    print(trn_texts.shape)
    
    for a in alpha:
        for k, param in kernel.items():
            for p in param:
                if k == 'rbf':
                    model = LabelSpreading(alpha=a, kernel = k, gamma=p)
                else:
                    trn_texts = StandardScaler().fit_transform(trn_texts)
                    model = LabelSpreading(alpha=a, kernel = k, n_neighbors=p)
                
                model.fit(trn_texts, trn_labels)
                pred = model.transduction_[len(art_label):]
                acc = accuracy_score(held_out_label, pred)
                print("%s: kernel: %s; alpha: %f, p: %f | acc= %f.2" %(r,k,a,p,acc))

(645, 10834)
BOW: kernel: rbf; alpha: 0.200000, p: 0.100000 | acc= 0.715170.2
BOW: kernel: rbf; alpha: 0.200000, p: 0.300000 | acc= 0.640867.2
BOW: kernel: rbf; alpha: 0.200000, p: 0.500000 | acc= 0.606811.2
BOW: kernel: rbf; alpha: 0.200000, p: 0.700000 | acc= 0.572755.2
BOW: kernel: knn; alpha: 0.200000, p: 5.000000 | acc= 0.393189.2
BOW: kernel: knn; alpha: 0.200000, p: 7.000000 | acc= 0.393189.2
BOW: kernel: rbf; alpha: 0.500000, p: 0.100000 | acc= 0.377709.2
BOW: kernel: rbf; alpha: 0.500000, p: 0.300000 | acc= 0.597523.2
BOW: kernel: rbf; alpha: 0.500000, p: 0.500000 | acc= 0.625387.2
BOW: kernel: rbf; alpha: 0.500000, p: 0.700000 | acc= 0.643963.2
BOW: kernel: knn; alpha: 0.500000, p: 5.000000 | acc= 0.393189.2
BOW: kernel: knn; alpha: 0.500000, p: 7.000000 | acc= 0.393189.2
BOW: kernel: rbf; alpha: 0.800000, p: 0.100000 | acc= 0.377709.2
BOW: kernel: rbf; alpha: 0.800000, p: 0.300000 | acc= 0.597523.2
BOW: kernel: rbf; alpha: 0.800000, p: 0.500000 | acc= 0.625387.2
BOW: kernel:

### The highest achieved accuracy of label propagation is 0.715

In [214]:
trn_texts = bow.fit_transform(trn).toarray()
model = LabelSpreading(alpha=0.2, kernel = 'rbf', gamma=0.1)
model.fit(trn_texts, trn_labels)
pred = model.transduction_[len(art_label):]
accuracy_score(held_out_label, pred)

0.7151702786377709

In [215]:
t = model.label_distributions_[len(art_label):]

In [219]:
accuracy_score(held_out_label[np.where(t[:,0]>0.95)], pred[np.where(t[:,0]>0.95)] )

0.7835051546391752

In [220]:
accuracy_score(held_out_label[np.where(t[:,1]>0.95)], pred[np.where(t[:,1]>0.95)] )

0.7

Even if we only consider samples with high confidence (>0.95), the accuracy cannot reach 80%
### Conclusion: The representation is not effective in separating the sample 