# DataDataData

Here's a sample class wrapping the main abstraction, the `Dataset`.

In [65]:
import csv
import random


class Dataset(object):
    def __init__(self, text_to_label=None):
        self.text_to_label = text_to_label or {}
        
    def add_label(self, text, label):
        self.text_to_label[text] = label
        
    def iter_texts(self, accept=lambda label: True):
        return ((text, label) for text, label in 
                 self.text_to_label.items() if accept(label))
    
    def to_csv(self, fname):
        with open(fname, 'w') as f:
            w = csv.writer(f, delimiter=',')
            for text, label in self.text_to_label.items():
                w.writerow((label, text))
    
    @classmethod
    def from_csv(cls, fname):
        with open(fname) as f:
            return cls({text: label for text, label in
                        csv.reader(f, delimiter=',')})
        
    def update(self, other):
        self.text_to_label.update(other.text_to_label)

# 20 newgroups

* Take the training set.
* Remove every second label.

In [66]:
from sklearn.datasets import fetch_20newsgroups

def build_newsgroups(subset='train'):
    newsgroups_train = fetch_20newsgroups(subset=subset)
    label_names = {index: name for index, name in 
                   enumerate(newsgroups_train.target_names)}
    return Dataset({text: label_names[index] 
                    for text, index in zip(newsgroups_train.data, 
                                           newsgroups_train.target)})

def unlabel(dataset, p=0.5):
    for text, label in dataset.iter_texts():
        if random.random() > p:
            dataset.add_label(text, None)


train = build_newsgroups()
unlabel(train)

# Active sampling with classifier

Here is a straw man active sampler that:
* trains a classifier on the labelled data
* predicts the labels of unlabelled data
* selects text with a specific label profile

In [62]:
from collections import Counter

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


def iter_active_sample(dataset, cross_validate=True, accept=lambda i: True):
    X, y = [], []
    for text, label in list(dataset.iter_texts(lambda l: l is not None)):
        X.append(text)
        y.append(label)
    print('Got {} labelled samples'.format(len(X)))
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('clf', SGDClassifier(loss='log')),
    ])
    if cross_validate:
        print('Cross-validating')
        scores = cross_val_score(pipeline, X, y, cv=3)
        print("Cross-validated accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print('Refitting')
    pipeline.fit(X, y)
    X, y = [], []
    for text, label in list(dataset.iter_texts(lambda l: l is None)):
        X.append(text)
        y.append(label)
    print('Predicting {} unlabelled'.format(len(X)))
    for probs, text in zip(pipeline.predict_proba(X), X):
        predictions = dict(zip(pipeline.classes_, probs))
        if accept(predictions):
            yield (predictions, text)

In [77]:
for preds, X in iter_active_sample(train, 
                                   cross_validate=False, 
                                   accept=lambda preds: max(preds.values()) < 0.7):
    print('{}\t{}'.format(preds, repr(text)))
    break

Got 5569 labelled samples
Refitting




Predicting 5745 unlabelled
{'alt.atheism': 3.7254615575514024e-132, 'comp.graphics': 5.9078108888745949e-39, 'comp.os.ms-windows.misc': 0.50000107012676465, 'comp.sys.ibm.pc.hardware': 1.7130306339580281e-17, 'comp.sys.mac.hardware': 9.1751144692482513e-25, 'comp.windows.x': 1.5612169217704451e-46, 'misc.forsale': 3.7149814655818378e-17, 'rec.autos': 9.5618936804095389e-12, 'rec.motorcycles': 1.2703873625525703e-47, 'rec.sport.baseball': 0.49999892986367345, 'rec.sport.hockey': 2.1858647319627581e-34, 'sci.crypt': 2.894246204064473e-61, 'sci.electronics': 3.6691150545235971e-23, 'sci.med': 1.8785525903306693e-35, 'sci.space': 1.7350890327670784e-45, 'soc.religion.christian': 6.2996141248465761e-99, 'talk.politics.guns': 9.304136797469697e-89, 'talk.politics.mideast': 2.1207557875253542e-76, 'talk.politics.misc': 3.237194377273919e-95, 'talk.religion.misc': 8.71616735528201e-138}	'From: vince@sscl.uwo.ca\nSubject: Re: Early BBDDD Returns?\nOrganization: Social Science Computing Laborato

  np.exp(prob, prob)
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


# Data programming

We can weakly supervise using precise functions. Note that we have no fancy model above the labelling functions.

In [74]:
def sport(text):
    if 'puck' in text.lower():
        return 'rec.sport.hockey'
    elif 'home run' in text.lower():
        return 'rec.sport.baseball'


def iter_apply_funcs(dataset, funcs):
    for text, _ in dataset.iter_texts(accept=lambda l: l is None):
        for func in funcs:
            label = func(text)
            if label:
                yield label, text 

In [76]:
for label, text in iter_apply_funcs(train, [sport]):
    print('{}\t{}'.format(label, repr(text)))
    break

rec.sport.baseball	'From: vince@sscl.uwo.ca\nSubject: Re: Early BBDDD Returns?\nOrganization: Social Science Computing Laboratory\nNntp-Posting-Host: vaxi.sscl.uwo.ca\nLines: 11\n\nIn article <1993Apr16.073051.9160@news.cs.brandeis.edu>, st902415@pip.cc.brandeis.edu (Adam Levin) writes:\n> Just curious if anyone has started to standout early in the season in the\n> BB DDD this year. ...\n> \n> A concerned fan of the BB DDD,\n\nI am hoping to produce the first update of the BB DDD this week;\nplease send info about the most significant (longest, most critical,\netc.) home run that you have seen yet this season.\n\nVince.\n'
