# DataDataData

Here's a sample class wrapping the main abstraction, the `Dataset`.

In [1]:
import csv
import random


class Dataset(object):
    """ Encapsulates unlabelled and labelled examples.

    Examples live in ``text_to_label``, a dict mapping each text to its
    label. A label of ``None`` marks an example as unlabelled.
    """
    def __init__(self, text_to_label=None):
        """ Wraps ``text_to_label``; a missing or empty mapping starts a fresh dict. """
        self.text_to_label = text_to_label or {}
        
    def add_label(self, text, label):
        """ Sets (or overwrites) the label stored for ``text``. """
        self.text_to_label[text] = label
        
    def iter_texts(self, accept=lambda label: True):
        """ Returns a generator of (text, label) pairs whose label passes ``accept``.

        The generator reads the live dict, so it reflects the state at
        iteration time rather than at call time.
        """
        return ((text, label) for text, label in 
                 self.text_to_label.items() if accept(label))
    
    def to_csv(self, fname):
        """ Writes one ``label,text`` row per example to ``fname``.

        ``None`` labels are written as an empty field by csv.writer.
        """
        # newline='' hands line-ending control to the csv module, so newlines
        # embedded in texts are written and read back unchanged.
        with open(fname, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for text, label in self.text_to_label.items():
                w.writerow((label, text))
    
    @classmethod
    def from_csv(cls, fname):
        """ Builds a Dataset from a file of ``label,text`` rows as written by to_csv.

        An empty label field is loaded as ``None`` (unlabelled); blank rows
        are skipped. All other labels are loaded as strings.
        """
        text_to_label = {}
        with open(fname, newline='') as f:
            for row in csv.reader(f, delimiter=','):
                if not row:
                    continue
                # Column order must mirror to_csv: label first, then text.
                label, text = row
                text_to_label[text] = label or None
        return cls(text_to_label)
        
    def update(self, other):
        """ Merges ``other``'s examples into this dataset; ``other`` wins on shared texts. """
        self.text_to_label.update(other.text_to_label)

# 20 newsgroups

* Take the training set.
* Randomly remove about half of the labels.

In [2]:
from sklearn.datasets import fetch_20newsgroups

def build_newsgroups(subset='train'):
    """ Fetches the requested 20 newsgroups subset and wraps it in a Dataset.

    Every post's text is mapped to the name of its newsgroup, looked up from
    ``target_names`` by the post's target index.
    """
    bunch = fetch_20newsgroups(subset=subset)
    name_by_index = dict(enumerate(bunch.target_names))
    text_to_label = {}
    for text, index in zip(bunch.data, bunch.target):
        text_to_label[text] = name_by_index[index]
    return Dataset(text_to_label)

def unlabel(dataset, p=0.5):
    """ Randomly strips labels from ``dataset`` in place.

    One random draw in [0, 1) is made per example; the label is replaced
    with ``None`` whenever the draw exceeds ``p``, so ``p`` is roughly the
    fraction of labels that are kept.
    """
    for text, _ in dataset.iter_texts():
        if p < random.random():
            dataset.add_label(text, None)


# Seed the stdlib RNG so the random label removal below (and the shuffles in
# later cells) repeat on a fresh Restart & Run All; the value is arbitrary.
random.seed(0)
train = build_newsgroups()
unlabel(train)

# Active sampling with classifier

Here is a straw man active sampler that:
* trains a classifier on the labelled data
* predicts the labels of unlabelled data
* selects text with a specific label profile

In [3]:
from collections import Counter

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


def iter_active_sample(dataset, accept=lambda i: True, limit=None, cross_validate=True):
    """ Trains a model from the labelled data and samples from its predictions.

    Yields ``(text, predictions)`` for unlabelled examples whose predictions
    evaluate True in accept(). ``predictions`` is a dict mapping each class
    known to the fitted pipeline to its predicted probability.

    dataset: object exposing ``iter_texts(accept)``; a label of None means
        unlabelled.
    accept: callable taking the predictions dict; a truthy result selects
        the example.
    limit: stop after this many examples have been yielded; None or 0 means
        no limit.
    cross_validate: when true, print 3-fold cross-validated accuracy before
        the final fit.

    This is a generator: no training happens until it is first advanced.
    """
    labelled = list(dataset.iter_texts(lambda l: l is not None))
    X = [text for text, _ in labelled]
    y = [label for _, label in labelled]
    print('Got {} labelled samples'.format(len(X)))
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        # NOTE(review): newer scikit-learn releases spell this loss
        # 'log_loss' -- confirm against the installed version.
        ('clf', SGDClassifier(loss='log')),
    ])
    if cross_validate:
        print('Cross-validating')
        scores = cross_val_score(pipeline, X, y, cv=3)
        print("Cross-validated accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print('Refitting')
    pipeline.fit(X, y)
    unlabelled = [text for text, _ in dataset.iter_texts(lambda l: l is None)]
    print('Predicting {} unlabelled'.format(len(unlabelled)))
    if not unlabelled:
        # Nothing left to predict; scikit-learn estimators reject empty
        # input, so finish cleanly instead of calling predict_proba.
        return
    yielded = 0
    for probs, text in zip(pipeline.predict_proba(unlabelled), unlabelled):
        predictions = dict(zip(pipeline.classes_, probs))
        if accept(predictions):
            yield text, predictions
            yielded += 1
            if limit and yielded == limit:
                break
    

def iter_random_unlabelled(dataset, limit=None):
    """ Yields the dataset's unlabelled (text, label) pairs in random order.

    All pairs whose label is None are collected and shuffled up front.
    Iteration stops once ``limit`` pairs have been yielded; None or 0 means
    no limit.
    """
    pool = [pair for pair in dataset.iter_texts(accept=lambda label: label is None)]
    random.shuffle(pool)
    for count, (text, label) in enumerate(pool, start=1):
        yield text, label
        if limit and count == limit:
            return

In [4]:
# Show up to five unlabelled posts the model is unsure about (no class at or
# above 0.7 probability), each with its three most probable labels.
for text, preds in iter_active_sample(
        train,
        accept=lambda preds: max(preds.values()) < 0.7,
        limit=5,
        cross_validate=True):
    print('Labels\t{}\nText\t{}\n'.format(
        sorted(preds.items(), key=lambda i: i[1], reverse=True)[:3],
        repr(text),
    ))

Got 5656 labelled samples
Cross-validating




Cross-validated accuracy: 0.73 (+/- 0.02)
Refitting




Predicting 5658 unlabelled
Labels	[('comp.os.ms-windows.misc', 0.49999519650775315), ('soc.religion.christian', 0.49998825435273836), ('comp.windows.x', 1.6549139380699382e-05)]
Text	'From: yuting@Xenon.Stanford.EDU (Eugene Y. Kuo)\nSubject: Any updated Canon BJ-200 driver?\nOrganization: Computer Science Department, Stanford University.\nLines: 8\n\nHi ... can anyone tell me where I can get a copy of updated Canon BJ-200\nprinter driver for Windows 3.1, if any ? I have ver 1.0 which comes with\nmy BJ-200 printer, I just wonder if there is any newer version.\n\nThanks very much, please email.\n\n\n\n'

Labels	[('rec.autos', 0.5), ('talk.politics.guns', 0.5), ('rec.sport.baseball', 1.4280381443855174e-27)]

Labels	[('comp.windows.x', 0.59616316267707037), ('comp.sys.mac.hardware', 0.40383649439565755), ('rec.motorcycles', 2.260889691136638e-07)]
Text	"From: pallis@server.uwindsor.ca (PALLIS  DIMITRIOS        )\nSubject: Re: Genoa Blitz 24 hits 1600x1200x256 NI !\nLines: 3\n\ni am sorry,

  np.exp(prob, prob)
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


In [5]:
# Peek at three randomly chosen unlabelled posts.
for text, label in iter_random_unlabelled(train, limit=3):
    print('Label\t{}\nText\t{}\n'.format(label, repr(text)))

Label	None
Text	'From: ccdarg@dct.ac.uk (Alan Greig)\nSubject: Re: ATF BURNS DIVIDIAN RANCH! NO SURVIVORS!!!\nOrganization: Dundee Institute of Technology\nLines: 22\n\nIn article <C5tEnu.112F@ns1.nodak.edu>, green@plains.NoDak.edu (Bill Green) writes:\n\n> And a few other questions.  Like I said, I believe the actions taken, in\n> general, were proper.  But I still have some reservations.\n\nWe\'ve heard a lot of talk about brainwashing in Waco but the brainwashing\nof the general population never ceases to amaze me. Here is an\nexample of action being taken which results in the worst possible\noutcome and despite people\'s deep intuition telling them something\nis wrong the programming will still cut in and say that the\nagents probably acted in good faith. NO THEY DIDN\'T. They either did\nnot have enough information to act in good faith or else they acted\nknowing the risk. Sums up human stupidity all over and one of these\ndays it will destroy the fucking planet: "Oh sorry. Didn\'

# Data programming

We can weakly supervise using precise functions. Note that we have no fancy model above the labelling functions.

In [6]:
from collections import Counter


def hockey_puck(text):
    """ Labelling function: votes 'rec.sport.hockey' if the text mentions 'puck'.

    The match is a case-insensitive substring test; returns None otherwise.
    """
    lowered = text.lower()
    return 'rec.sport.hockey' if 'puck' in lowered else None


def baseball_home_run(text):
    """ Labelling function: votes 'rec.sport.baseball' if the text mentions 'home run'.

    The match is a case-insensitive substring test; returns None otherwise.
    """
    lowered = text.lower()
    return 'rec.sport.baseball' if 'home run' in lowered else None
    

def hockey_NHL(text):
    """ Negative labelling function: votes '!rec.sport.hockey' when 'nhl' is absent.

    The match is a case-insensitive substring test; returns None when the
    text does mention 'nhl'.
    """
    if 'nhl' in text.lower():
        return None
    return '!rec.sport.hockey'


def iter_apply_funcs(data, funcs):
    """ Applies labelling functions to each text and yields the winning vote.

    data: iterable of (text, label) pairs; the label is ignored.
    funcs: callables taking a text and returning a label, or None to abstain.

    Yields ``(text, votes)`` where ``votes`` is a one-element list
    ``[(label, count)]`` holding the most frequent non-None label. Texts on
    which every function abstains are skipped, as is everything when
    ``funcs`` is empty.
    """
    for text, _ in data:
        # Abstentions are not votes: counting None let a majority of
        # abstaining functions suppress a label that was actually cast.
        tally = Counter(vote for vote in (func(text) for func in funcs)
                        if vote is not None)
        if not tally:
            continue
        yield text, tally.most_common(1)

In [7]:
# Run the three labelling functions over 250 random unlabelled posts and
# print every post that received a winning vote.
for text, votes in iter_apply_funcs(
        iter_random_unlabelled(train, limit=250),
        [hockey_NHL, hockey_puck, baseball_home_run]):
    print('Label\t{}\nText\t{}\n'.format(votes, repr(text)))

Label	[('!rec.sport.hockey', 1)]

Label	[('!rec.sport.hockey', 1)]
Text	"From: rudy@netcom.com (Rudy Wade)\nSubject: Re: YANKKES 1 GAME CLOSER\nArticle-I.D.: netcom.rudyC52rBD.86w\nOrganization: Home of the Brave\nLines: 18\n\nMy god, hope we don't have to put up with this kind of junk all season!\n\nIn article <002251w.5.734117130@axe.acadiau.ca> 002251w@axe.acadiau.ca (JASON WALTER WORKS) writes:\n>    The N.Y.Yankees, are now one game closer to the A.L.East pennant.  They \n>clobbered Cleveland, 9-1, on a fine pitching performance by Key, and two \n>homeruns by Tartabull(first M.L.baseball to go out this season), and a three \n\nHow many home runs by Tartabull?  Just 1, right, you must be thinking\nof Dean Palmer or Juan Gonzalez (both of Texas) who each had 2 homers.\n\n>run homer by Nokes.  For all of you who didn't pick Boggs in your pools, \n>tough break, he had a couple hits, and drove in a couple runs(with many more \n\nI don't know how many to follow, but he was 1 for 4.\n\n>

# Annotation tool


In [8]:
from IPython.display import clear_output, display
import ipywidgets as widgets
import functools

In [9]:
class AnnotationPane(object):
    """ Minimal widget for labelling a queue of items one at a time.

    One button per entry of the label scheme is shown above the current
    item. Clicking a button appends ``(item, label)`` to ``self.responses``
    and advances to the next queued item; once the queue is empty the
    buttons are disabled and 'Finished' is shown.
    """
    def __init__(self, queue, label_scheme):
        """ Builds the widgets, shows the first item and displays the pane.

        queue: iterable of items to annotate (copied into a list).
        label_scheme: iterable of (button description, label) pairs.
        """
        self.text_pane = widgets.HTML()
        self.buttons = []
        for desc, label in label_scheme:
            button = widgets.Button(
                description=desc
            )
            # Bind the label now; the widget supplies the button argument.
            button.on_click(functools.partial(self.on_click, label))
            self.buttons.append(button)
        
        self.view = widgets.VBox([
            widgets.HBox(self.buttons),
            self.text_pane,  
        ])
        self.queue = list(queue)
        # State must exist before anything can be drawn or clicked.
        self.responses = []
        self.text = None
        self.draw()
        display(self.view)

    def on_click(self, label, button):
        """ Records ``label`` for the item currently shown, then advances. """
        # Use the pane's own current item, not whatever global happens to be
        # called `text` in the notebook namespace.
        self.responses.append((self.text, label))
        self.draw()
        
    def draw(self):
        """ Shows the next queued item, or the finished state when none remain. """
        import html

        if not self.queue:
            for button in self.buttons:
                button.disabled = True
            self.text_pane.value = '<p>Finished</p>'
        else:
            self.text = self.queue.pop(0)
            # Escape so markup-like content (e.g. <user@host>) is displayed
            # literally instead of being interpreted as HTML.
            self.text_pane.value = '<p>{}</p>'.format(html.escape(str(self.text)))

In [10]:
# iter_random_unlabelled yields (text, label) pairs; queue only the texts so
# the pane shows and records each post rather than a (text, None) tuple.
pane = AnnotationPane([text for text, _ in iter_random_unlabelled(train, limit=10)],
                      [('Yes', True),
                       ('No', False)])