# 0: DataDataData

Here's a sample class wrapping the main abstraction, the `Dataset`.

* TODO Support for multiple users? @ben

In [98]:
from collections import Counter
import csv
import random

from sklearn.datasets import fetch_20newsgroups


class Dataset(object):
    """Encapsulates unlabelled and labelled examples.

    Examples live in ``text_to_label``, a dict mapping each text to its
    label. An unlabelled example carries the label ``None``.
    """

    # How csv.writer renders the labels used in this file, mapped back to
    # the original Python values (None is written as the empty string).
    _CSV_LABELS = {'': None, 'True': True, 'False': False}

    def __init__(self, text_to_label=None):
        """Wrap ``text_to_label`` (used as-is, not copied) or start empty."""
        # `is None` rather than `or {}` so that a caller-supplied empty dict
        # is shared with the caller exactly like a non-empty one.
        self.text_to_label = {} if text_to_label is None else text_to_label

    def add_label(self, text, label):
        """Set (or overwrite) the label of ``text``."""
        self.text_to_label[text] = label

    def __iter__(self):
        """Iterate over ``(text, label)`` pairs."""
        return iter(self.text_to_label.items())

    def to_csv(self, fname):
        """Write one ``label,text`` row per example to ``fname``."""
        # newline='' lets the csv module control line endings, which keeps
        # multi-line texts intact and avoids blank rows on Windows.
        with open(fname, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for text, label in self.text_to_label.items():
                w.writerow((label, text))

    @classmethod
    def from_csv(cls, fname):
        """Load a Dataset from a file written by ``to_csv``.

        Rows are read in the same ``label,text`` order that ``to_csv``
        writes. The cells ``''``, ``'True'`` and ``'False'`` are turned back
        into ``None``, ``True`` and ``False``; any other label stays a string.
        """
        text_to_label = {}
        with open(fname, newline='') as f:
            for row in csv.reader(f, delimiter=','):
                if not row:
                    # Blank line, e.g. from a file written without newline=''.
                    continue
                label, text = row
                text_to_label[text] = cls._CSV_LABELS.get(label, label)
        return cls(text_to_label)

    def update(self, other):
        """Merge the examples of ``other`` in; its labels win on conflict."""
        self.text_to_label.update(other.text_to_label)

    @property
    def label_distribution(self):
        """Return a dict mapping each label to its number of examples."""
        return dict(Counter(self.text_to_label.values()))
    

def build_newsgroups(subset='train'):
    """Fetch the 20 newsgroups data and return it as a Dataset.

    Each text is labelled True when its newsgroup is 'talk.politics.guns'
    and False otherwise. ``subset`` is passed through to
    ``fetch_20newsgroups``.
    """
    bunch = fetch_20newsgroups(subset=subset)
    # Collapse the newsgroup names to a boolean target: guns or not.
    is_guns = {}
    for index, name in enumerate(bunch.target_names):
        is_guns[index] = name == 'talk.politics.guns'
    text_to_label = {}
    for text, index in zip(bunch.data, bunch.target):
        text_to_label[text] = is_guns[index]
    return Dataset(text_to_label)

def unlabel(dataset, p=0.01):
    """Randomly reset labels to None, in place.

    One random draw is made per example; the label is kept only when the
    draw does not exceed ``p``, so ``p`` is roughly the fraction of examples
    that keep their label.
    """
    for entry in dataset:
        text, _ = entry
        drop = random.random() > p
        if not drop:
            continue
        dataset.add_label(text, None)


# Build the training pool, reset most of its labels to None (unlabel's
# default p=0.01), then show how many examples remain under each label.
train = build_newsgroups()
unlabel(train)
print(train.label_distribution)

{None: 11196, False: 111, True: 7}


# 1.1 Annotation & reliability

* TODO Learning curve @ben
* TODO Agreement @ben


## Samplers

For example, a random unlabelled sampler.

In [99]:
import random


class Random(object):
    """Sampler that yields the unlabelled examples in random order.

    ``limit`` caps the number of examples yielded per call; a falsy limit
    (``None`` or ``0``) means no cap.
    """

    def __init__(self, limit=None):
        self.limit = limit

    def __call__(self, dataset):
        """Yield shuffled ``(text, None)`` pairs drawn from ``dataset``."""
        candidates = [(text, label) for text, label in dataset
                      if label is None]
        random.shuffle(candidates)
        for count, pair in enumerate(candidates, start=1):
            yield pair
            if self.limit and count == self.limit:
                return

## Annotation tool

In [103]:
from IPython.display import clear_output, display
import ipywidgets as widgets
import functools


class AnnotationPane(object):
    """Notebook widget for labelling sampled examples one at a time.

    A row of buttons (one per entry in ``LABELS``) sits above the text of
    the current example. Clicking a button stores that label in the dataset
    and shows the next queued example.
    """

    # (button description, label stored in the dataset)
    LABELS = [
        ('Yes', True),
        ('No', False),
    ]

    def __init__(self, dataset, sampler):
        """Build the widgets, queue the sampled texts and display the pane.

        ``sampler(dataset)`` must yield pairs whose first item is the text;
        it is consumed completely up front.
        """
        self.text_pane = widgets.HTML()
        self.buttons = []
        for desc, label in self.LABELS:
            button = widgets.Button(
                description=desc
            )
            # Bind the label now; the widget supplies the button on click.
            button.on_click(functools.partial(self.on_click, label))
            self.buttons.append(button)

        self.view = widgets.VBox([
            widgets.HBox(self.buttons),
            self.text_pane,
        ])
        self.queue = [text for text, label in sampler(dataset)]
        self.dataset = dataset
        self.draw()
        display(self.view)

    def on_click(self, label, button):
        """Store ``label`` for the current text and move to the next one."""
        self.dataset.add_label(self.text, label)
        self.draw()

    def draw(self):
        """Show the next queued text, or disable the buttons when done."""
        # Local import: keeps this block self-contained in the notebook cell.
        import html

        if not self.queue:
            for button in self.buttons:
                button.disabled = True
            self.text_pane.value = '<p>Finished</p>'
        else:
            self.text = self.queue.pop(0)
            # Escape the raw text so characters such as '<', '>' and '&'
            # are displayed literally instead of being parsed as markup.
            self.text_pane.value = '<p>{}</p>'.format(html.escape(self.text))

## Manually label some examples

In [104]:
# Hand-label up to 10 randomly drawn unlabelled examples from `train`.
pane = AnnotationPane(train, Random(10))

## See our new labels in the dataset

In [105]:
# Label counts after the manual annotation pass.
print(train.label_distribution)

{None: 11176, False: 121, True: 17}


# 1.2: Active learning

Here is a straw man active sampler that:
* trains a classifier on the labelled data
* predicts the labels of unlabelled data
* selects text with a specific label profile

In [113]:
from collections import Counter

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


def train_classifier(dataset, cv=3):
    """Fit a bag-of-words SGD classifier on labelled examples.

    ``dataset`` is an iterable of ``(text, label)`` pairs; every text must
    be a ``str`` and every label must equal True or False. When ``cv`` is
    truthy, a cross-validated accuracy with that many folds is printed
    before the final fit on all the data. Returns the fitted pipeline.
    """
    texts, targets = [], []
    for text, label in dataset:
        assert isinstance(text, str)
        assert label in {True, False}
        texts.append(text)
        targets.append(label)
    print('Got {} labelled samples'.format(len(texts)))

    # NOTE(review): loss='log' looks like the older scikit-learn spelling of
    # logistic loss - confirm it is accepted by the installed version.
    steps = [
        ('vectorizer', CountVectorizer()),
        ('clf', SGDClassifier(loss='log')),
    ]
    pipeline = Pipeline(steps)

    if cv:
        print('Cross-validating')
        scores = cross_val_score(pipeline, texts, targets, cv=cv)
        mean, spread = scores.mean(), scores.std() * 2
        print("Cross-validated accuracy: %0.2f (+/- %0.2f)" %
              (mean, spread))

    print('Refitting')
    pipeline.fit(texts, targets)
    return pipeline


class Active(object):
    """Sampler that selects examples by a classifier's predictions.

    Parameters:
        pipeline: object exposing ``predict_proba(texts)`` and ``classes_``.
        query: predicate on a ``(text, label)`` pair choosing which examples
            to score; by default those whose label is None.
        accept: predicate on the ``{class: probability}`` dict of one
            example, deciding whether to yield it; by default always True.
        limit: maximum number of examples to yield per call; a falsy value
            (``None`` or ``0``) means no cap.
    """

    def __init__(self, pipeline,
                 query=lambda i: i[1] is None,
                 accept=lambda i: True, limit=None):
        self.pipeline = pipeline
        self.limit = limit
        self.query = query
        self.accept = accept

    def __call__(self, dataset):
        """Yield ``(text, predictions)`` for each accepted queried example."""
        X = [text for text, label in dataset if self.query((text, label))]
        print('Predicting {} unlabelled'.format(len(X)))
        if not X:
            # Nothing matched the query: skip the prediction call rather
            # than handing the pipeline an empty batch.
            return
        yielded = 0
        for probs, text in zip(self.pipeline.predict_proba(X), X):
            predictions = dict(zip(self.pipeline.classes_, probs))
            if self.accept(predictions):
                yield text, predictions
                yielded += 1
                if self.limit and yielded == self.limit:
                    break

In [114]:
# Train on the examples of `train` that currently carry a True/False label.
labelled = ((text, label) for text, label in train if label in {True, False})
pipeline = train_classifier(labelled, cv=3)
# Queue up to 10 unlabelled texts whose predicted probability of True
# exceeds 0.8, and annotate them by hand.
pane = AnnotationPane(train, Active(pipeline, 
                                    accept=lambda pred: pred[True] > 0.8,
                                    limit=10))

Got 138 labelled samples
Cross-validating




Cross-validated accuracy: 0.84 (+/- 0.06)
Refitting




Predicting 11176 unlabelled


  np.exp(prob, prob)


In [115]:
# Label counts after the active-learning annotation pass.
print(train.label_distribution)

{None: 11146, False: 137, True: 31}


# 2: Live shared task

We can weakly supervise using precise functions. Note that we have no fancy model above the labelling functions.

* TODO Incorporate `snorkel` @ben
* TODO Setup task @will
  * Create `/submissions/$USER`
  * Copy notebook template
  * Load shared task data pool.
* TODO Push task @will
  * Save data to `.csv`
  * `git commit/push`