# 0: DataDataData

Here's a sample factory function that builds the main abstraction, the `Dataset`, from the 20 Newsgroups corpus.

In [None]:
from dataset import Dataset
from sklearn.datasets import fetch_20newsgroups

In [None]:
def guns_dataset_factory(subset='train', labelled=False):
    """
    Build a binary "guns or not" Dataset from the 20 newsgroups corpus.

    subset - which newsgroups split to fetch (passed to fetch_20newsgroups)
    labelled - accepted but not used by this function

    Returns a Dataset built from a {text: bool} mapping, where the bool is
    True only for posts whose newsgroup is 'talk.politics.guns'.
    """
    corpus = fetch_20newsgroups(subset=subset)

    # Map every target index to whether it names the guns newsgroup.
    is_guns_by_target = {}
    for index, group_name in enumerate(corpus.target_names):
        is_guns_by_target[index] = group_name == 'talk.politics.guns'

    # Key on the post text; identical texts collapse to a single entry.
    text_to_flag = {}
    for text, target in zip(corpus.data, corpus.target):
        text_to_flag[text] = is_guns_by_target[target]

    return Dataset(text_to_flag)

# Build one dataset per newsgroups split: 'train' is used as the labelling
# pool and 'test' as the held-out evaluation set in the cells below.
pool = guns_dataset_factory(subset='train')
test = guns_dataset_factory(subset='test')

# 1.1 Annotation & reliability

* TODO Learning curve @ben
* TODO Agreement @ben


In [None]:
from samplers import Random
import itertools
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


def run_simulation(sampler, pool, test, seed_size=100):
    """
    Run a simulated learning-curve experiment.

    sampler - callable: sampler(pool) yields (text, label) pairs to label
              next; also provides fit_and_score(pool, X_test, y_test)
    pool - pool dataset (with oracle labels); it is mutated as labels are added
    test - test dataset (with oracle labels)
    seed_size - seed pool with this many labelled items

    Yields the result of sampler.fit_and_score once after seeding and once
    after every labelled batch, until the sampler returns an empty batch.
    """
    # get test data as parallel sequences of items and oracle labels
    X_test, y_test = zip(*test.oracle_items)
    # evaluate seed labels if present
    print('..seed..')
    pool.seed(seed_size)
    yield sampler.fit_and_score(pool, X_test, y_test)
    # sample until pool is empty, yielding train/test f1
    for i in itertools.count():
        # Draw the batch exactly once and materialise it, so the items that
        # are labelled are the same items that were checked for emptiness
        # (a second sampler call may return a different batch) and so that
        # adding labels cannot disturb a lazily evaluated sampler.
        batch = list(sampler(pool))
        if not batch:
            break
        print('..batch {}..'.format(i))
        for text, _ in batch:
            # The simulated annotator always answers with the oracle label.
            pool.add_label(text, pool.get_oracle_label(text))
        yield sampler.fit_and_score(pool, X_test, y_test)


def run_n_simulations(sampler, pool, test, n=5, seed_size=3000):
    """
    Repeat the simulated learning-curve experiment n times.

    sampler - a function for sampling from pool
    pool - pool dataset (with oracle labels)
    test - test dataset (with oracle labels)
    n - number of experiments for confidence intervals (default=5)
    seed_size - seed pool with this many labelled items (default=3000)

    Returns a generator with one list per field of the per-step results
    (train_sizes, train_scores, test_scores per the callers in this file);
    each list holds one tuple per step containing that step's value from
    every run.
    """
    per_run_fields = []
    for run_index in range(n):
        print('Running simulation {}..'.format(run_index))
        # NOTE(review): pool.copy is read as an attribute, not called --
        # presumably a property returning a fresh copy; confirm in Dataset.
        steps = list(run_simulation(sampler, pool.copy, test, seed_size=seed_size))
        # Transpose this run's steps into one sequence per result field.
        per_run_fields.append(zip(*steps))
    # Group each field across runs, then transpose it so that every entry
    # collects one step's values from all runs.
    fields_across_runs = zip(*per_run_fields)
    return (list(zip(*field)) for field in fields_across_runs)


# Run the simulated experiments with a random sampler that draws batches of
# 3000 items. The result is unpacked into three per-field sequences, which are
# printed and plotted as a learning curve in the cells below.
random_sampler = Random(batch_size=3000)
train_sizes, train_scores, test_scores = run_n_simulations(random_sampler, pool, test)

In [None]:
# Inspect the raw results: each is a list with one tuple per simulation step,
# holding that step's value from every run.
print(train_sizes)
print(train_scores)
print(test_scores)

In [None]:
import numpy as np
import matplotlib.pyplot as plt


def plot_learning_curve(train_sizes, train_scores, test_scores):
    """
    Plot mean training and test scores against mean training-set size.

    train_sizes - 2-D array of training-set sizes; averaged over axis 1
    train_scores - 2-D array of training scores; mean/std taken over axis 1
    test_scores - 2-D array of test scores; mean/std taken over axis 1

    As called in this file, axis 0 indexes simulation steps and axis 1
    indexes repeated runs. Each curve is drawn with a shaded band of one
    standard deviation either side of the mean.

    Returns the matplotlib.pyplot module so the caller can show or save the
    figure.
    """
    plt.clf()
    plt.figure()
    plt.xlabel("N training examples")
    plt.ylabel("F1 score")
    train_sizes_mean = np.mean(train_sizes, axis=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    # The test band must reflect the spread of the test scores themselves,
    # not the spread of the training scores.
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded +/- one standard deviation bands.
    plt.fill_between(train_sizes_mean, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes_mean, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    # Mean curves.
    plt.plot(train_sizes_mean, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes_mean, test_scores_mean, 'o-', color="g",
             label="Test score")

    plt.legend(loc="best")
    return plt
    

# Convert the results to arrays (plot_learning_curve reduces over axis 1),
# draw the learning curve and display it.
plot_learning_curve(np.asarray(train_sizes), np.asarray(train_scores), np.asarray(test_scores))
plt.show()

## Samplers

For example, `Random` is a sampler that draws unlabelled items at random.

## Manually label some examples

In [None]:
from annotator import AnnotationPane
from samplers import Random

# Create an annotation pane over the pool, fed by a Random sampler built
# with argument 10.
# NOTE(review): presumably 10 is the batch size and the pane is an interactive
# labelling widget -- confirm against annotator.AnnotationPane.
pane = AnnotationPane(pool, Random(10))

## See our new labels in the dataset

In [None]:
# Show the pool's label distribution (read as an attribute of the dataset),
# which should now reflect the labels added above.
print(pool.label_distribution)