In [43]:
import matplotlib.pyplot as plt
from loaddata import loadtrain
from sklearn.base import clone
from sklearn.metrics import f1_score
from oob import gen_oob
import numpy as np
from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import LinearSVC
import ipywidgets as widgets
from scipy import sparse
from IPython.display import display, clear_output
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load the training set once; every later cell reads (and the tf-idf cell overwrites) these globals.
# NOTE(review): presumably X is a samples-by-features count matrix and y a 2-D array with one
# column per label (later cells index y[:, label] and use y.shape[1]) -- confirm in loaddata.
X, y = loadtrain()

# What Didn't Work

## Off-the-Shelf Gradient-Boosted Trees

* XGBoost was too slow

* LGBM was slow and surprisingly bad

## Deep Learning

... no matter how much compute I dumped on it

![netgraph]

[netgraph]: ./model.png

## Too-clever Ensembling

* Soft-margin SVM with randomized feature subselect
* XGBoost/LGBM with wide forests and highly randomized training

* All too slow and not good

# What Did Work (At Least A Little)

## Linear SVM

Base model is linear SVM w/ C=1

## Simple Data Transformation

* Tf-Idf
* Scaling

Better results, also faster training

In [44]:
# Reweight the features with tf-idf (its own normalisation switched off), then
# rescale each sample to unit norm as a separate step.
# NOTE: this overwrites X in place, so re-running the cell without reloading the
# data applies the transformation a second time.
X = Normalizer().fit_transform(
    TfidfTransformer(norm=None).fit_transform(X)
)

## Data Augmentation

Some rare labels, so generate more

### Generative Model

$\mathbf{x} \sim \text{Multinomial}(n, \boldsymbol{\mu})$

$n \sim \text{Gaussian}(\mu_{words \mid label}, \sigma_{words \mid label})$

In [45]:
# Base model (linear SVM, C=1). It is never fitted directly: beforeafter() clones it
# so that each benchmark starts from a fresh, unfitted estimator.
m = LinearSVC(C=1)

# The widget's bounds are inclusive, so the largest valid column index is y.shape[1] - 1.
@interact_manual(label=widgets.BoundedIntText(value=22,min=0,max=y.shape[1] - 1,step=1,description='Label:'))
def drawcomparison(label):
    """Compare mean feature values of real and generated samples for one label.

    Draws three stacked panels: the per-feature mean over the real samples
    tagged with ``label``, the per-feature mean over samples produced by
    ``gen_oob`` for the same label, and the difference between the two.

    Parameters
    ----------
    label : int
        Column index into ``y`` selecting the label to inspect.
    """
    label = int(label)
    n_aug = 2000  # number of synthetic samples drawn for the comparison

    newdata = gen_oob(X, y, label, n_aug)

    # Per-feature means, flattened to plain 1-D arrays so they can be plotted
    # and subtracted element-wise.
    Xdata = np.asarray(np.mean(X[y[:, label] == 1], axis=0)).reshape(-1)
    augdata = np.asarray(np.mean(newdata, axis=0)).reshape(-1)

    fig, (original, aug, diff) = plt.subplots(3, 1, figsize=(10, 8))

    original.plot(Xdata)
    original.set_title('Histogram of Original Feature Frequencies')
    original.set_xlabel('Feature')
    original.set_ylabel('Frequency')

    aug.plot(augdata)
    aug.set_title('Histogram of Generated Feature Frequencies')
    aug.set_xlabel('Feature')
    aug.set_ylabel('Frequency')

    diff.plot(Xdata - augdata)
    diff.set_title('Difference')
    # Use the original data's scale so the difference is read relative to it.
    # Skip when the span is zero or NaN, which matplotlib cannot use as limits.
    span = np.max(Xdata)
    if span > 0:
        diff.set_ylim([-span, span])
    diff.set_xlabel('Feature')
    diff.set_ylabel('Original - Augmented')

    fig.tight_layout()
    plt.show()

### Results: Marginal Improvement

* But not always, and not for all labels.
* CV with sparse labels is hard because the training data has a lot of randomness

In [46]:
# The widget's bounds are inclusive, so the largest valid column index is y.shape[1] - 1.
@interact_manual(label=widgets.BoundedIntText(value=22,min=0,max=y.shape[1] - 1,step=1,description='Label:'))
def beforeafter(label):
    """Print the F1 score for one label with and without augmented training data.

    The data is split once with ``train_test_split`` (no fixed seed, so every
    invocation uses a different split). Two clones of the base model ``m`` are
    fitted: one on the training split as-is, one on the training split plus
    ``n_aug`` samples generated by ``gen_oob`` for ``label``. Both are scored
    on the untouched test split, so the printed "CV F1" values come from a
    single hold-out split rather than from k-fold cross-validation.

    Parameters
    ----------
    label : int
        Column index into ``y`` selecting the label to benchmark.
    """
    display('Generating data for {}'.format(label))
    clear_output(wait=True)
    label = int(label)
    n_aug = 2000  # number of synthetic positive samples to add
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    newtrain = gen_oob(X_train, y_train, label, n_aug)

    display('Stacking')
    clear_output(wait=True)
    # Use the same boolean "is this label set" target for both models and for
    # scoring, so the before/after comparison differs only in the training rows.
    target_train = y_train[:, label] == 1
    target_test = y_test[:, label] == 1
    Xafter = sparse.vstack([X_train, newtrain])
    # Every generated sample is a positive example of the label.
    yafter = np.hstack([target_train, np.ones(n_aug, dtype=bool)])

    beforemodel = clone(m)
    aftermodel = clone(m)

    display('Fitting benchmarks...')
    clear_output(wait=True)

    beforemodel.fit(X_train, target_train)
    aftermodel.fit(Xafter, yafter)

    bp = beforemodel.predict(X_test)
    ap = aftermodel.predict(X_test)

    bf1 = f1_score(target_test, bp)
    af1 = f1_score(target_test, ap)

    print('CV F1 before augmenting:\t{}'.format(bf1))
    print('CV F1 after augmenting:\t{}'.format(af1))

## Label Modeling

* Biggest lift I got
* Train a model to capture feature -> label association, and another to capture label -> label association

# Things That Will Probably Work, But Haven't Yet

## Hierarchical Classification/Subsets

* Data is hierarchical, so take advantage of that
* Some improvement in F1 for some labels, but slightly depresses macro F1
* Future Work:
    * Softer grouping
    * Actual math