# Label corruption synthetic experiment using simple dataset with 4 classes


In [1]:
%load_ext autoreload
%autoreload 2
%pdb on

import sys
import os
sys.path.append('..')
sys.path.append(os.path.join('..','..'))

import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from labelcorrupt import KnnLabelCorrupter, PercentLabelCorrupter, CorruptTrainLabelsClassifier
from label_corruption_experiment import make_dataset, synthetic_corruption_experiment, plot_results

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

def plot_X_y(X,y):
    """Scatter-plot the points in X, drawing one plotly trace per distinct label.

    Columns 0 and 1 of ``X`` are used as the horizontal and vertical
    coordinates; ``y`` is compared element-wise against each unique label to
    select the rows that belong to that trace.
    """
    traces = []
    for cls in np.unique(y):
        # Boolean row selector for the samples carrying this label.
        members = (y == cls)
        traces.append(
            go.Scatter(x=X[members, 0], y=X[members, 1], mode='markers')
        )
    py.iplot(traces)

Automatic pdb calling has been turned ON


In [2]:
# Load dataset
# Ask the project helper make_dataset for the 'double_moons' dataset with
# n_samples points, then plot it with one trace per label.
# random_state=0 is passed explicitly (presumably for reproducibility --
# confirm in label_corruption_experiment.make_dataset).
dataset = 'double_moons'
n_samples = 1000
X, y = make_dataset(dataset, n_samples, random_state=0)
plot_X_y(X,y)


In [3]:
# Basic usage
# Build a KnnLabelCorrupter, run fit_transform on (X, y) to obtain the labels
# y_knn_corrupt, and plot X grouped by those labels.
# NOTE(review): the precise meaning of corruption_size and noise_rate is
# defined in the labelcorrupt module -- confirm there before changing them.
knn_corrupter = KnnLabelCorrupter(corruption_size=10, noise_rate=0.2, random_state=0)
y_knn_corrupt = knn_corrupter.fit_transform(X,y)
plot_X_y(X,y_knn_corrupt)

In [4]:
# This is just a convenient wrapper around KnnLabelCorrupter
#  that takes an intuitive percentage for the corruption size, so that the
#  corruption size depends on the number of samples.
# It also allows for variance in the sizes of the corrupted chunks
#  (corrupt_chunk_perc_std).
# The resulting labels are plotted the same way as the other label sets.
percent_corrupter = PercentLabelCorrupter(corrupt_chunk_perc=0.01, corrupt_chunk_perc_std=0.005, 
                                          noise_rate=0.2, random_state=0)
y_perc_corrupt = percent_corrupter.fit_transform(X,y)
plot_X_y(X,y_perc_corrupt)

In [5]:
# Wrap any estimator in this to train on corrupt data
# Useful for using with GridSearchCV to find the best parameters
# Here the wrapped estimator is a default KNeighborsClassifier and the
# corrupter is the percent_corrupter built in the previous cell.
corrupt_wrapper = CorruptTrainLabelsClassifier(
    estimator=KNeighborsClassifier(),
    corrupter=percent_corrupter,
)
# Must prefix parameters by "estimator__" to access wrapped estimator
# The grid covers the odd n_neighbors values 1, 3, ..., 99 and is scored with
# 3-fold cross validation (cv=3).
cv_estimator = GridSearchCV(corrupt_wrapper, {'estimator__n_neighbors': range(1,100,2)}, cv=3)
cv_estimator.fit(X,y)

# NOTE: You MUST unwrap the final model and parameters
# best_params_ is keyed by the prefixed names used in the grid, so slice off
# the leading "estimator__" to recover the wrapped estimator's own names.
best_params = {k[len('estimator__'):]: v for k,v in cv_estimator.best_params_.items()}
# best_estimator_ is the wrapper; the wrapped estimator is its `estimator`
# attribute.
best_estimator = cv_estimator.best_estimator_.estimator
print(best_params)
print(best_estimator)

{'n_neighbors': 27}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=27, p=2,
           weights='uniform')


In [6]:
# Run experiment
# Calls the project helper synthetic_corruption_experiment on (X, y) with one
# config per classifier family. Each config supplies a display label, an
# estimator instance, and the hyper-parameter grid (cv_params) to search.
# The call returns `results` (plotted at the end of the notebook) and `D_arr`
# (plotted in the next cell).
#
# Integer-valued grids are built with the builtin `int`: the `np.int` alias
# was deprecated in NumPy 1.20 and removed in 1.24, and recent scikit-learn
# releases reject float values for integer parameters such as max_depth.
results, D_arr = synthetic_corruption_experiment(
    X, y,
    estimator_configs = [
        dict(label='DecisionTree',
             estimator=DecisionTreeClassifier(random_state=0),
             cv_params={'max_depth': range(1,15)}),
        dict(label='GaussianNB',
              estimator=GaussianNB(),
              cv_params={}),  # nothing to tune
        dict(label='KNN',
             estimator=KNeighborsClassifier(),
             cv_params={'n_neighbors': list(range(1,100,4))}),
        dict(label='SVM (Linear)',
             estimator=LinearSVC(),
             cv_params={'C': np.logspace(-3,3,25)}),
        dict(label='SVM (RBF)',
             estimator=SVC(),
             cv_params={'C': np.logspace(-2,2,4), 'gamma': np.logspace(-2,4,8)}),
        dict(label='RandomForest',
             estimator=RandomForestClassifier(),
             # Same values as before (1, 4, 7, 10) but as true ints.
             cv_params={'max_depth': [int(d) for d in np.round(np.linspace(1,10,4))],
                        'n_estimators': np.round(np.linspace(1,50,5)).astype(int) }),
        dict(label='QuadDiscAnalysis',
             estimator=QuadraticDiscriminantAnalysis(),
             cv_params={'reg_param': np.logspace(-4,0,10)}),
        dict(label='AdaBoost',
             estimator=AdaBoostClassifier(),
             cv_params={'n_estimators': np.round(np.linspace(1,50,5)).astype(int) }),
    ],
    # Experimental parameters
    n_cv_splits_outer = 3,
    n_cv_splits_inner = 3,
    n_corrupted_datasets = 3,
    # Corruption parameters
    noise_rate=0.2,
    corrupt_chunk_perc=0.02, # Approximate percentage to flip together
    corrupt_chunk_perc_std=0.02/2, # Variation around it
    # Inconsequential parameters
    n_jobs=4,
)

    0.05 s ( 0.00 m) - Create corrupted datasets
    9.20 s ( 0.15 m) -       Outer split 0 (3 inner splits)
    7.78 s ( 0.13 m) -       Outer split 1 (3 inner splits)
    7.00 s ( 0.12 m) -       Outer split 2 (3 inner splits)
   23.97 s ( 0.40 m) -     Synthetic corruption dataset 0
    5.55 s ( 0.09 m) -       Outer split 0 (3 inner splits)
    4.93 s ( 0.08 m) -       Outer split 1 (3 inner splits)
    4.83 s ( 0.08 m) -       Outer split 2 (3 inner splits)
   15.33 s ( 0.26 m) -     Synthetic corruption dataset 1
    5.18 s ( 0.09 m) -       Outer split 0 (3 inner splits)
    4.99 s ( 0.08 m) -       Outer split 1 (3 inner splits)
    5.85 s ( 0.10 m) -       Outer split 2 (3 inner splits)
   16.03 s ( 0.27 m) -     Synthetic corruption dataset 2
   55.35 s ( 0.92 m) -   Methodology (Oracle) CV with Clean Test
    4.83 s ( 0.08 m) -       Outer split 0 (3 inner splits)
    4.90 s ( 0.08 m) -       Outer split 1 (3 inner splits)
    4.78 s ( 0.08 m) -       Outer split 2 (3 inner 

In [7]:
def plot_X_y(X,y):
    """Scatter-plot the points in X with one distinctly-marked trace per class.

    Columns 0 and 1 of ``X`` are used as the horizontal and vertical
    coordinates. Each unique value in ``y`` becomes its own trace, named
    "Class 1", "Class 2", ... in the order returned by ``np.unique``, with its
    own marker symbol and size.

    The figure is a fixed 600x300 with zero margins, all axis decoration
    (grid, axis line, zero line, tick labels) hidden, and the legend placed at
    (0.8, 0.05).

    Symbols and sizes cycle when there are more than four classes, so no class
    is silently left off the plot.
    """
    symbols = ['circle','triangle-up','star','x']
    sizes = [6,7.6,7.5,7.5]
    # Create one trace per class
    data = [
        go.Scatter(
            name = 'Class %d' % (i+1),
            x = X[y==label,0],
            y = X[y==label,1],
            mode = 'markers',
            marker = dict(symbol=symbols[i % len(symbols)],
                          size=sizes[i % len(sizes)]),
        )
        for i,label in enumerate(np.unique(y))
    ]
    # The same "hide everything" settings are applied to both axes.
    axis_layout = dict(
        showgrid=False,
        showline=False,
        zeroline=False,
        showticklabels=False,
    )
    layout = go.Layout(
        legend=dict(x=0.8,y=0.05),
        xaxis=axis_layout,
        yaxis=axis_layout,
        autosize=False,
        width=600,
        height=300,
        # Plain dict instead of the deprecated top-level go.Margin class.
        margin=dict(l=0, r=0, t=0, b=0),
    )
    py.iplot(go.Figure(data=data,layout=layout))
# Plot the original (X, y) first ...
plot_X_y(X,y)
# ... then the first three entries of D_arr returned by
# synthetic_corruption_experiment (3 matches the n_corrupted_datasets value
# passed to it). Each entry exposes its data as .X and its labels as .y;
# presumably these are the synthetically corrupted datasets -- confirm in
# label_corruption_experiment.
for i in range(3):
    plot_X_y(D_arr[i].X,D_arr[i].y)

## Explanation of Model Selection Methodologies
Let $y$ be the clean labels (in a real situation these are hidden). Let $c_1()$ and $c_2()$ be label corruption functions (note that both depend on the data distribution $X$). In a real situation $c_1()$ is unknown/hidden and $c_2()$ is some sort of synthetic corruption (in our case KNN label corruption).

"CV" (blue) - Traditional cross validation
- train: $c_1(y)$
- test: $c_1(y)$

"Corrupt CV" (orange) - Cross validation where the training labels are synthetically corrupted before training but are left alone for testing. Note that the train/test split is computed based on $c_1(y)$ and then the corruption happens, i.e. train_test_split -> corruption_of_train.
- train: $c_2(c_1(y))$
- test: $c_1(y)$

"(Oracle) CV with Clean Test" (green) - Oracle/unrealistic cross validation where the model can only train on the corrupt labels but can test on clean labels. This is the best that any model could do if it can only train on corrupt labels.  This is what "Corrupt CV" is trying to mimic in terms of model selection.
- train: $c_1(y)$
- test: $y$

"(Oracle) CV with Clean Train & Test" (red) - Cross validation with full access to clean labels for both train and test.  This is the classic academic setting where we assume there is no label corruption. The difference between the previous oracle and this one shows how much better the method could do if given perfectly clean labels.
- train: $y$
- test: $y$

## Observations
- The "Corrupt CV" curve approximately tracks the "(Oracle) CV with Clean Test" with a shift.  This shows that "Corrupt CV" is likely a better model selection methodology than "CV".
- "Corrupt CV" often performs significantly better than "CV" because it does not overfit the corrupt labels.

In [9]:
# Render the `results` object returned by synthetic_corruption_experiment
# using the project's plot_results helper.
plot_results(results)