## Label Quality Demo
This demo shows the `labelquality` module, which provides tools for estimating label quality, predicting whether a label is corrupt, and correcting labels based on the quality estimates.

In [1]:
%pdb on
import sys
import numpy as np

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
from sklearn.neighbors import KNeighborsClassifier

sys.path.append('../../')
from labelcorrupt import *
from labelquality import ProbLabelQuality, MetaProbLabelCorruptClassifier, ProbLabelCorrector

sys.path.append('../')
from label_corruption_experiment import make_dataset

def plot_X_y(X,y,title=None):
    """Scatter-plot a 2-D dataset with one plotly trace per class label.

    Parameters
    ----------
    X : array indexed as ``X[mask, 0]`` / ``X[mask, 1]``
        Column 0 is drawn on the x-axis and column 1 on the y-axis.
    y : array compared element-wise against each unique label
        Class label for every row of ``X``.
    title : str, optional
        Figure title.

    The figure is rendered inline through ``py.iplot``; nothing is returned.
    """
    traces = []
    for label in np.unique(y):
        members = (y == label)  # boolean row mask selecting this class
        traces.append(go.Scatter(x=X[members,0], y=X[members,1], mode='markers'))
    figure = go.Figure(data=traces, layout=go.Layout(title=title))
    py.iplot(figure)
    
def plot_with_quality(X,y,y_quality,title=None):
    """Scatter-plot a 2-D dataset, outlining each point by its label quality.

    One trace is drawn per unique value of ``y``. The marker outline of each
    point is coloured on a black-to-white scale from its ``y_quality`` value:
    0 is black, 1 is white.

    Parameters
    ----------
    X : array-like, indexed as ``X[mask, 0]`` / ``X[mask, 1]``
        Column 0 is drawn on the x-axis and column 1 on the y-axis.
    y : array-like
        Label of every row of ``X``; decides which trace a point belongs to.
    y_quality : array-like
        One quality value per row of ``X``, expected in ``[0, 1]``.
    title : str, optional
        Figure title.

    The figure is rendered inline through ``py.iplot``; nothing is returned.
    """
    # Coerce once so list inputs work with boolean-mask indexing below.
    X = np.asarray(X)
    y = np.asarray(y)
    y_quality = np.asarray(y_quality)

    traces = []
    # Iterate over the labels actually passed in (not a notebook global), so the
    # function works for any label vector and on a fresh kernel.
    for label in np.unique(y):
        members = (y == label)
        traces.append(go.Scatter(
            x=X[members,0],
            y=X[members,1],
            mode='markers',
            # A plain dict is accepted wherever a marker object is expected.
            marker=dict(
                line=dict(
                    color=y_quality[members],
                    width=2,
                    colorscale=[[0, 'rgb(0,0,0)'], [1, 'rgb(255,255,255)']],
                    # Pin the colour range so 0 is always black and 1 always
                    # white, instead of rescaling to each trace's own min/max.
                    cmin=0,
                    cmax=1,
                ),
            ),
        ))
    py.iplot(go.Figure(data=traces, layout=go.Layout(title=title)))

Automatic pdb calling has been turned ON


In [2]:
# Build the clean reference dataset with a fixed seed so the demo is reproducible
# (1000 is presumably the number of samples — confirm against make_dataset).
X, y = make_dataset('double_moons', 1000, random_state=0)
plot_X_y(X, y, title='Original Dataset')

## Corrupt some labels
In the next plots, corruption is shown by the outline of the points.
- High label quality (near 1) = white
- Mid label quality (near 0.5) = gray
- Low label quality (near 0) = black

NOTE: Label quality is `1 - is_label_corrupt` (i.e. the complement of the corruption indicator).

In [3]:
# Corrupt a copy of the labels with a seeded corrupter
corrupt_params = {
    'noise_rate': 0.2,
    'corrupt_chunk_perc': 0.01,
    'corrupt_chunk_perc_std': 0.005,
}
corrupter = PercentLabelCorrupter(**corrupt_params, random_state=1)
y_corrupt = corrupter.fit_transform(X, y)

# Ground-truth indicator: 1 where the corrupted label differs from the original, else 0
y_is_corrupt = (np.asarray(y) != np.asarray(y_corrupt)).astype(int)

# Quality is the complement of the corruption indicator
plot_with_quality(X, y_corrupt, 1-y_is_corrupt, title='Dataset with Corrupt Y (shown with black outlines)')

In [4]:
from sklearn.naive_bayes import GaussianNB

# Quality estimator 1: Gaussian naive Bayes (no quality_scorer given)
nb_quality_estimator = ProbLabelQuality(estimator=GaussianNB())

# Quality estimator 2: KNN with k set to a fixed fraction of the dataset size.
# The arguments are kept in `quality_params` because later cells reuse them.
perc_n = 0.1
n_neighbors = int(np.round(perc_n * len(y)))
quality_params = {
    'estimator': KNeighborsClassifier(n_neighbors=n_neighbors),
    'quality_scorer': 'classprob',
}
knn_quality_estimator = ProbLabelQuality(**quality_params)

# Score every (possibly corrupt) label with both estimators
y_quality_nb = nb_quality_estimator.fit_predict(X, y_corrupt)
y_quality_knn = knn_quality_estimator.fit_predict(X, y_corrupt)

plot_with_quality(X, y_corrupt, y_quality_nb, title='Gaussian Naive Bayes Label Quality Estimates')
plot_with_quality(X, y_corrupt, y_quality_knn, title='KNN Label Quality Estimates (k=%d)' % n_neighbors)

In [5]:
from sklearn.metrics import precision_recall_curve, roc_curve

# Predict whether labels are corrupt
meta_classifier = MetaProbLabelCorruptClassifier(threshold=0.3, **quality_params)
Xy = list(zip(X,y_corrupt)) # This is a meta classifier so zip x and y together
meta_classifier.fit(Xy)

# Score the data once and reuse the result, so both curves are built from the
# same score vector and the classifier is not evaluated twice.
corrupt_scores = meta_classifier.decision_function(Xy)
prec, recall, thresh = precision_recall_curve(y_is_corrupt, corrupt_scores)
fpr, tpr, thresh2 = roc_curve(y_is_corrupt, corrupt_scores)

# Hover text shows the threshold behind each point. precision_recall_curve
# returns one fewer threshold than points, so the final PR point has no hover text.
py.iplot(go.Figure(
    data=[
        go.Scatter(
            name='Precision Recall Curve',
            x=recall,
            y=prec,
            text=['threshold=%.2f' % t for t in thresh],
        ),
        go.Scatter(
            name='ROC Curve',
            x=fpr,
            y=tpr,
            text=['threshold=%.2f' % t for t in thresh2],
        ),
    ],
    layout=go.Layout(
        title = 'Performance With Different Thresholds'
    ),
))

In [6]:
from sklearn.metrics import precision_score, recall_score

# Move to the operating point chosen from the precision/recall curve.
# NOTE: set_params mutates meta_classifier in place, so later cells see threshold=0.8.
meta_classifier.set_params(threshold=0.8)
y_is_corrupt_pred = meta_classifier.predict(Xy)

# Evaluate the hard predictions against the true corruption indicator
acc = meta_classifier.score(Xy, y_is_corrupt)
prec = precision_score(y_is_corrupt, y_is_corrupt_pred)
recall = recall_score(y_is_corrupt, y_is_corrupt_pred)

# Show results
for template, value in (
    ('Accuracy with threshold=%g of meta classifier = %g', acc),
    ('Precision with threshold=%g of meta classifier = %g', prec),
    ('Recall with threshold=%g of meta classifier = %g', recall),
):
    print(template % (meta_classifier.threshold, value))

# Outline points by *predicted* quality; pass 1-np.array(y_is_corrupt) instead
# to view the ground-truth corruption for comparison.
plot_with_quality(X, y_corrupt, 1-np.array(y_is_corrupt_pred))

Accuracy with threshold=0.8 of meta classifier = 0.971
Precision with threshold=0.8 of meta classifier = 0.983051
Recall with threshold=0.8 of meta classifier = 0.87


In [7]:
# Correct labels
label_corrector = ProbLabelCorrector(pred_threshold=0.4, **quality_params)
y_cleaned = label_corrector.fit_transform(X,y_corrupt)

# Outline value is 1 where the cleaned label matches the original y and 0 where it does not.
# Use the builtin `int`: the `np.int` alias was removed in NumPy 1.24 and raises AttributeError.
plot_with_quality(X,y_cleaned,(np.array(y_cleaned) == np.array(y)).astype(int))

## Attempt to learn MetaClassifier parameters even with corrupt data
Supposing that we are given corrupt data to begin with (and thus do not have y or y_is_corrupt), can we still estimate the parameters of a good meta model by adding synthetic corruption?

In the next examples, we show that we can approximately estimate the right parameters even if we only have access to the corrupt labels (i.e. y_corrupt).

In [8]:
# Try to find the right k for KNN to correct labels
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV

# Create double corrupt data (i.e. corrupt already corrupted data).
# The second corrupter copies the first one's settings but uses a different seed.
second_corrupter = clone(corrupter).set_params(
    corrupt_chunk_perc=corrupter.corrupt_chunk_perc,
    random_state=10,
)
y_double_corrupt = second_corrupter.fit_transform(X, y_corrupt)

# True where the second corruption pass changed the (already corrupt) label.
# This indicator can be computed without access to the clean y.
y_is_double_corrupt = [a != b for a,b in zip(y_double_corrupt,y_corrupt)]

# Use the builtin `int`: the `np.int` alias was removed in NumPy 1.24 and raises AttributeError.
plot_with_quality(X,y_double_corrupt,1-np.array(y_is_double_corrupt).astype(int),title='Double Corrupt')

In [9]:
from sklearn.metrics import make_scorer, average_precision_score, roc_auc_score

# Two scorers are evaluated for every candidate k.
# NOTE(review): `needs_threshold` is deprecated in newer scikit-learn releases in
# favour of `response_method` — confirm against the installed version.
scoring = {
    'prec': make_scorer(average_precision_score, needs_threshold=True),
    'roc_auc': make_scorer(roc_auc_score, needs_threshold=True),
}

# Grid search over k: 10 log-spaced values from 10**0 to 10**2.7, rounded to integers.
cv_est = GridSearchCV(
    # Suppress warnings since y_meta will be ignored in training by meta_classifier
    estimator=clone(meta_classifier).set_params(suppress_warning=True),
    param_grid={
        'estimator__n_neighbors': np.round(np.logspace(0, 2.7, 10)).astype(int).tolist(),
    },
    scoring=scoring,
    refit=False,
)
# Take an unfitted copy before fitting, so both searches share the same configuration.
cv_est_corrupt = clone(cv_est)

# Oracle search: scored against the true corruption indicator.
cv_est.fit(list(zip(X, y_corrupt)), y_is_corrupt)
# Realistic search: sees only corrupt labels, scored against the synthetic second corruption.
cv_est_corrupt.fit(list(zip(X, y_double_corrupt)), y_is_double_corrupt)

# Plot CV results: one line per (search, metric) pair; roc_auc lines are dashed.
cv_arr = [
    {'color': 'orange', 'name': 'Corrupt CV', 'cv_est': cv_est_corrupt},
    {'color': 'red', 'name': '(Oracle) CV', 'cv_est': cv_est},
]
py.iplot(go.Figure(
    data=[
        go.Scatter(
            name='%s %s' % (run['name'], metric),
            x=[str(params) for params in run['cv_est'].cv_results_['params']],
            y=1-run['cv_est'].cv_results_['mean_test_%s' % metric],
            line={
                'color': run['color'],
                'dash': 'dash' if metric=='roc_auc' else None,
            },
        )
        for run in cv_arr
        for metric in scoring
    ],
    layout=go.Layout(
        yaxis={
            'title': 'Error (1 - Score)',
            'zeroline': True,
        },
    ),
))