In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import redefine

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN, HDBSCAN, AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [158]:
# Load the iris CSV that the rest of the notebook analyses.
data = pd.read_csv(filepath_or_buffer='data/iris_modified.csv')

In [159]:
# Show the loaded frame as the cell output (rich DataFrame display) so the
# columns passed to REDEFINE later ('id', 'target') can be checked by eye.
data

Unnamed: 0,id,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,virginica
146,146,6.3,2.5,5.0,1.9,versicolor
147,147,6.5,3.0,5.2,2.0,virginica
148,148,6.2,3.4,5.4,2.3,virginica


In [161]:
# Build the REDEFINE helper from the loaded frame.
# 'target' and 'id' are column names present in `data`; presumably they select
# the label column and the identifier column — TODO confirm against
# redefine.REDEFINE's signature.
# NOTE(review): 'iris_true.csv' has no 'data/' prefix, unlike the CSV loaded
# above — confirm this path is resolved from the intended directory.
rd = redefine.REDEFINE('iris_true.csv', data, 'target', 'id')

In [162]:
# Run the REDEFINE test with a 'Random Forest' classifier, 'KMeans' clustering
# (no extra parameters) and 'Standard' as the last option — presumably the
# scaler choice; TODO confirm against run_redefine_test's signature.
# NOTE(review): n_estimators is passed as the string '5', not the int 5 —
# confirm that redefine converts it before handing it to the classifier.
results = rd.run_redefine_test('Random Forest', {'n_estimators':'5'}, 'KMeans', {}, 'Standard')

In [163]:
# Pull the feature matrix, the row identifiers, the targets and the target
# names out of the REDEFINE object (getters are called in the same order as
# the names on the left-hand side).
X, IDs, Y, Y_names = (
    rd.get_X(),
    rd.get_IDs(),
    rd.get_Y(),
    rd.get_Y_names(),
)

In [164]:
# Keep local handles on the flagged IDs and the per-sample results table that
# the REDEFINE object exposes as attributes.
flagged_ids, results_df = rd.flagged_ids, rd.results_df

In [165]:
# Standardise the features before computing the 2-D embeddings: fit the scaler
# on X, then apply it to the same data.
scale = StandardScaler().fit(X)
x_scale = scale.transform(X)

In [166]:
# Two-component t-SNE embedding of the standardised features; random_state is
# fixed so that re-running the cell uses the same seed.
tsne = TSNE(
    random_state=1,
    perplexity=10,
    n_components=2,
)
x_tsne = tsne.fit_transform(x_scale)

In [167]:
# Two-component PCA projection of the same standardised features.
pca = PCA(
    random_state=1,
    n_components=2,
)
x_pca = pca.fit_transform(x_scale)

In [168]:
# Bokeh names used by the interactive scatter plot in this notebook.
from bokeh.io import output_notebook
from bokeh.models import CategoricalColorMapper, ColumnDataSource, Legend
from bokeh.palettes import Accent8
from bokeh.plotting import figure, show

# Send Bokeh output to the notebook so show() renders inline.
output_notebook()

In [169]:
# Number of rows in the PCA projection (one row per sample).
x_pca.shape[0]

150

In [170]:
# Positions, within IDs, of every flagged ID. Each flagged ID (as a column) is
# compared against all IDs (as a row); the column coordinates of the matches
# are the wanted positions, listed in the order the IDs appear in flagged_ids.
flagged_column = np.asarray(flagged_ids).reshape(-1, 1)
id_matches = flagged_column == np.asarray(IDs)
_, flag_idxs = np.nonzero(id_matches)


In [171]:
# PCA rows of the samples that were NOT flagged.
# np.delete returns a new array and leaves x_pca untouched, so the defensive
# .copy() that used to precede it was redundant.
x_pca_true = np.delete(x_pca, flag_idxs, axis=0)
len(x_pca_true)

138

In [172]:
# Show the result rows whose index value (the sample id) is among the flagged IDs.
results_df.loc[results_df.index.isin(flagged_ids)]

Unnamed: 0_level_0,Label,ClassificationResult,ClusterResult,Flagged
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,versicolor,setosa,setosa,True
46,virginica,setosa,setosa,True
62,virginica,versicolor,versicolor,True
70,versicolor,virginica,virginica,True
77,versicolor,virginica,virginica,True
87,setosa,versicolor,versicolor,True
106,virginica,versicolor,versicolor,True
113,virginica,versicolor,versicolor,True
119,virginica,versicolor,versicolor,True
126,virginica,versicolor,versicolor,True


In [174]:
# Interactive t-SNE scatter plot. Samples that were not flagged are coloured by
# their original label; flagged samples are drawn in red. Hovering a point
# shows its id plus the original, supervised and unsupervised labels.
TOOLTIPS = [
    ("ID", "@id"),
    ("Original Label", "@label"),
    ("Supervised Label", "@sup_label"),
    ("Unsupervised Label", "@unsup_label"),
]

p = figure(width=400, height=500, tooltips=TOOLTIPS)

# One boolean mask over IDs drives every split below, so coordinates and hover
# metadata can never be paired in different orders. (Previously the flagged
# coordinates followed the order of flagged_ids while the flagged result rows
# followed the order of results_df, which mislabels points whenever those two
# orders differ.)
# Assumes row i of x_tsne corresponds to IDs[i], as the original positional
# indexing did — TODO confirm against rd.get_X() / rd.get_IDs().
ids_arr = np.asarray(IDs)
is_flagged = np.isin(ids_arr, np.asarray(flagged_ids))
flag_idxs = np.flatnonzero(is_flagged)  # kept for later cells; ascending row positions

# Re-order the results table to the embedding's row order. results_df is indexed
# by sample id, so .loc raises KeyError if an ID has no result row instead of
# silently plotting misaligned columns.
results_by_row = results_df.loc[ids_arr]

# Samples that were not flagged.
x_true = x_tsne[~is_flagged]
results_true = results_by_row.loc[~is_flagged]

full_data = ColumnDataSource(dict(
    x1=x_true[:, 0],
    x2=x_true[:, 1],
    label=results_true['Label'],
    sup_label=results_true['ClassificationResult'],
    unsup_label=results_true['ClusterResult'],
    id=results_true.index,
))

color_mapper = CategoricalColorMapper(factors=Y_names, palette=Accent8)

# scatter() draws circle markers by default; circle(size=...) is deprecated in
# recent Bokeh releases.
true_points = p.scatter(x='x1', y='x2', source=full_data, size=7, alpha=0.8,
                        color={'field': 'label', 'transform': color_mapper},
                        legend_field='label')

# Flagged samples, selected with the same mask so rows stay aligned.
flagged_X = x_tsne[is_flagged]
flagged_results = results_by_row.loc[is_flagged]

flagged_data = ColumnDataSource(dict(
    x1=flagged_X[:, 0],
    x2=flagged_X[:, 1],
    label=flagged_results['Label'],
    sup_label=flagged_results['ClassificationResult'],
    unsup_label=flagged_results['ClusterResult'],
    id=flagged_results.index,
))

misclass_points = p.scatter(x='x1', y='x2', source=flagged_data, size=7, alpha=0.8,
                            color='red', legend_label='Potentially Misclassified Points')

# Hide the automatic in-plot legend and show its items in a legend placed
# below the plot area instead.
p.legend.visible = False

leg = Legend(items=p.legend.items, location='left')

p.add_layout(leg, 'below')

# The coordinates are the two t-SNE components computed from the scaled features.
p.xaxis.axis_label = "t-SNE 1"
p.yaxis.axis_label = "t-SNE 2"
p.title.text = "Plot"
show(p)