In [103]:
import pandas as pd
import numpy as np
import redefine

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

In [104]:
from pprint import pprint

In [202]:
data = pd.read_csv('data/iris_modified.csv')

In [203]:
data

Unnamed: 0,id,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,virginica
146,146,6.3,2.5,5.0,1.9,versicolor
147,147,6.5,3.0,5.2,2.0,virginica
148,148,6.2,3.4,5.4,2.3,virginica


In [204]:
rd = redefine.REDEFINE(data, 'target', 'id')

In [205]:
X = rd.get_X()
ids = rd.get_IDs()
Y = rd.get_Y()
Y_names = rd.get_Y_names()

In [206]:
k = 10
random_seed = 123
n = len(X)
idxs = np.linspace(0, n, k+1).astype(int)

In [207]:
# Shuffle data
np.random.seed(random_seed)

idx = np.arange(0, len(X))
np.random.shuffle(idx)

x = X.copy()[idx]
y = Y.copy()[idx]

In [271]:
# test train split
for i in range(1,len(idxs)):
    idx1 = idxs[i-1]
    idx2 = idxs[i]

    xtest = x[idx1:idx2]
    xtrain = np.concatenate([x[idxs[0]:idx1], x[idx2:idxs[-1]]])

    ytest = y[idx1:idx2]
    ytrain = np.concatenate([y[idxs[0]:idx1], y[idx2:idxs[-1]]])

    scale = StandardScaler()
    xtrain_scale = scale.fit_transform(xtrain)
    xtest_scale = scale.transform(xtest)

    model = KMeans(n_clusters=3, n_init='auto')

    model.fit(xtrain_scale, ytrain)
    yhat = model.predict(xtest_scale) # change back to xtest
    
    # relabeling for clustering
    # label_map = {0:'virginica', 1:'setosa', 2:'versicolor'}
    # yhat = np.vectorize(label_map.__getitem__)(yhat)
    
    yhat_train = model.predict(xtrain_scale)
    label_map = {}
    for i in range(len(Y_names)):
        where_i = np.flatnonzero(ytrain == Y_names[i])
        val_i = np.bincount(yhat_train[where_i]).argmax()
        label_map[val_i] = Y_names[i]

    if len(label_map) == len(Y_names):
        yhat = np.vectorize(label_map.__getitem__)(yhat)
    
    print(accuracy_score(ytest, yhat))

    # print(ytest, yhat)
    # 
    

0.8666666666666667
0.8
0.8
0.7333333333333333
0.8
0.8
0.8
0.7333333333333333
1.0
0.7333333333333333


In [249]:
ytest

array(['setosa', 'versicolor', 'setosa', 'virginica', 'virginica',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'versicolor',
       'setosa', 'versicolor', 'versicolor', 'virginica', 'virginica'],
      dtype=object)

In [250]:
yhat

array(['setosa', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica'], dtype='<U10')

In [270]:
dsf = np.flatnonzero(ytest==yhat)
dsf

array([ 0,  1,  2,  3,  5,  6,  9, 10, 11, 12, 14])

In [273]:
len(dsf)/len(ytest)

0.7333333333333333

In [None]:
ytrain

array(['versicolor', 'virginica', 'virginica', 'versicolor', 'setosa',
       'virginica', 'versicolor', 'setosa', 'setosa', 'versicolor',
       'virginica', 'setosa', 'versicolor', 'virginica', 'virginica',
       'virginica', 'setosa', 'setosa', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'virginica', 'setosa', 'setosa', 'setosa',
       'virginica', 'virginica', 'setosa', 'virginica', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'versicolor', 'setosa', 'setosa', 'virginica', 'setosa', 'setosa',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'virginica', 'setosa', 'virginica',
       'versicolor', 'setosa', 'setosa', 'virginica', 'versicolor',
       'virginica', 'virginica', 'setosa', 'versicolor', 'versicolor',
       'virginica', 'set

In [209]:
my_dict = {}

In [199]:
i = 2
where_i = np.where(ytrain == Y_names[i])
val_i = np.bincount(yhat[where_i]).argmax()
my_dict[Y_names[i]] = val_i

In [213]:
yhat_train

array([1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 0, 1, 1, 1, 1, 1, 2, 0, 1, 2, 1,
       0, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 2, 1, 0, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 0,
       2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 2, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       2, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 0, 2,
       1, 1, 1], dtype=int32)

In [214]:
where0 = np.where(yhat_train == 0)
where1 = np.where(yhat_train == 1)
where2 = np.where(yhat_train == 2)


In [217]:
where2

(array([  7,   8,  17,  20,  25,  26,  34,  37,  45,  58,  62,  66,  76,
         88,  92, 111, 122, 126, 131]),)

In [227]:
print(ytrain[where2])
# np.bincount(yhat[where2]).argmax()
# pd.value_counts(ytrain[where0])
vc = pd.Series(ytrain[where2]).value_counts()

['setosa' 'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'versicolor' 'versicolor']


In [118]:
my_dict = {'virginica': 0, 'setosa': 1, 'versicolor': 2}
# ytest2 = ytest.map()
ytest2 = np.vectorize(my_dict.__getitem__)(ytest)
ytest2

array([1, 2, 1, 0, 0, 2, 2, 0, 0, 2, 1, 2, 2, 0, 0])

In [233]:
max(vc)

15

# TODO:
- scaling
    - option in the UI
    - types of scaling
        - standard
        - min max
        - robust
        - absolute max
- convert to function in redefine + show results in UI

- clustering
  - labelling the data for scoring
    - look in nick's code

In [99]:
class_models = {"Nearest Neighbor":NearestNeighbors, "Random Forest":RandomForestClassifier}

In [100]:
params = {
            "n_estimators": "100",
            "criterion": None,
            "max_depth": "6",
            "random_state": None
            }

In [101]:
entry = "Random Forest"
model = class_models[entry](**params)
model