In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import codecs
import math
import copy
import time

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder # one-hot encoding
from sklearn.decomposition import PCA # PCA
from sklearn.metrics import confusion_matrix

from sklearn import metrics
from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.model_selection import KFold # K-fold validation
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import graphviz
import random
from matplotlib.colors import ListedColormap
import seaborn as sns

In [35]:
def dist(data, centroid):
    d = []
    for i in range(len(data)):
        d.append(0)
        for j in range(len(centroid)):
            d[i] += (data[i][j]-centroid[j])**2
        d[i] = math.sqrt(d[i])
    return d

In [36]:
""" Data input """
col_names = ['area', 'perimeter', 'compactness', 'length-of-kernel', 'width-of-kernel', 'asymmetry-coefficient', 'length-of-kernel-groove', 'label']
data = pd.read_csv('seeds_dataset.txt', sep="\s+", engine='python', names=col_names)

""" Data preprocessing """

# shuffle
data = data.sample(frac=1)

# feature/label split

X = data.iloc[:,0:-1]
y = data.iloc[:,-1]

In [95]:
""" Model Construciton """

def assignment(df, centroids):
    for i in centroids.keys():
        df['distance_from_{}'.format(i)] = dist(X.values, centroids[i])
    cen_dist_col = ['distance_from_{}'.format(i) for i in centroids.keys()]
    df['closest'] = df.loc[:, cen_dist_col].idxmin(axis=1)
    df['closest'] = df['closest'].map(lambda x:int(x.lstrip('distance_from_')))
    return df

def update(df, c):
    for i in centroids.keys():
        Filter = df['closest'] == i
        for j in range(len(c[1])):
            print('j = ', j)
            tmp = df.loc[Filter, :]
            mean = np.mean(tmp.iloc[:, j])
            #mean = np.mean(df.loc[Filter, col_names[j]])
            if not math.isnan(mean):
                c[i][j] = mean
    return c

def Kmeans(df, c):
    # Initializing X_dist
    X_d = df
    X_d['closest'] = [1] * 210
    
    # Split into clusters
    for i in range(9999):
        # print('--------------Iteration %d--------------' % (i+1))
        closest_col = X_d['closest']

        # Assignment
        X_d = assignment(X, c)
        # Update
        new_c = update(X_d, c)

        # if centroid did not move -> break
        if(closest_col.equals(X_d['closest'])):
            break
        # else, go to next iteration
        c = new_c
    
    return X_d, new_c

def predict(X_d, y):
    X_d['label'] = y
    X_d['pred'] = [0] * 210
    for i in range(3):
        Filter = X_d['closest'] == i+1
        print(X_d[Filter])
        cnt = X_d[Filter]['label'].value_counts()
        print(cnt)
        X_d.loc[Filter, 'pred'] = cnt.idxmax()
    return X_d['pred'].values

In [96]:
# Generate centroids
np.random.seed(int(time.time()))
k = 3
centroids = {
    i+1:[np.random.uniform(10,20), np.random.uniform(10,20), np.random.uniform(0,1), np.random.uniform(4,7), np.random.uniform(2,4), np.random.uniform(1,5), np.random.uniform(4,6)]
    for i in range(k)
}

X_dist, centroids = Kmeans(X, centroids)
y_pred = predict(X_dist, y)
print(confusion_matrix(y.values, y_pred))
print(classification_report(y.values, y_pred))

j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6
j =  0
j =  1
j =  2

In [97]:
""" Question """
pca = PCA(2) 
X_pca = pca.fit_transform(X)
X_pca = pd.DataFrame(X_pca)

# Generate centroids
np.random.seed(int(time.time()))
k = 3
q_centroids = {
    i+1:[np.random.uniform(-10,10), np.random.uniform(-10,10)]
    for i in range(k)
}
X_dist, centroids = Kmeans(X_pca, centroids)
y_pred = predict(X_dist, y)

colormap = {1:'r', 2:'g', 3:'b'}
colors = map(lambda X:colormap[i+1], y_pred)

j =  0
j =  1
j =  2
j =  3
j =  4
j =  5
j =  6


IndexError: single positional indexer is out-of-bounds

In [59]:
X_pca

Unnamed: 0,0,1
0,-1.512065,-2.160016
1,-2.212291,-1.951784
2,6.394205,2.352517
3,-0.560385,-2.683301
4,-5.294618,0.512203
...,...,...
205,-4.256783,-0.345655
206,3.576253,-2.377080
207,0.453677,-1.446955
208,-3.439492,-1.158926
