## Load NumPy, pandas and time

In [1]:
import numpy as np
import pandas as pd
import time


In [2]:
def create_imputation(df):
    """Impute missing feature values and record the fill value of each column.

    The "ID" and "CLASS" columns are set aside, every remaining column is
    imputed according to its dtype, and "CLASS" and "ID" are appended again
    (in that order) as the last two columns.

    Fill values:
        * float/int columns: the column mean, or 0 if every value is missing.
        * category columns: the most frequent value, or the first category
          if every value is missing.
        * object columns: the most frequent value, or "" if every value is
          missing.

    Args:
        df: DataFrame with "ID" and "CLASS" columns plus the feature columns.
            It is not modified.

    Returns:
        A tuple ``(data, dicti)`` where ``data`` is the imputed copy and
        ``dicti`` maps each imputed column name to the fill value chosen.
    """
    # drop() returns a new frame, so the caller's data is never touched.
    data = df.drop(columns=["ID", "CLASS"])
    numeric_cols = list(data.select_dtypes(include=['float', 'int']).columns)
    category_cols = list(data.select_dtypes(include=['category']).columns)
    object_cols = list(data.select_dtypes(include=['object']).columns)

    dicti = {}

    # Each fill value is computed once and the filled column is assigned
    # back. Calling fillna(..., inplace=True) on data[col] operates on a
    # temporary Series and is not guaranteed to update the frame.
    for col in numeric_cols:
        fill = 0 if data[col].isnull().all() else data[col].mean()
        data[col] = data[col].fillna(fill)
        dicti[col] = fill

    for col in category_cols:
        if data[col].isnull().all():
            fill = data[col].cat.categories[0]
        else:
            fill = data[col].mode()[0]
        data[col] = data[col].fillna(fill)
        dicti[col] = fill

    for col in object_cols:
        fill = "" if data[col].isnull().all() else data[col].mode()[0]
        data[col] = data[col].fillna(fill)
        dicti[col] = fill

    data = data.assign(CLASS=df["CLASS"].values)
    data = data.assign(ID=df["ID"].values)
    return data, dicti

def apply_imputation(df,dicti):
    """Fill missing feature values using fill values learned beforehand.

    The "ID" and "CLASS" columns are set aside, the remaining columns are
    imputed, and "CLASS" and "ID" are appended again (in that order) as the
    last two columns.

    A column whose values are all missing in ``df`` does not use ``dicti``;
    it is filled with 0 (float/int), its first category (category) or ""
    (object). Every other column is filled with ``dicti[column]``.

    Args:
        df: DataFrame with "ID" and "CLASS" columns plus the feature columns.
            It is not modified.
        dicti: Mapping from column name to fill value.

    Returns:
        The imputed copy of ``df``.

    Raises:
        KeyError: If a column that is not entirely missing has no entry in
            ``dicti``.
    """
    # drop() returns a new frame, so the caller's data is never touched.
    test = df.drop(columns=["ID", "CLASS"])
    numeric_cols = list(test.select_dtypes(include=['float', 'int']).columns)
    category_cols = list(test.select_dtypes(include=['category']).columns)
    object_cols = list(test.select_dtypes(include=['object']).columns)

    # The filled column is assigned back. fillna(..., inplace=True) on
    # test[col] operates on a temporary Series and is not guaranteed to
    # update the frame.
    for col in numeric_cols:
        fill = 0 if test[col].isnull().all() else dicti[col]
        test[col] = test[col].fillna(fill)

    for col in category_cols:
        if test[col].isnull().all():
            fill = test[col].cat.categories[0]
        else:
            fill = dicti[col]
        test[col] = test[col].fillna(fill)

    for col in object_cols:
        fill = "" if test[col].isnull().all() else dicti[col]
        test[col] = test[col].fillna(fill)

    test = test.assign(CLASS=df["CLASS"].values)
    test = test.assign(ID=df["ID"].values)
    return test

def create_normalization(data,normalizationtype="minmax"):
    """Normalize the numeric feature columns and record the parameters used.

    "ID" and "CLASS" are removed, only float/int columns are kept, each of
    them is rescaled, and "CLASS" is appended again as the last column. "ID"
    is not part of the returned frame.

    Args:
        data: DataFrame with "ID" and "CLASS" columns plus the feature
            columns. It is not modified.
        normalizationtype: "minmax" rescales each column to
            ``(x - min) / (max - min)``; "zscore" rescales it to
            ``(x - mean) / std``.

    Returns:
        A tuple ``(df, params)`` where ``df`` holds the normalized features
        plus "CLASS" and ``params`` maps each column name to
        ``("minmax", min, max)`` or ``("zscore", mean, std)``.

    Raises:
        ValueError: If ``normalizationtype`` is neither "minmax" nor "zscore".
    """
    if normalizationtype not in ("minmax", "zscore"):
        raise ValueError(
            "normalizationtype must be 'minmax' or 'zscore', got {0!r}".format(
                normalizationtype))

    features = data.drop(columns=["ID", "CLASS"])
    # Keep the selection: the arithmetic below is only defined for numbers.
    df = features.select_dtypes(include=['float', 'int']).copy()

    params = {}
    for col in df.columns:
        if normalizationtype == "minmax":
            mini = df[col].min()
            maxi = df[col].max()
            shift, scale = mini, maxi - mini
            params[col] = ("minmax", mini, maxi)
        else:
            meane = df[col].mean()
            stdi = df[col].std()
            shift, scale = meane, stdi
            params[col] = ("zscore", meane, stdi)

        if pd.isna(scale) or scale == 0:
            # A column without spread carries no information; map it to 0
            # instead of dividing by zero and filling it with NaN/inf.
            df[col] = 0.0
        else:
            df[col] = (df[col] - shift) / scale

    df = df.assign(CLASS=data["CLASS"].values)
    return df, params
    
def apply_normalization(test,normalization):
    """Rescale numeric feature columns with previously recorded parameters.

    "ID" is removed; every float/int column other than "CLASS" is rescaled
    according to its entry in ``normalization``. All other columns
    (including "CLASS", if present) are returned unchanged.

    Args:
        test: DataFrame with an "ID" column plus the feature columns. It is
            not modified.
        normalization: Mapping from column name to a tuple
            ``("minmax", min, max)`` or ``("zscore", mean, std)``.

    Returns:
        The rescaled copy of ``test`` without the "ID" column.

    Raises:
        KeyError: If a numeric feature column has no entry in
            ``normalization``.
        ValueError: If an entry has a kind other than "minmax" or "zscore".
    """
    dft = test.drop(columns=["ID"])
    numeric_cols = list(dft.select_dtypes(include=['float', 'int']).columns)
    for col in numeric_cols:
        if col == "CLASS":
            # Numeric class labels are targets, not features.
            continue
        entry = normalization[col]
        kind = entry[0]
        if kind == "minmax":
            shift = entry[1]                # training minimum
            scale = entry[2] - entry[1]     # training range
        elif kind == "zscore":
            shift = entry[1]                # training mean
            scale = entry[2]                # training standard deviation
        else:
            raise ValueError(
                "unknown normalization kind {0!r} for column {1!r}".format(
                    kind, col))

        if pd.isna(scale) or scale == 0:
            # No spread was recorded for this column, so there is nothing to
            # scale by; map it to 0 instead of dividing by zero.
            dft[col] = 0.0
        else:
            dft[col] = (dft[col] - shift) / scale
    return dft

def euclidean(arrone,arrtwo):
    """Return the Euclidean distance between two indexable sequences.

    One squared difference is accumulated per position of ``arrone``;
    ``arrtwo`` is read at the same positions.
    """
    total = 0
    for position, left in enumerate(arrone):
        delta = left - arrtwo[position]
        total = delta * delta + total
    return np.sqrt(total)
def SortValue(value):
    """Sort key: return the element stored at index 1 of ``value``."""
    second_item = value[1]
    return second_item
def getclass(sorti,trainlabels):
    """Look up the label of every entry in ``sorti``.

    Each entry of ``sorti`` is indexable and holds, at position 0, the key
    used to index ``trainlabels``. The labels are returned as a list in the
    same order as ``sorti``.
    """
    return [trainlabels[entry[0]] for entry in sorti]
def highestclass(K_classes):
    """Count how often each distinct value occurs in ``K_classes``.

    Returns a list of ``[value, count]`` pairs in the order produced by
    ``np.unique`` (sorted by value).
    """
    values, occurrences = np.unique(K_classes, return_counts=True)
    return [[value, occurrence] for value, occurrence in zip(values, occurrences)]

def finalclass(arrone):
    """Return the class with the highest count.

    Args:
        arrone: Iterable of ``[class, count]`` pairs.

    Returns:
        The class of the pair with the largest count. When several pairs
        share the largest count, the one that appears first in ``arrone``
        wins.

    Raises:
        IndexError: If ``arrone`` is empty.
    """
    # max() returns the first maximal element, which is the element a stable
    # descending sort would put first, without sorting the whole input.
    missing = object()
    best = max(arrone, key=lambda pair: pair[1], default=missing)
    if best is missing:
        raise IndexError("finalclass() needs at least one [class, count] pair")
    return best[0]

def accuracy(predictions,correctlabels):
    """Return the fraction of rows whose most probable class is correct.

    Args:
        predictions: DataFrame with one row per instance and one column per
            class label; each cell holds the predicted probability. The
            predicted class of a row is the column with the largest value
            (the first such column on ties).
        correctlabels: Sequence of true labels, matched to the rows of
            ``predictions`` by position. Any index it carries is ignored.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ValueError: If ``correctlabels`` is empty or its length differs from
            the number of rows in ``predictions``.
    """
    # Convert to a plain array so that labels are read by position even when
    # a pandas Series with a non-default index is passed in.
    labels = np.asarray(correctlabels)
    numrows = len(labels)
    if numrows == 0:
        raise ValueError("accuracy() needs at least one label")
    if len(predictions) != numrows:
        raise ValueError(
            "got {0} prediction rows but {1} labels".format(
                len(predictions), numrows))

    best_columns = np.argmax(predictions.to_numpy(), axis=1)
    hits = 0
    for column_position, label in zip(best_columns, labels):
        if predictions.columns[column_position] == label:
            hits += 1
    return hits / numrows

def brier_score(df,correctlabels):
    """Return the mean (multi-class) Brier score of predicted probabilities.

    For every row the squared difference between each predicted probability
    and the 0/1 indicator of the true class is summed over all columns; the
    result is averaged over the rows.

    Args:
        df: DataFrame with one row per instance and one column per class
            label; each cell holds the predicted probability.
        correctlabels: Sequence of true labels, matched to the rows of
            ``df`` by position. Any index it carries is ignored.

    Returns:
        The Brier score (0 is a perfect prediction).

    Raises:
        ValueError: If ``df`` is empty or its number of rows differs from
            the number of labels.
    """
    # Convert to a plain array so that labels are read by position even when
    # a pandas Series with a non-default index is passed in.
    labels = np.asarray(correctlabels)
    if len(df) == 0:
        raise ValueError("brier_score() needs at least one prediction row")
    if len(labels) != len(df):
        raise ValueError(
            "got {0} prediction rows but {1} labels".format(
                len(df), len(labels)))

    class_names = list(df.columns)
    total = 0
    for probabilities, label in zip(df.to_numpy(), labels):
        for class_name, probability in zip(class_names, probabilities):
            target = 1 if class_name == label else 0
            total = total + (probability - target) ** 2
    return total / len(df)

## 1. Define the class kNN

In [3]:

class kNN():
    """k-nearest-neighbour classifier returning class probabilities.

    Training data is imputed and normalized in ``fit``; ``predict`` applies
    the same imputation and normalization to new data, finds the ``k``
    closest training rows by Euclidean distance and reports, per class, the
    fraction of those neighbours carrying that class.
    """

    def __init__(self):
        pass

    def fit(self,df,normalizationtype="minmax"):
        """Learn imputation and normalization and store the training data.

        Args:
            df: Training DataFrame with "ID" and "CLASS" columns plus the
                feature columns.
            normalizationtype: Passed on to ``create_normalization``.
        """
        self.imputed,self.imputer = create_imputation(df)
        self.normalized,self.normalizer = create_normalization(self.imputed,normalizationtype)
        # Neighbours are identified by row position, so the labels must be
        # addressable by position as well: give them a 0..n-1 index no matter
        # what index the training frame carried.
        self.traininglabels = self.normalized["CLASS"].astype('category').reset_index(drop=True)
        self.labels = np.array(sorted(self.normalized["CLASS"].unique()))
        training_features = self.normalized.drop(columns=["CLASS"])
        # Remember the feature order so that predict() can align its input.
        self.features = list(training_features.columns)
        self.trainingdata = np.array(training_features)

    def predict(self,df,k=1):
        """Predict class probabilities for every row of ``df``.

        Args:
            df: DataFrame with "ID" and "CLASS" columns plus the same
                feature columns used in ``fit``.
            k: Number of nearest neighbours to consult.

        Returns:
            DataFrame with one row per row of ``df`` and one column per
            class label seen in training (sorted); each cell is the number
            of the ``k`` nearest neighbours with that class divided by
            ``k``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
            KeyError: If ``df`` lacks one of the training feature columns.
        """
        if k < 1:
            raise ValueError("k must be at least 1, got {0!r}".format(k))

        self.testimp = apply_imputation(df,self.imputer)
        self.testnorm = apply_normalization(self.testimp,self.normalizer)
        self.testnorm = self.testnorm.drop(columns=["CLASS"])
        # Distances are computed position by position, so the test columns
        # must be in exactly the order of the training columns.
        self.testnorm = np.array(self.testnorm[self.features])

        column_of = {label: position for position, label in enumerate(self.labels)}
        probabilities = np.zeros(shape=(len(self.testnorm),len(self.labels)))
        for row_number, row in enumerate(self.testnorm):
            distances = [[train_position, euclidean(row,train_row)]
                         for train_position, train_row in enumerate(self.trainingdata)]
            nearest = sorted(distances, key=SortValue)[:k]
            votes = highestclass(getclass(nearest,self.traininglabels))
            # Classes without a vote keep their initial probability of 0.
            for label, count in votes:
                probabilities[row_number, column_of[label]] = count/k
        return pd.DataFrame(probabilities,columns=self.labels)


In [5]:
# Load the glass data; both files are read from the working directory.
glass_train_df = pd.read_csv("glass_train.txt")
glass_test_df = pd.read_csv("glass_test.txt")

# Fit the model once and report the time spent on training.
knn_model = kNN()
t0 = time.perf_counter()
knn_model.fit(glass_train_df)
print("Training time: {0:.2f} s.".format(time.perf_counter()-t0))

test_labels = glass_test_df["CLASS"]

# One result row per k: [accuracy, Brier score] on the test set.
k_values = [1,3,5,7,9]
results = np.empty((len(k_values),2))

for i, k in enumerate(k_values):
    t0 = time.perf_counter()
    predictions = knn_model.predict(glass_test_df,k=k)
    print("Testing time (k={0}): {1:.2f} s.".format(k,time.perf_counter()-t0))
    test_accuracy = accuracy(predictions,test_labels)
    test_brier = brier_score(predictions,test_labels)
    results[i] = [test_accuracy,test_brier]

results = pd.DataFrame(results,index=k_values,columns=["Accuracy","Brier score"])

results

Training time: 0.05 s.
Testing time (k=1): 0.62 s.
Testing time (k=3): 0.63 s.
Testing time (k=5): 0.68 s.
Testing time (k=7): 0.73 s.
Testing time (k=9): 0.78 s.


Unnamed: 0,Accuracy,Brier score
1,0.747664,0.504673
3,0.663551,0.488058
5,0.579439,0.474019
7,0.598131,0.470723
9,0.616822,0.483674


In [6]:
# Sanity check: evaluate the fitted model on the data it was trained on.
train_labels = glass_train_df["CLASS"]
predictions = knn_model.predict(glass_train_df,k=1)
train_accuracy = accuracy(predictions,train_labels)
train_brier = brier_score(predictions,train_labels)
print("Accuracy on training set (k=1): {0:.2f}".format(train_accuracy))
print("Brier score on training set (k=1): {0:.2f}".format(train_brier))


Accuracy on training set (k=1): 1.00
Brier score on training set (k=1): 0.00
