In [1]:
import numpy as np
import pandas as pd
from math import sqrt
import random
from matplotlib import pyplot as plt
import sys
import seaborn
from scipy.spatial.distance import euclidean
import warnings
from sklearn import preprocessing

# pd.set_option('display.max_rows', 100)

In [2]:
columns = ['id', 'clump_thickness', 'unif_cell_size', 'unif_cell_shape', 'marg_adhesion',
           'single_epith_cell_size', 'bare_nuclei', 'bland_chrom', 'norm_nucleoli', 'mitoses', 'class']

df = pd.read_csv('breast-cancer-wisconsin.data', header=None, names=columns)
df.head(10)

df = df.replace(['?'],np.NaN)
df.isnull().sum()

# replace the missing value with the mode value of that column
df = df.apply(pd.to_numeric)
mode = df['bare_nuclei'].mode()
df['bare_nuclei'].fillna(mode[0], inplace =True)

# check the missing value again, this time the number of missing value should be 0
df.isnull().sum()

# drop unwanted column - 'id'
df = df.drop("id",axis=1)
df.head(5)

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [3]:
def my_train_test_split(X, y, test_size=0.3, random_state=None):

    # add y as a column to X, use df.sample() to shuffle the whole dataset
    X = pd.DataFrame(X)
    X[y.columns.values.tolist()[0]] = y
    shuffled_df = X.sample(frac=1,random_state=random_state)

    # calculate the number of training dataset and testing dataset
    total_number = X.shape[0]
    number_of_testdata = int(np.ceil(total_number*test_size))
    number_of_traindata = total_number - number_of_testdata

    # get the training dataset and testing dataset
    train_set = shuffled_df[:number_of_traindata]
    test_set = shuffled_df[number_of_traindata:]

    # split the testing dataset to X_test and y_test
    X_test = test_set.drop(y.columns.values, axis=1)
    y_test = test_set[y.columns.values]

    # return X_train, X_test, y_train, y_test
    return train_set, X_test, y_test

# define dataset
X = df.drop(['class'],axis=1)
y = df[['class']]

# scale the feature, Use StandardScaler method
scaler = preprocessing.StandardScaler()
X_scaler = scaler.fit_transform(X)

# Split the Training and Test dataset
train_dataset, X_test, y_test = my_train_test_split(X_scaler, y, test_size=0.2, random_state=5)
print(type(train_dataset))
print(type(X_test))
print(type(y_test))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [4]:
def knn_algorithm(traindata, testdata, k_neighbours=5):
    
    classification_result = np.zeros(shape=(1,testdata.shape[0]))
    confidence_array = np.zeros(shape=(1,testdata.shape[0]))

    # set a warning message when k is less than or equal to the total classification groups
    if k_neighbours <= traindata.drop(testdata.columns.values, axis=1).nunique().values:
        warnings.warn('k is set to a value less than or equal to the total classification groups')
    else:
        pass
    
    
    traindata_X_array = traindata[testdata.columns].to_numpy()
    traindata_y_array = traindata.drop(testdata.columns, axis=1).to_numpy()
    test_data_array = testdata.to_numpy()

    # Calculate Euclidean distances
    for i in range(testdata.shape[0]):
        near_points = None
        feature_distance_list = []

        x_columns = testdata.columns
        for j in range(traindata.shape[0]):
            
            # for each 'point' in testing dataset, calculate distance with all 'points' in training dataset
            # euclidean_distance = euclidean(traindata[x_columns].values.tolist()[j], testdata.values.tolist()[i])
            euclidean_distance = np.sqrt(np.sum((test_data_array[i]-traindata_X_array[j])**2))

            # for each 'point' in testing, generate a list which contains all value of y and euclidean distance
            feature_distance_list.append([traindata_y_array[j][0], euclidean_distance])

        # sorted the list by distance generated in the last step
        # sorted_list = sorted(feature_distance_list,key=lambda x:x[1])
        # df_sorted_list = pd.DataFrame(sorted_list, columns=['dependent_variable','distance'])


        # take the first k_neighbours rows, which is exactly the k_nearest neighbours
        # df_neighbours = df_sorted_list.head(k_neighbours)

        # k_neighbours vote for the final result, minority obeys majority
        # classes = df_neighbours['dependent_variable'].value_counts()
        # proportions = classes / classes.sum()
        # classification_result[0][i] = proportions.index[0]

        # confidence for each 'point' in testing dataset is the proportion of the classification_result (the rate of the majority)
        # confidence_array[0][i] = proportions.iloc[0]

    # confidence = np.mean(confidence_array)
       
    # return classification_result, confidence_array
    return feature_distance_list

In [5]:
result = knn_algorithm(train_dataset, X_test, k_neighbours=5)
print(type(result))

print(result)

<class 'list'>
[[4, 3.8883986373877044], [4, 4.437106499422881], [4, 4.285486744149668], [2, 5.767383127042222], [4, 4.128618450493425], [2, 5.2755413695258], [2, 5.449274853184885], [4, 3.589319128730788], [2, 5.068693803084764], [4, 4.624069813237074], [2, 5.276866908122735], [2, 5.847098468468603], [2, 5.567726868150081], [4, 3.3326979289827174], [2, 5.276866908122735], [4, 6.179981663460013], [2, 5.390019539328946], [2, 5.555426875154199], [2, 5.4442058792632455], [2, 5.15035126129034], [2, 5.103423445246777], [2, 5.390019539328946], [4, 4.683053432444658], [2, 5.5605051905451015], [2, 5.715506180900491], [4, 3.8960527210949625], [4, 4.01677110900278], [4, 3.850196250662678], [2, 5.893746160432402], [2, 5.160996133213598], [2, 5.190964216478416], [2, 5.876750337613951], [2, 5.387445924027859], [4, 3.949866613326941], [2, 5.2755413695258], [4, 3.619923741110697], [4, 3.76015191446635], [4, 6.664129882008869], [4, 3.927386154767936], [2, 5.2244065196452105], [2, 3.834718433337878], [