In [2]:
# This program demonstrates modelling and predicting on breast cancer data using KNN algorithm

import numpy as np
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import pandas as pd

# Data extracted from http://mlr.cs.umass.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
# class column conveys if the cancer is benign(=not harmful) or malignant(harmful) 
# indicating with numbers 2 and 4 respectively.
df = pd.read_csv('breast-cancer-wisconsin.csv')
print("Breast Cancer Data:")
df


Breast Cancer Data:


Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


### Preprocessing 

In [3]:
# Replace ? with outlier value -99999
df.replace('?', -99999, inplace=True)
print('After replacing ? with outlier value -99999:')
df

After replacing ? with outlier value -99999:


Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [4]:
# Remove irrelevant column - id, which badly affects the outcome of the KNN, if we retain it
# To test the effect run the program without dropping the id column and notice that the accuracy will be bad
# Dropping id gives an Accuracy =  0.9642857142857143
# Retaining id gives an Accuracy =  0.65
df.drop(['id'], 1, inplace=True) # 2nd parameter indicates whether to remove rows(=0) or cols(=1)
print("After dropping column 'id':")
df

After dropping column 'id':


Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
5,8,10,10,8,7,10,9,7,1,4
6,1,1,1,1,2,10,3,1,1,2
7,2,1,2,1,2,1,3,1,1,2
8,2,1,1,1,2,1,1,1,5,2
9,4,2,1,1,2,1,2,1,1,2


In [5]:
# Slice the data into X (=features), y(=label)
# 2nd parameter of drop method indicates whether to remove rows(=0) or cols(=1)
X = np.array(df.drop(['class'], 1)) # Consider all columns as features except 'class'
y = np.array(df['class']) # Consider the column 'class' as it will be our 'Label'

In [6]:
print("Values of X:")
X

Values of X:


array([[5, 1, 1, ..., 3, 1, 1],
       [5, 4, 4, ..., 3, 2, 1],
       [3, 1, 1, ..., 3, 1, 1],
       ...,
       [5, 10, 10, ..., 8, 10, 2],
       [4, 8, 6, ..., 10, 6, 1],
       [4, 8, 8, ..., 10, 4, 1]], dtype=object)

In [7]:
print("Values of y:")
y

Values of y:


array([2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4,
       2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4,
       4, 2, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4,
       2, 4, 4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 4, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2,
       2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4,
       2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2,
       2, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2,
       2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4,
       4, 2, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 4,
       4, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 4,

In [10]:
# Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print("X_train:")
print(X_train)

print("X_test:")
print(X_test)

print("y_train")
print(y_train)

print("y_test")
print(y_test)

X_train:
[[4 1 1 ... 2 1 1]
 [3 2 1 ... 3 1 1]
 [4 1 1 ... 1 1 1]
 ...
 [8 6 7 ... 3 4 2]
 [3 1 1 ... 2 1 1]
 [3 1 1 ... 1 1 1]]
X_test:
[[4 1 1 ... 1 1 1]
 [6 3 4 ... 3 9 1]
 [5 4 6 ... 8 10 1]
 ...
 [3 3 2 ... 3 6 1]
 [4 7 8 ... 9 1 1]
 [1 1 1 ... 2 1 1]]
y_train
[2 2 2 2 4 2 2 2 4 2 4 4 2 2 2 2 2 4 4 2 2 2 4 2 4 2 2 2 2 2 2 4 4 4 4 2 4
 2 4 2 2 2 4 2 2 4 2 2 2 4 4 4 2 4 4 2 4 4 2 4 2 4 2 2 2 2 2 2 2 2 4 4 2 2
 2 2 2 4 2 4 2 2 2 2 2 2 2 2 4 2 4 2 2 4 4 2 4 2 4 4 4 4 2 2 4 2 2 4 4 4 4
 2 2 2 2 4 4 4 2 4 2 4 2 2 2 2 4 2 2 4 4 2 2 4 4 2 4 2 4 4 2 4 2 4 4 2 2 2
 2 2 2 2 2 4 4 2 2 2 2 2 4 4 2 4 2 4 2 2 2 4 2 2 4 2 2 2 2 2 4 2 4 2 2 2 2
 2 4 2 2 4 2 2 2 4 2 4 2 2 4 2 2 2 2 2 4 2 4 4 2 4 4 2 4 4 2 2 2 4 4 4 2 2
 4 2 2 4 2 2 2 2 4 2 2 2 4 2 2 2 2 4 2 4 4 4 4 2 2 4 2 2 2 2 2 2 2 2 4 2 4
 2 4 2 2 2 2 4 2 4 4 4 4 2 4 2 2 4 2 2 4 2 2 2 2 2 2 2 4 4 2 2 2 2 4 2 2 2
 2 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 2 4 2 2 4 2 4 2 2 2 4 4 2 4 4 2 4 2 4 2 4
 2 2 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 2 4 2 4 4 2 2 2 2 2 2 4 

In [11]:
# Select the KNeighbors classifier
clf = neighbors.KNeighborsClassifier()

# Fit the model on training data
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print("Accuracy = ", accuracy)

Accuracy =  0.9928571428571429


In [12]:
# Example demonstration for reshape method that we are about to use
a = np.arange(6) # create single dim array of 6 elements
b = a.reshape(3, 2) # convert single dim array as 3 elements each with 2 elements
print("a:", a)

print("b:")
print(b)

a: [0 1 2 3 4 5]
b:
[[0 1]
 [2 3]
 [4 5]]


In [13]:
# Example demonstration for reshape method that we are about to use
a = np.arange(6) # create single dim array of 6 elements
b = a.reshape(1, -1) # convert single dim array as 3 elements each with 2 elements
print("a:", a)

print("b:")
print(b)

a: [0 1 2 3 4 5]
b:
[[0 1 2 3 4 5]]


In [14]:
# Apply the model on a fictitious sample measure for prediction
ex_measures = np.array([4, 2, 1, 1, 1, 2, 3, 2, 1])
ex_measures = ex_measures.reshape(1, -1) # convert single dim array to two-dim array as our orginal array was two-dim
ex_measures

# Note: We've supplied measures of single patient. 
#      If we have 3 patients, then we will supply the measures as
#   ex_measures = np.array([4, 2, 1, 1, 1, 2, 3, 2, 1], [6, 2, 2, 4, 1, 2, 2, 3, 1], [5, 4, 2, 1, 2, 2, 4, 2, 2])
# and we get result of 3 patient predictions

array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])

In [15]:
prediction = clf.predict(ex_measures)
print("Prediction on example measure = ", prediction)

Prediction on example measure =  [2]


In [19]:
# Let us try with two sets for prediction
ex_measures_set_2 = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [2, 5, 2, 5, 4, 2, 3, 2, 4]])
prediction_2 = clf.predict(ex_measures_set_2)
print("Prediction on example measure = ", prediction_2)

Prediction on example measure =  [2 2]


In [36]:
# Let us try with three sets for prediction
ex_measures_set_3 = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [9, 1, 2, 5, 4, 2, 3, 9, 4], [2, 3, 4, 1, 1, 1, 3, 9, 2]])
prediction_3 = clf.predict(ex_measures_set_3)
print("Prediction on example measure = ", prediction_3)

Prediction on example measure =  [2 4 2]
