In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import accuracy_score

pd.set_option('display.max_rows', 100)

In [2]:
columns = ['id', 'clump_thickness', 'unif_cell_size', 'unif_cell_shape', 'marg_adhesion',
           'single_epith_cell_size', 'bare_nuclei', 'bland_chrom', 'norm_nucleoli', 'mitoses', 'class']

df = pd.read_csv('breast-cancer-wisconsin.data', header=None, names=columns)
df.head(10)

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [3]:
df = df.replace(['?'],np.NaN)
df.isnull().sum()

id                         0
clump_thickness            0
unif_cell_size             0
unif_cell_shape            0
marg_adhesion              0
single_epith_cell_size     0
bare_nuclei               16
bland_chrom                0
norm_nucleoli              0
mitoses                    0
class                      0
dtype: int64

In [4]:
# replace the missing value with the mode value of that column
df = df.apply(pd.to_numeric)
mode = df['bare_nuclei'].mode()
df['bare_nuclei'].fillna(mode[0], inplace =True)

# check the missing value again, this time the number of missing value should be 0
df.isnull().sum()

id                        0
clump_thickness           0
unif_cell_size            0
unif_cell_shape           0
marg_adhesion             0
single_epith_cell_size    0
bare_nuclei               0
bland_chrom               0
norm_nucleoli             0
mitoses                   0
class                     0
dtype: int64

In [5]:
# drop unwanted column - 'id'
df = df.drop("id",axis=1)
df.head(5)

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [6]:
# define dataset
X = df.drop(['class'],axis=1)
y = df[['class']]

# scale the feature, Use StandardScaler method
scaler = preprocessing.StandardScaler()
X_scaler = scaler.fit_transform(X)

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=5)
random_state = None
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2,random_state=random_state)

# Creare KNN classifier for K=1, fit it and generate Prediction and Accuracy classification score
KNN_model = KNeighborsClassifier(n_neighbors=5,algorithm='brute')
KNN_model.fit(X_train, y_train) 
KNN_prediction = KNN_model.predict(X_test)
accuracy = accuracy_score(y_test, KNN_prediction)
print('When k=5, the accuracy is {}.'.format(accuracy))

When k=5, the accuracy is 0.9642857142857143.


  return self._fit(X, y)


In [8]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_scaler, y, test_size=0.4,random_state=random_state)

# Creare KNN classifier for K=1, fit it and generate Prediction and Accuracy classification score
KNN_model = KNeighborsClassifier(n_neighbors=5,algorithm='brute')
KNN_model.fit(X_train2, y_train2) 
KNN_prediction2 = KNN_model.predict(X_test2)
accuracy2 = accuracy_score(y_test2, KNN_prediction2)
print('When k=5, the accuracy is {}.'.format(accuracy2))

When k=5, the accuracy is 0.9571428571428572.


  return self._fit(X, y)
