In [11]:
import numpy as np
import pandas as pd
import os

In [12]:
#loading training and testing sets
projectDirPath = os.path.abspath("")

X_train = pd.read_csv(projectDirPath + "\\ready data\\X_train.csv").values
X_test = pd.read_csv(projectDirPath + "\\ready data\\X_test.csv").values
y_train = pd.read_csv(projectDirPath + "\\ready data\\y_train.csv").values.reshape(-1,)
y_test = pd.read_csv(projectDirPath + "\\ready data\\y_test.csv").values.reshape(-1,)

In [13]:
#training the models (3 models with different number of neighbors)
from sklearn.neighbors import KNeighborsClassifier

knn1 = KNeighborsClassifier(n_neighbors = 3)
knn2 = KNeighborsClassifier(n_neighbors = 4)
knn3 = KNeighborsClassifier(n_neighbors = 5)

knn1.fit(X_train, y_train)
knn2.fit(X_train, y_train)
knn3.fit(X_train, y_train)

KNeighborsClassifier()

In [14]:
#validation of the models using k-cross validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, make_scorer

accuracies1 = cross_val_score(estimator = knn1, X = X_train, y = y_train, cv = 10)
accuracies2 = cross_val_score(estimator = knn2, X = X_train, y = y_train, cv = 10)
accuracies3 = cross_val_score(estimator = knn3, X = X_train, y = y_train, cv = 10)

recall_scorer = make_scorer(recall_score, pos_label = 4)
recalls1 = cross_val_score(estimator = knn1, X = X_train, y = y_train, cv = 10, scoring = recall_scorer)
recalls2 = cross_val_score(estimator = knn2, X = X_train, y = y_train, cv = 10, scoring = recall_scorer)
recalls3 = cross_val_score(estimator = knn3, X = X_train, y = y_train, cv = 10, scoring = recall_scorer)

accMean1 = accuracies1.mean()
accStdDev1 = accuracies1.std()
recMean1 = recalls1.mean()
recStdDev1 = recalls1.std()
print("3 neighbors:\nmean of 10 accuracies1: ", accMean1)
print("standard deviation of accuracies1: ", recStdDev1)
print("mean of 10 recalls1: ", accMean1)
print("standard deviation of recalls1: ", recStdDev1, "\n")

accMean2 = accuracies2.mean()
accStdDev2 = accuracies2.std()
recMean2 = recalls2.mean()
recStdDev2 = recalls2.std()
print("4 neighbors:\nmean of 10 accuracies2: ", accMean2)
print("standard deviation of accuracies2: ", recStdDev2)
print("mean of 10 recalls2: ", accMean2)
print("standard deviation of recalls2: ", recStdDev2, "\n")

accMean3 = accuracies3.mean()
accStdDev3 = accuracies3.std()
recMean3 = recalls3.mean()
recStdDev3 = recalls3.std()
print("5 neighbors:\nmean of 10 accuracies3: ", accMean3)
print("standard deviation of accuracies3: ", recStdDev3)
print("mean of 10 recalls3: ", accMean3)
print("standard deviation of recalls3: ", recStdDev3)

3 neighbors:
mean of 10 accuracies1:  0.9751088534107403
standard deviation of accuracies1:  0.05106840454882057
mean of 10 recalls1:  0.9751088534107403
standard deviation of recalls1:  0.05106840454882057 

4 neighbors:
mean of 10 accuracies2:  0.9674528301886791
standard deviation of accuracies2:  0.06526169064983801
mean of 10 recalls2:  0.9674528301886791
standard deviation of recalls2:  0.06526169064983801 

5 neighbors:
mean of 10 accuracies3:  0.9731857764876632
standard deviation of accuracies3:  0.0370596012111225
mean of 10 recalls3:  0.9731857764876632
standard deviation of recalls3:  0.0370596012111225


In [15]:
#testing of the models on the test set and computing : accuracy, recall and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred1 = knn1.predict(X_test)
y_pred2 = knn2.predict(X_test)
y_pred3 = knn3.predict(X_test)

acc1 = accuracy_score(y_test, y_pred1)
acc2 = accuracy_score(y_test, y_pred2)
acc3 = accuracy_score(y_test, y_pred3)

rec1 = recall_score(y_test, y_pred1, pos_label = 4)
rec2 = recall_score(y_test, y_pred2, pos_label = 4)
rec3 = recall_score(y_test, y_pred3, pos_label = 4)

cm1 = confusion_matrix(y_test, y_pred1)
cm2 = confusion_matrix(y_test, y_pred2)
cm3 = confusion_matrix(y_test, y_pred3)

print("3 neighbors:\naccuracy1 on the test set: ", acc1)
print("recall1 on the test set: ", rec1)
print("confusion matrix1:\n ", cm1, "\n")

print("4 neighbors:\naccuracy2 on the test set: ", acc2)
print("recall2 on the test set: ", rec2)
print("confusion matrix2:\n ", cm2, "\n")

print("5 neighbors:\naccuracy3 on the test set: ", acc3)
print("recall3 on the test set: ", rec3)
print("confusion matrix3:\n ", cm3)

3 neighbors:
accuracy1 on the test set:  0.9542857142857143
recall1 on the test set:  0.9310344827586207
confusion matrix1:
  [[113   4]
 [  4  54]] 

4 neighbors:
accuracy2 on the test set:  0.9428571428571428
recall2 on the test set:  0.8793103448275862
confusion matrix2:
  [[114   3]
 [  7  51]] 

5 neighbors:
accuracy3 on the test set:  0.96
recall3 on the test set:  0.9482758620689655
confusion matrix3:
  [[113   4]
 [  3  55]]


In [16]:
#saving the valid model based on highest recall
from joblib import dump

dump(knn3, projectDirPath + "\\models\\K-NN.joblib")

['C:\\Users\\misla\\Desktop\\breast cancer tumor classification\\models\\K-NN.joblib']

In [17]:
#saving evaluation data for all tried out models
import json

knn1Data = {"mean10Acc" : accMean1, "accStd" : accStdDev1, "mean10Rec" : recMean1, "recStd" : recStdDev1, "acc": acc1, "rec": rec1}
knn2Data = {"mean10Acc" : accMean2, "accStd" : accStdDev2, "mean10Rec" : recMean2, "recStd" : recStdDev2, "acc": acc2, "rec": rec2}
knn3Data = {"mean10Acc" : accMean3, "accStd" : accStdDev3, "mean10Rec" : recMean3, "recStd" : recStdDev3, "acc": acc3, "rec": rec3}
evalData = [knn1Data, knn2Data, knn3Data]

cmData1 = {"tn" : int(cm1[0, 0]), "fn" : int(cm1[1, 0]), "tp" : int(cm1[1, 1]), "fp" : int(cm1[0, 1])}
cmData2 = {"tn" : int(cm2[0, 0]), "fn" : int(cm2[1, 0]), "tp" : int(cm2[1, 1]), "fp" : int(cm2[0, 1])}
cmData3 = {"tn" : int(cm3[0, 0]), "fn" : int(cm3[1, 0]), "tp" : int(cm3[1, 1]), "fp" : int(cm3[0, 1])}
cmData = [cmData1, cmData2, cmData3]

i = 1
for data in evalData:
    with open(projectDirPath + "\\modelsData\\knn{}.json".format(i), "w") as file:
        json.dump(data, file)
    i += 1

i = 1
for data in cmData:
    with open(projectDirPath + "\\modelsData\\knn{}Cm.json".format(i), "w") as file:
        json.dump(data, file)
    i += 1
