<a href="https://colab.research.google.com/github/chrisogonas/MLearningModels/blob/main/stat718_k_nearest_neighbors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K-Nearest Neighbors (K-NN)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

## Importing the dataset

In [2]:
dataset = pd.read_csv('/content/sample_data/mydata/PimaIndiansDiabetes2.csv')
X = dataset.iloc[:, 2:].values
y = dataset.iloc[:, 1].values

## Splitting the dataset into the Training set and Test set

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [4]:
print(X_train)

[[  1.    128.     48.    ...  40.5     0.613  24.   ]
 [  3.    158.     70.    ...  35.5     0.344  35.   ]
 [  1.    128.     98.    ...  32.      1.321  33.   ]
 ...
 [  6.    119.     50.    ...  27.1     1.318  33.   ]
 [  4.    144.     58.    ...  29.5     0.287  37.   ]
 [  8.    126.     88.    ...  38.5     0.349  49.   ]]


In [5]:
print(y_train)

[1 1 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 0
 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0
 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1
 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 0
 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 0
 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0]


In [None]:
print(X_test)

In [7]:
print(y_test)

[1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0
 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1]


## Feature Scaling

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
print(X_train)

[[-0.74336231  0.13126715 -1.91904292 ...  1.00907749  0.24568836
  -0.67204882]
 [-0.09097867  1.08731695 -0.07141422 ...  0.31234479 -0.51438478
   0.40496103]
 [-0.74336231  0.13126715  2.28011322 ... -0.17536809  2.24617826
   0.20914106]
 ...
 [ 0.88759679 -0.15554778 -1.75107668 ... -0.85816614  2.23770161
   0.20914106]
 [ 0.23521315  0.64116038 -1.07921169 ... -0.52373444 -0.67544117
   0.600781  ]
 [ 1.53998043  0.0675305   1.44028199 ...  0.73038441 -0.50025702
   1.77570083]]


In [None]:
print(X_test)

## Training the K-NN model on the Training set

In [11]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Predicting a new result

In [12]:
print(classifier.predict(sc.transform([[7,187,68,39,304,37.7,0.254,41]])))
print(classifier.predict(sc.transform([[1,95,66,13,38,19.6,0.334,25]])))

[1]
[0]


## Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

## Making the Confusion Matrix

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accur = accuracy_score(y_test, y_pred)

[[59  6]
 [13 20]]


In [15]:
print('Accuracy : ', round(accur * 100,2), '%')
sensitivity1 = cm[0,0]/(cm[0,0]+cm[0,1])
print('Sensitivity : ', sensitivity1 )

specificity1 = cm[1,1]/(cm[1,0]+cm[1,1])
print('Specificity : ', specificity1)

Accuracy :  80.61 %
Sensitivity :  0.9076923076923077
Specificity :  0.6060606060606061


### Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores

In [16]:
# visualize the plots use matplotlib and import roc_curve,auc from sklearn.metrics 
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef

# actual = X_train[tuple('diabetis_binary')]
# false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted_values1)
# plt.title('Receiver Operating Characteristic')
# plt.plot(false_positive_rate, true_positive_rate)
# plt.plot([0,1],[0,1],'r--')
# plt.xlim([-0.1,1.2])
# plt.ylim([-0.1,1.2])
# plt.ylabel('True Positive Rate(Sensitivity)')
# plt.xlabel('False Positive Rate(Specificity)')
# plt.show()


In [18]:
# Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores
y_pred_proba = classifier.predict_proba(X_test)
print(roc_auc_score(y_test, y_pred_proba[:,1]))
print("Roc AUC:", roc_auc_score(y_test, classifier.predict_proba(X_test)[:,1],average='macro'))

0.8603729603729604
Roc AUC: 0.8603729603729604


### Matthews correlation coefficient

In [19]:
# matthews_corrcoef(y_true, y_pred)
matthews_corrcoef(y_pred.reshape(len(y_test),1), y_pred.reshape(len(y_pred),1))

1.0

## Visualising the Training set results

## Visualising the Test set results