In [12]:
# Load the diabetes data set (downloaded from Kaggle)

import pandas as pd
# Read the CSV into a pandas DataFrame; the path is relative to the current working directory

df = pd.read_csv('datasets/diabetes.csv')

# Check the data has been read in properly by previewing the first rows and the column names - ALWAYS do this!
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(768, 9)

In [14]:
# There are 768 patients (rows) and 9 columns - 8 input features and 1 target column (Outcome)
# that classifies the patient as 1 (diabetes) or 0 (no diabetes).
# We need to separate this column from the inputs, as the outcome is what we want to predict.

In [17]:
# Build the feature matrix by dropping the target column with the pandas drop function.
# By convention this is called X (the base dataset excluding the target).

X = df.drop("Outcome", axis=1)

In [18]:
# Preview the feature matrix - the Outcome column should no longer be present
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [22]:
# The target variable is conventionally called 'y'

y = df.loc[:, "Outcome"].values

# .values yields an array rather than a DataFrame, so slice it by position - show the first ten labels
y[:10]

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1])

In [23]:
# Now split the data into training and test sets (a validation set should ideally be held out as well).
# scikit-learn does this for us with train_test_split - often 20-30% of the data is used for testing
# and 70-80% for training; a validation set, when used, is typically carved out of the training portion.

from sklearn.model_selection import train_test_split

# Split both X and y in a single call. The four returned pieces are bound to their names
# through Python's tuple unpacking.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,   # hold out 20% of the rows for testing
    random_state=1,  # fixed seed so the split is reproducible
    stratify=y,      # keep the class proportions of y in both splits
)

In [24]:
# Note that 'stratify' was set to y (the target array) when splitting - this keeps the proportion of each
# possible target value the same in the training and test sets as in the full data set.

# The next step is to build our model - we are using the scikit-learn KNN classifier
from sklearn.neighbors import KNeighborsClassifier

scikit_knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the training data only. fit() hands back the estimator itself,
# so leaving the call as the last expression displays the configured model.
# NOTE(review): the features are passed in unscaled; KNN is distance-based, so standardising
# the columns first may be worth trying - confirm against the intended analysis.
scikit_knn.fit(X=X_train, y=y_train)



KNeighborsClassifier(n_neighbors=3)

In [25]:
# The n_neighbors parameter sets how many of the nearest training items are consulted for each new data point
# we are trying to predict. KNN then classifies the new point by majority vote - with 3 neighbours and two
# categories, whichever category gets at least 2 votes is the category the item will fall into.
# The neighbours are chosen by their distance from the new data point.

In [27]:
# Now that the model is trained we can see how it performs on our held-out test data.
# Start by looking at the first few predictions.

# Predict a label for every test row, then show the first 5
test_predictions = scikit_knn.predict(X_test)
test_predictions[:5]

array([0, 0, 0, 0, 1])

In [29]:
# Now we can check the accuracy of our model on the test data.
# Remember we hold the real labels (y_test), so score() can compare them against the model's predictions.

scikit_knn.score(X_test, y_test)

0.6688311688311688

In [30]:
# So our model predicts the outcome with an accuracy of about 67%,
# which is a little disappointing... it could be that we need more neighbours.

# First of all we do some cross-validation - split the data into k groups (folds) and then run the process
# k times, each time training on k-1 of the folds and testing on the remaining one.
# This means that each of our k folds gets to be the test set exactly once.

In [32]:
# A single train_test_split gives only one, fairly blunt, accuracy estimate. Cross-validation instead
# generates k accuracy scores - one per fold - and we can take their mean.

from sklearn.model_selection import cross_val_score
import numpy as np

# A fresh, unfitted KNN model for cross-validation
scikit_knn_cv = KNeighborsClassifier(n_neighbors=3)

# Evaluate the model with cv=5 (i.e. 5 folds) over the whole data set
cv_scores = cross_val_score(estimator=scikit_knn_cv, X=X, y=y, cv=5)

# Print each fold's accuracy, followed by their mean
mean_cv_score = np.mean(cv_scores)
print(cv_scores)
print('cv_scores mean:{}'.format(mean_cv_score))

[0.68181818 0.69480519 0.75324675 0.75163399 0.68627451]
cv_scores mean:0.7135557253204311


In [35]:
# This shows that each fold produced a different accuracy score; their mean was about 71.4%.

# The original number of neighbours (3) was an arbitrary choice. Hyperparameters like this can be tuned so that
# the best-performing value is found. Grid search does this by trying a range of parameter values, scoring
# each one with cross-validation - so not only are the training/test folds varied, the number of neighbours
# is varied as well.

from sklearn.model_selection import GridSearchCV

# Create a new KNN model; n_neighbors is left for the search to set
scikit_knn2 = KNeighborsClassifier()

# Dictionary of the values to test for n_neighbors, built from a numpy array:
# np.arange(1, 25) covers 1 to 24 inclusive (the stop value is excluded)
param_grid = {'n_neighbors': np.arange(1, 25)}

# Grid search scores every candidate value of n_neighbors using 5-fold cross-validation
scikit_knn_gscv = GridSearchCV(estimator=scikit_knn2, param_grid=param_grid, cv=5)

# Fit the search to our data.
# NOTE(review): this uses the full X and y, which includes the rows placed in X_test earlier,
# so the resulting score is a cross-validated estimate rather than a held-out test score.
scikit_knn_gscv.fit(X, y)

# The best number of neighbours can be read from the 'best_params_' attribute
scikit_knn_gscv.best_params_


{'n_neighbors': 14}

In [36]:
# So with n_neighbors set to 14 the model gave the best cross-validated accuracy of the values tried on this data.
# We can check the score achieved with that setting via best_score_.

scikit_knn_gscv.best_score_

0.7578558696205755

In [37]:
# We can validate this easily enough by rerunning the original cross-validation model with the tuned value.
# Take the number of neighbours from the grid search result rather than hard-coding 14, so this check
# stays in step with the search if the data or the parameter grid changes.
best_k = int(scikit_knn_gscv.best_params_['n_neighbors'])

scikit_knn_cv = KNeighborsClassifier(n_neighbors=best_k)

# Evaluate the model with cv=5 (i.e. 5 folds), matching the grid search
cv_scores = cross_val_score(scikit_knn_cv, X, y, cv=5)

# Print each fold's accuracy and their mean; the mean should match the grid search's best_score_
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.75974026 0.72077922 0.75974026 0.81045752 0.73856209]
cv_scores mean:0.7578558696205755
