In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv("diabetes.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [26]:
from sklearn import model_selection
X = data.iloc[:,0:8]
Y = data.iloc[:,8]
seed = 7
validation_size = 0.3
X_train, X_test, Y_train, Y_test =model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
print(X_train)
print(Y_train)

     Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
590      111             84             40        0  46.8   
692      121             70             32       95  39.1   
492       99             68             38        0  32.8   
205      111             72             28        0  23.9   
709       93             64             32      160  38.0   
..       ...            ...            ...      ...   ...   
579      197             70             99        0  34.7   
502        0             68             41        0  39.0   
537       57             60              0        0  21.7   
196      105             58              0        0  24.3   
175      179             72             42      130  32.7   

     DiabetesPedigreeFunction  Age  
590                     0.925   45  
692                     0.886   23  
492                     0.145   33  
205                     0.407   27  
709                     0.674   23  
..                        ...  ...  
579     

In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# transform data
scaled = scaler.fit_transform(data)
print(scaled)

from sklearn import model_selection
X = scaled[:,0:8]
Y = scaled [:,8]
seed = 7
validation_size = 0.3
X_train, X_test, Y_train, Y_test =model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
X_train

[[0.35294118 0.74371859 0.59016393 ... 0.23441503 0.48333333 1.        ]
 [0.05882353 0.42713568 0.54098361 ... 0.11656704 0.16666667 0.        ]
 [0.47058824 0.91959799 0.52459016 ... 0.25362938 0.18333333 1.        ]
 ...
 [0.29411765 0.6080402  0.59016393 ... 0.07130658 0.15       0.        ]
 [0.05882353 0.63316583 0.49180328 ... 0.11571307 0.43333333 1.        ]
 [0.05882353 0.46733668 0.57377049 ... 0.10119556 0.03333333 0.        ]]


array([[0.55778894, 0.68852459, 0.4040404 , ..., 0.69746647, 0.3616567 ,
        0.4       ],
       [0.6080402 , 0.57377049, 0.32323232, ..., 0.58271237, 0.34500427,
        0.03333333],
       [0.49748744, 0.55737705, 0.38383838, ..., 0.48882265, 0.02860803,
        0.2       ],
       ...,
       [0.28643216, 0.49180328, 0.        , ..., 0.32339791, 0.28052946,
        0.76666667],
       [0.52763819, 0.47540984, 0.        , ..., 0.36214605, 0.04654142,
        0.        ],
       [0.89949749, 0.59016393, 0.42424242, ..., 0.48733234, 0.27369769,
        0.25      ]])

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Test options and evaluation metric
seed = 7
#the seed ensures we have the same sequence of random numbers. 
#The random numbers ensure we have a random split of the data into the k folds.
scoring = 'accuracy'
# Spot Check Algorithms
models = []
models.append(('LR',  LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

print(models)

[('LR', LogisticRegression()), ('LDA', LinearDiscriminantAnalysis()), ('KNN', KNeighborsClassifier()), ('CART', DecisionTreeClassifier()), ('NB', GaussianNB()), ('SVM', SVC())]


In [16]:
results=[]
names=[]
#putting a loop for testing various models
for name, model in models:
    kfold=model_selection.KFold(n_splits=15, random_state=seed,shuffle=True)
    cv_results = model_selection.cross_val_score(model,X_train,Y_train, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s : %f (%f)"%(name,cv_results.mean(),cv_results.std())
    print(msg)
    
print(results)

LR : 0.768889 (0.061057)
LDA : 0.770899 (0.056334)
KNN : 0.729947 (0.071978)
CART : 0.713333 (0.071679)
NB : 0.767090 (0.069289)
SVM : 0.772593 (0.063228)
[array([0.80555556, 0.77777778, 0.77777778, 0.77777778, 0.83333333,
       0.77777778, 0.72222222, 0.66666667, 0.80555556, 0.86111111,
       0.77777778, 0.75      , 0.85714286, 0.68571429, 0.65714286]), array([0.80555556, 0.75      , 0.83333333, 0.69444444, 0.83333333,
       0.75      , 0.69444444, 0.69444444, 0.80555556, 0.83333333,
       0.80555556, 0.77777778, 0.85714286, 0.71428571, 0.71428571]), array([0.80555556, 0.61111111, 0.75      , 0.75      , 0.83333333,
       0.72222222, 0.69444444, 0.72222222, 0.75      , 0.75      ,
       0.80555556, 0.58333333, 0.82857143, 0.65714286, 0.68571429]), array([0.77777778, 0.66666667, 0.69444444, 0.66666667, 0.80555556,
       0.63888889, 0.61111111, 0.66666667, 0.63888889, 0.83333333,
       0.83333333, 0.66666667, 0.74285714, 0.77142857, 0.68571429]), array([0.86111111, 0.61111111, 0

In [30]:
#0 or 1
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Make predictions on validation dataset
knn = KNeighborsClassifier(n_neighbors=15, n_jobs=3)

knn.fit(X_train, Y_train)
#knn internal parameters have been set
predictions = knn.predict(X_test)
print(predictions)

[0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.
 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1.
 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0.]


In [33]:
####Metrics for model selection
print("Accuracy Score :", accuracy_score( predictions, Y_test))
print("Confusion Matrix : \n",confusion_matrix( predictions, Y_test))

Accuracy Score : 0.7575757575757576
Confusion Matrix : 
 [[123  32]
 [ 24  52]]


In [34]:
df = pd.DataFrame(data)
corr = df.corr()
corr

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0
