In [63]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [64]:
# Load the diabetes CSV into a DataFrame.
# NOTE(review): the path is an absolute, environment-specific location — confirm it exists where this runs.
diabetes_dataset = pd.read_csv(
    filepath_or_buffer='/content/diabetes (1).csv',
)

In [65]:
# Preview the first five rows to sanity-check the load.
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [66]:
# Dimensions of the loaded frame as (rows, columns).
diabetes_dataset.shape

(768, 9)

In [67]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# NOTE(review): a minimum of 0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI
# looks like missing values encoded as zero — confirm before modeling.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [68]:
# Class balance: number of rows for each Outcome label.
diabetes_dataset.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


The dataset is imbalanced (500 non-diabetic vs. 268 diabetic rows), so we undersample the majority (non-diabetic) class.

In [69]:
# Partition the rows by label: Outcome == 0 -> non-diabetic, Outcome == 1 -> diabetic.
non_diabetic = diabetes_dataset.loc[diabetes_dataset['Outcome'].eq(0)]
diabetic = diabetes_dataset.loc[diabetes_dataset['Outcome'].eq(1)]

In [70]:
# Show the (rows, columns) of each class subset, one per line.
print(non_diabetic.shape, diabetic.shape, sep='\n')

(500, 9)
(268, 9)


In [71]:
# Undersample the majority class down to the size of the minority class.
# - n is derived from `diabetic` instead of a hard-coded 268, so the classes stay
#   balanced if the input data changes.
# - random_state pins the draw, so Restart & Run All reproduces the same rows
#   (and therefore the same downstream metrics).
non_diabetic_sample = non_diabetic.sample(n=len(diabetic), random_state=2)
print(non_diabetic_sample.shape)

(268, 9)


In [72]:
# Stack the undersampled non-diabetic rows on top of all diabetic rows
# (the original index labels are kept).
new_dataset = pd.concat(objs=(non_diabetic_sample, diabetic), axis='index')

In [73]:
# First five rows of the balanced frame (these come from the non-diabetic sample).
new_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
677,0,93,60,0,0,35.3,0.263,25,0
713,0,134,58,20,291,26.4,0.352,21,0
625,4,90,88,47,54,37.7,0.362,29,0
686,3,130,64,0,0,23.1,0.314,22,0
62,5,44,62,0,0,25.0,0.587,36,0


In [75]:
# Last five rows of the balanced frame (these come from the diabetic rows).
new_dataset.tail(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
755,1,128,88,39,110,36.5,1.057,37,1
757,0,123,72,0,0,36.3,0.258,52,1
759,6,190,92,0,0,35.5,0.278,66,1
761,9,170,74,31,0,44.0,0.403,43,1
766,1,126,60,0,0,30.1,0.349,47,1


In [76]:
# Verify the classes are balanced after undersampling.
new_dataset.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,268
1,268


In [77]:
# Separate the balanced frame into labels (Y) and features (X = every column except Outcome).
Y = new_dataset.Outcome
X = new_dataset.drop(columns=['Outcome'])

In [78]:
# Plain-text dump of the feature frame (pandas truncates the middle rows).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
677            0       93             60              0        0  35.3   
713            0      134             58             20      291  26.4   
625            4       90             88             47       54  37.7   
686            3      130             64              0        0  23.1   
62             5       44             62              0        0  25.0   
..           ...      ...            ...            ...      ...   ...   
755            1      128             88             39      110  36.5   
757            0      123             72              0        0  36.3   
759            6      190             92              0        0  35.5   
761            9      170             74             31        0  44.0   
766            1      126             60              0        0  30.1   

     DiabetesPedigreeFunction  Age  
677                     0.263   25  
713                     0.352   21  


In [79]:
# Plain-text dump of the label series (pandas truncates the middle rows).
print(Y)

677    0
713    0
625    0
686    0
62     0
      ..
755    1
757    1
759    1
761    1
766    1
Name: Outcome, Length: 536, dtype: int64


Data standardization

In [80]:
# Standardize every feature column and keep the fitted scaler for later reuse.
# NOTE(review): the scaler is fit on all rows of X, before the train/test split that
# follows, so test-set statistics influence the transform — consider fitting on the
# training split only.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(X)
print(standardized_data)

[[-1.14991137 -0.97129271 -0.42307905 ...  0.32860102 -0.6474974
  -0.76511297]
 [-1.14991137  0.26210198 -0.52167487 ... -0.7890876  -0.38771207
  -1.10657887]
 [-0.01062765 -1.0615411   0.95726234 ...  0.63000019 -0.3585227
  -0.42364706]
 ...
 [ 0.55901422  1.94673864  1.15445397 ...  0.35371761 -0.60371335
   2.73491256]
 [ 1.41347701  1.34508269  0.26709165 ...  1.42117303 -0.23884632
   0.7714836 ]
 [-0.86509044  0.02143961 -0.42307905 ... -0.32443054 -0.39646888
   1.11294951]]


In [81]:
# Rebind X to the standardized array; Y stays the Outcome column of the balanced frame.
X, Y = standardized_data, new_dataset['Outcome']

In [82]:
# Show the standardized features followed by the labels.
print(X, Y, sep='\n')

[[-1.14991137 -0.97129271 -0.42307905 ...  0.32860102 -0.6474974
  -0.76511297]
 [-1.14991137  0.26210198 -0.52167487 ... -0.7890876  -0.38771207
  -1.10657887]
 [-0.01062765 -1.0615411   0.95726234 ...  0.63000019 -0.3585227
  -0.42364706]
 ...
 [ 0.55901422  1.94673864  1.15445397 ...  0.35371761 -0.60371335
   2.73491256]
 [ 1.41347701  1.34508269  0.26709165 ...  1.42117303 -0.23884632
   0.7714836 ]
 [-0.86509044  0.02143961 -0.42307905 ... -0.32443054 -0.39646888
   1.11294951]]
677    0
713    0
625    0
686    0
62     0
      ..
755    1
757    1
759    1
761    1
766    1
Name: Outcome, Length: 536, dtype: int64


In [83]:
# Hold out 20% of the rows for testing. stratify=Y splits in a class-stratified
# fashion and random_state=2 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
# Shapes of the full, training and test feature matrices.
print(*(part.shape for part in (X, X_train, X_test)))

(536, 8) (428, 8) (108, 8)


In [84]:
# Build a support vector classifier with a linear kernel.
classifier = svm.SVC(kernel='linear')
# Fit the classifier on the training split.
classifier.fit(X_train, Y_train)

Model evaluation

In [85]:
# Accuracy on the training data.
# accuracy_score's signature is (y_true, y_pred). Passing both by keyword keeps
# the ground-truth labels and the predictions from being swapped.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

In [86]:
# Report the fraction of training rows the classifier labels correctly.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7897196261682243


In [87]:
# Accuracy on the held-out test data.
# accuracy_score's signature is (y_true, y_pred). Passing both by keyword keeps
# the ground-truth labels and the predictions from being swapped.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

In [88]:
# Report the fraction of held-out test rows the classifier labels correctly.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.6666666666666666


Making a predictive system

In [92]:
# One person's measurements. The values are assumed to follow the training column
# order (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age) — verify when changing them.
input_data = (4, 110, 92, 0, 0, 37.6, 0.191, 30)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array to (1, n_features) as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The scaler was fit on a DataFrame, so it recorded the column names. Wrapping the
# row in a DataFrame with those names avoids scikit-learn's
# "X does not have valid feature names" warning and fails early if the number of
# values does not match the number of training columns. getattr keeps this working
# on scikit-learn versions that do not record feature names.
input_features = pd.DataFrame(
    input_data_reshaped,
    columns=getattr(scaler, 'feature_names_in_', None),
)

# standardize the input data with the already-fitted scaler
std_data = scaler.transform(input_features)
print(std_data)

# predict the class label for the single standardized row
prediction = classifier.predict(std_data)
print(prediction)

# label 0 -> not diabetic, anything else -> diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[[-0.01062765 -0.45988515  1.15445397 -1.2808025  -0.69262932  0.61744189
  -0.85766081 -0.33828059]]
[0]
The person is not diabetic


