# Using SUPPORT VECTOR MACHINE on Diabetes dataset
## Diabetes Prediction

Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm

Data Collection and analysis

In [2]:
# Read the diabetes records from the CSV file into a DataFrame.
csv_path = "diabetes.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Preview the first five records to check the columns loaded as expected.
first_rows = dataset.head(5)
print(first_rows)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Report the dataset size as (number of rows, number of columns).
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

(768, 9)


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
summary_stats = dataset.describe()
print(summary_stats)

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [6]:
# Count how many records fall into each Outcome class.
outcome_counts = dataset["Outcome"].value_counts()
print(outcome_counts)

0    500
1    268
Name: Outcome, dtype: int64


- 0 -> Non-diabetic
- 1 -> Diabetic

Separating the features (X) from the target label (Y)

In [7]:
# Feature matrix: every column except the "Outcome" label.
# Dropping by column name avoids relying on the label sitting at position 8
# and on DataFrame iteration happening to yield column names.
X = dataset.drop(columns=["Outcome"])

In [8]:
# Show the feature matrix (the eight predictor columns, without "Outcome").
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [9]:
# Target labels, kept as a one-column DataFrame.
# Selecting by name avoids relying on "Outcome" being the column at position 8.
Y = dataset[["Outcome"]]

In [10]:
# Show the target labels (one "Outcome" column, one row per record).
print(Y)

     Outcome
0          1
1          0
2          1
3          0
4          1
..       ...
763        0
764        0
765        0
766        1
767        0

[768 rows x 1 columns]


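Data Standardisation (currently disabled: the scaler cells below are commented out, so the model is trained on unscaled features)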

In [11]:
#scaler=StandardScaler()

In [12]:
#X_=scaler.fit_transform(X)

In [13]:
#print(X_)

In [14]:
# Show the target labels again; the scaler cells above are commented out,
# so neither X nor Y has been modified at this point.
print(Y)

     Outcome
0          1
1          0
2          1
3          0
4          1
..       ...
763        0
764        0
765        0
766        1
767        0

[768 rows x 1 columns]


Splitting data

In [15]:
# Hold out 30% of the rows for testing. Stratifying on Y keeps the class
# proportions similar in both splits; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=2,
)

Training the Model using SVM

In [16]:
# Support vector classifier with a linear kernel.
model_classifier = svm.SVC(kernel="linear")

In [17]:
# Flatten the one-column label frame into a 1-D array before fitting.
train_labels = Y_train.to_numpy().ravel()
model_classifier.fit(X_train, train_labels)

SVC(kernel='linear')

Model Evaluation and score

In [18]:
# Mean accuracy of the classifier on the held-out test split.
test_accuracy = model_classifier.score(X_test, Y_test)
# Number of test rows / feature columns; `rows` is read by the next cell.
rows, columns = X_test.shape
print("Accuracy score of the model is ", test_accuracy * 100, '%')

Accuracy score of the model is  77.48917748917748 %


## Predicting Diabetes or not

In [19]:
# Predict the Outcome class for every row of the test split and print a
# human-readable verdict per patient.
print(X_test)
prediction = model_classifier.predict(X_test)
print("Prediction on test data")
# Label convention for this dataset: 0 -> non diabetic, 1 -> diabetic.
# Iterate over the predictions directly instead of relying on a row count
# computed in another cell.
for predicted_label in prediction:
    if predicted_label == 1:
        print("The patient is diabetic")
    else:
        print("The patient is not diabetic")

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
46             1      146             56              0        0  29.7   
438            1       97             70             15        0  18.2   
756            7      137             90             41        0  32.0   
306           10      161             68             23      132  25.5   
224            1      100             66             15       56  23.6   
..           ...      ...            ...            ...      ...   ...   
186            8      181             68             36      495  30.1   
109            0       95             85             25       36  37.4   
75             1        0             48             20        0  24.7   
679            2      101             58             17      265  24.2   
132            3      170             64             37      225  34.5   

     DiabetesPedigreeFunction  Age  
46                      0.564   29  
438                     0.147   21  


Testing the model on a single instance

In [23]:
# Classify one hand-entered patient record with the trained model.
print("Given data")
input_data=[7,181,84,21,192,35.9,0.586,51]
print(input_data)
# The model was fitted on a DataFrame, so pass a one-row DataFrame with the
# same column names (values are listed in the order of X's columns). This
# avoids scikit-learn's "X does not have valid feature names" warning.
# No scaling is applied because the model was trained on unscaled features.
data = pd.DataFrame([input_data], columns=X.columns)
prediction = model_classifier.predict(data)
print("Outcome predicted: ",end="")
print(prediction[0])
# Label convention: 1 -> diabetic, 0 -> non diabetic.
if prediction[0] == 1:
    print("The patient is having diabetes")
else:
    print("The patient is non diabetic")

Given data
[7, 181, 84, 21, 192, 35.9, 0.586, 51]
Outcome predicted: 1
The patient is having diabetes


SAVING THE TRAINED MODEL

In [24]:
import pickle

In [25]:
# Serialise the trained classifier to disk.
filename = "trained_model.sav"
# The context manager guarantees the file is flushed and closed even if
# pickling raises.
with open(filename, "wb") as model_file:
    pickle.dump(model_classifier, model_file)

Loading the saved model

In [27]:
# Restore the classifier written by the save cell.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open(filename, "rb") as model_file:
    load_model = pickle.load(model_file)

In [29]:
# Classify the same hand-entered record with the model restored from disk,
# to confirm it predicts like the in-memory model.
print("Given data")
input_data=[7,181,84,21,192,35.9,0.586,51]
print(input_data)
# Pass a one-row DataFrame with the training column names (values are listed
# in the order of X's columns) so scikit-learn does not warn about missing
# feature names. No scaling is applied because the model was trained on
# unscaled features.
data = pd.DataFrame([input_data], columns=X.columns)
prediction = load_model.predict(data)
print("Outcome predicted: ",end="")
print(prediction[0])
# Label convention: 1 -> diabetic, 0 -> non diabetic.
if prediction[0] == 1:
    print("The patient is having diabetes")
else:
    print("The patient is non diabetic")

Given data
[7, 181, 84, 21, 192, 35.9, 0.586, 51]
Outcome predicted: 1
The patient is having diabetes
