In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [None]:
dataset = pd.read_csv('datas.csv')

In [None]:
dataset.head(1)

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0


In [None]:
dataset.rename(columns = {'6' : 'Pregnancies' , '148' : 'Glucose' , '72' : 'BloodPressure' , '35' : 'SkinThickness' , '0' : 'Insulin' , '33.6' : 'BMI' , '0.627' : 'DiabetesPedigreeFunction' , '50' : 'Age' , '1' : 'outcome'},inplace =True)

In [None]:
dataset.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,outcome
0,1,85,66,29,0,26.6,0.351,31,0


In [None]:
dataset.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,outcome
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               767 non-null    int64  
 1   Glucose                   767 non-null    int64  
 2   BloodPressure             767 non-null    int64  
 3   SkinThickness             767 non-null    int64  
 4   Insulin                   767 non-null    int64  
 5   BMI                       767 non-null    float64
 6   DiabetesPedigreeFunction  767 non-null    float64
 7   Age                       767 non-null    int64  
 8   outcome                   767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
dataset.shape  # no of people and no of features

(767, 9)

In [None]:
# getting the statistical measure of the data
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,outcome
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.842243,120.859192,69.101695,20.517601,79.90352,31.990482,0.471674,33.219035,0.34811
std,3.370877,31.978468,19.368155,15.954059,115.283105,7.889091,0.331497,11.752296,0.476682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.371,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.5,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
dataset['outcome'].value_counts()    #how many diabetic and how many is non diabetic

0    500
1    267
Name: outcome, dtype: int64

level 0 : Non Diabetic people
level 1 : Diabetic People

In [None]:
dataset.groupby('outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.861423,141.23221,70.820225,22.116105,100.71161,35.148315,0.550213,37.018727


In [None]:
# we can see that diabetic people has the amount of glucose present in their body is more

Separating data and levels

In [None]:
X = dataset.drop(columns = 'outcome',axis = 1)
Y = dataset['outcome']

In [None]:
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              1       85             66             29        0  26.6   
1              8      183             64              0        0  23.3   
2              1       89             66             23       94  28.1   
3              0      137             40             35      168  43.1   
4              5      116             74              0        0  25.6   
..           ...      ...            ...            ...      ...   ...   
762           10      101             76             48      180  32.9   
763            2      122             70             27        0  36.8   
764            5      121             72             23      112  26.2   
765            1      126             60              0        0  30.1   
766            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.351   31  
1                       0.672   32  


In [None]:
print(Y)

0      0
1      1
2      0
3      1
4      0
      ..
762    0
763    0
764    0
765    1
766    0
Name: outcome, Length: 767, dtype: int64


Data standardzation

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(X)

In [None]:
standardized_data = scaler.transform(X)  # transfromation to the standardized scaler from

In [None]:
print(standardized_data)

[[-0.84372629 -1.12208597 -0.16024856 ... -0.68372895 -0.36426474
  -0.18894038]
 [ 1.23423997  1.94447577 -0.26357823 ... -1.10230105  0.60470064
  -0.1037951 ]
 [-0.84372629 -0.99692019 -0.16024856 ... -0.49346891 -0.91968415
  -1.0403932 ]
 ...
 [ 0.343683    0.0044061   0.14974046 ... -0.73446496 -0.68423462
  -0.27408566]
 [-0.84372629  0.16086333 -0.47023757 ... -0.23978884 -0.37030191
   1.17338414]
 [-0.84372629 -0.8717544   0.04641078 ... -0.20173684 -0.47293375
  -0.87010264]]


In [None]:
X =  standardized_data

In [None]:
print(X)

[[-0.84372629 -1.12208597 -0.16024856 ... -0.68372895 -0.36426474
  -0.18894038]
 [ 1.23423997  1.94447577 -0.26357823 ... -1.10230105  0.60470064
  -0.1037951 ]
 [-0.84372629 -0.99692019 -0.16024856 ... -0.49346891 -0.91968415
  -1.0403932 ]
 ...
 [ 0.343683    0.0044061   0.14974046 ... -0.73446496 -0.68423462
  -0.27408566]
 [-0.84372629  0.16086333 -0.47023757 ... -0.23978884 -0.37030191
   1.17338414]
 [-0.84372629 -0.8717544   0.04641078 ... -0.20173684 -0.47293375
  -0.87010264]]


Train Test Split

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,stratify=Y,random_state = 2)

# 20% test data and 80% is training data
# we want our dataset to be splited in the same proportion ....that is why we used Stratify = Y
#if I donot mention the keyword Stratify keyword all the diabetic will go to one frame(train frame)
# all the non diabetic will be go to the test data.....this can be a case ...to avoid it we have to use
# Stratify keyword with respect to outcome

In [None]:
print(X.shape,X_train.shape,X_test.shape)

(767, 8) (613, 8) (154, 8)


Training the model

In [None]:
classifier = svm.SVC(kernel = 'linear') # support vector machine classifier classifier

In [None]:
classifier.fit(X_train,Y_train)

Model Evaluation

Accuracy Score

In [None]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [None]:
print(training_data_accuracy)

0.7830342577487766


In [None]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [None]:
print(test_data_accuracy)

0.7857142857142857


In [None]:
# model is not overfitting at well....acuracy score of the training data as well as test data is same

Predictive System:

In [None]:
input_data = (6,148,72,35,0,33.6,0.627,50)

input_data_as_np = np.asarray(input_data)
input_data_reshaped =input_data_as_np.reshape(1,-1) # single rows and multiple columns


std_data = scaler.transform(input_data_reshaped)

print(std_data)

prediction = classifier.predict(std_data)  # classifier function returns a array always
print(prediction)


if prediction[0] == 0 :
  print("Non Diabetic preson")
else:
  print("Diabetic Person")

[[ 0.64053533  0.84927515  0.14974046  0.90834872 -0.69355921  0.20415126
   0.46886437  1.42881999]]
[1]
Diabetic Person


