In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("diabetes.csv")

### depending columns


In [3]:
depends = data.corr()["Outcome"]*100
print(depends)

Pregnancies                  22.189815
Glucose                      46.658140
BloodPressure                 6.506836
SkinThickness                 7.475223
Insulin                      13.054795
BMI                          29.269466
DiabetesPedigreeFunction     17.384407
Age                          23.835598
Outcome                     100.000000
Name: Outcome, dtype: float64


BloodPressure and Skinthickness is less veryless so we can except those

In [4]:
data = data.drop(columns=["BloodPressure","SkinThickness"])
print(data.shape)

(768, 7)


### check about outliers and remove

In [5]:
print(data.iloc[:,:].max())

Pregnancies                  17.00
Glucose                     199.00
Insulin                     846.00
BMI                          67.10
DiabetesPedigreeFunction      2.42
Age                          81.00
Outcome                       1.00
dtype: float64


In [6]:
print(data.iloc[:,:].min())

Pregnancies                  0.000
Glucose                      0.000
Insulin                      0.000
BMI                          0.000
DiabetesPedigreeFunction     0.078
Age                         21.000
Outcome                      0.000
dtype: float64


#### remove the outlnes of pregnancy column

In [7]:
outlierindexes = []
outlierindexes.extend(data.index[data["Pregnancies"]>10].tolist())
data = data.drop(data.index[outlierindexes])
print(data.shape)

(734, 7)


#### check suspect columns

In [8]:
print(data["Insulin"].value_counts())

0      356
105     10
130      8
120      8
140      8
      ... 
38       1
43       1
108      1
73       1
112      1
Name: Insulin, Length: 180, dtype: int64


there is huge range of data in Insulij column so we have to scale those those data

#### scale Insulin column

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[["Insulin"]])
print(scaled_data)

[[0.        ]
 [0.        ]
 [0.        ]
 [0.11111111]
 [0.19858156]
 [0.        ]
 [0.10401891]
 [0.        ]
 [0.64184397]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [1.        ]
 [0.20685579]
 [0.        ]
 [0.27186761]
 [0.        ]
 [0.09810875]
 [0.11347518]
 [0.27777778]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.13593381]
 [0.        ]
 [0.16548463]
 [0.        ]
 [0.        ]
 [0.28959811]
 [0.06382979]
 [0.        ]
 [0.        ]
 [0.22695035]
 [0.        ]
 [0.        ]
 [0.24468085]
 [0.08274232]
 [0.        ]
 [0.        ]
 [0.28368794]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.09692671]
 [0.04255319]
 [0.02718676]
 [0.35460993]
 [0.40425532]
 [0.        ]
 [0.35933806]
 [0.13002364]
 [0.        ]
 [0.1678487 ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.15130024]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.        ]
 [0.04491726]
 [0.11820331]
 [0.10638298]
 [0.16548463]
 [0.31914894]
 [0.        ]
 [0.  

In [12]:
data["Insulin"] = scaled_data

In [13]:
print(data["Insulin"])

0      0.000000
1      0.000000
2      0.000000
3      0.111111
4      0.198582
         ...   
763    0.212766
764    0.000000
765    0.132388
766    0.000000
767    0.000000
Name: Insulin, Length: 734, dtype: float64


### Ready data to process

In [14]:
x = data.drop("Outcome",axis=1)
y = data["Outcome"]


In [15]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)


### Check for the best model

In [16]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [17]:
svc = SVC()
KNN = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

def model_acc(model):
    model.fit(x_train,y_train)
    acc = model.score(x_test,y_test)
    print("Score",model ,":" , acc*100)

model_acc(svc)
model_acc(KNN)
model_acc(dt)
model_acc(rf)


Score SVC() : 77.55102040816327
Score KNeighborsClassifier() : 76.87074829931973
Score DecisionTreeClassifier() : 74.82993197278913
Score RandomForestClassifier() : 78.2312925170068


SVC got the highest score so we use SVC to this prolem.

#### next check for the parameters

In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {"C":[0.2,5,20], "kernel":["linear", "poly", "rbf"]}

grid_obj = GridSearchCV(estimator=svc,param_grid=parameters)
grid_obj.fit(x_train,y_train)
print(grid_obj.best_params_)

{'C': 5, 'kernel': 'linear'}


### Train the model with that parameters

In [19]:
model = SVC(kernel="linear", C=20) # change parameters
model.fit(x, y)
pre = model.predict(x_test)
print(pre)


[1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 0 1]


In [23]:
from sklearn.metrics import accuracy_score as acu

acc = acu(pre,y_test)
print(acc)

0.8027210884353742
