## SVM

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

In [None]:
# Load the diabetes data from CSV into a DataFrame.
# NOTE(review): hardcoded absolute path (looks like a Colab working directory) — confirm
# the file exists there before running on a fresh kernel.
dataset = pd.read_csv("/content/diabetes.csv")

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# List the column labels of the loaded frame.
# NOTE(review): the saved output of this cell is a NameError, so it was presumably last
# run before `dataset` existed — re-run it after the load cell.
dataset.columns

NameError: ignored

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(768, 9)

In [None]:
# True if at least one cell in the frame is null (NaN/None).
# This does not detect zeros that are used as placeholders for missing measurements.
dataset.isnull().values.any()

False

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Class balance: how many rows carry each Outcome label.
dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [None]:
# Mean of every feature within each Outcome class, to compare the two groups.
dataset.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


Separating the data and labels

In [None]:
# Feature matrix: every column except the target.
X = dataset.drop("Outcome", axis=1)
# Target vector: the Outcome column.
Y = dataset["Outcome"]

In [None]:
# Confirm that the target column is no longer among the feature columns.
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [None]:
# Preview the first few target labels.
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [None]:
# Plain-text dump of the feature frame (pandas truncates the middle rows).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [None]:
# Plain-text dump of the target series (pandas truncates the middle rows).
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Data Standardization

In [None]:
# Scaler that standardizes each feature; it is fitted in a later cell and reused to
# transform hand-entered records before prediction.
scaler = StandardScaler()

In [None]:
# Learn per-feature scaling statistics from X, then apply them to X.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so the test rows influence the scaling statistics.
standard_data = scaler.fit(X).transform(X)

In [None]:
# Inspect the standardized feature matrix (a NumPy array, printed in truncated form).
print(standard_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
# From here on X is the standardized array; Y is still the Outcome column.
X, Y = standard_data, dataset["Outcome"]

In [None]:
# Show the standardized features followed by the labels, each starting on its own line.
print(X, Y, sep="\n")

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Train Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify on Y so both splits keep the class
# proportions, and fix the seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Data Preprocessing

In [None]:
# Report how many rows hold a zero in each feature column of the raw dataset.
# A zero in Glucose, BloodPressure, SkinThickness, BMI or Insulin presumably stands for
# a missing measurement (TODO confirm against the data source), whereas zero
# Pregnancies is a legitimate value — so the counts are labelled "zero", not "missing".
print("Total number of rows {0}".format(len(dataset)))
for feature in (
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "BMI",
    "Insulin",
    "DiabetesPedigreeFunction",
    "Age",
):
    zero_rows = int((dataset[feature] == 0).sum())
    print("Total number of rows with zero {0}: {1}".format(feature, zero_rows))

Total number of rows 768
Total number of rows missing pregnansies 111
Total number of rows missing glucose 5
Total number of rows missing bloodPressure35
Total number of rows missing skinThickness 227
Total number of rows missing BMI 11
Total number of rows missing insultin 374
Total number of rows missing diabetesPredictFunction 0
Total number of rows missing age 0


In [None]:
from sklearn.impute import SimpleImputer

# Replace entries equal to 0 with the per-column mean.
# NOTE(review): X_train / X_test are slices of the already standardized matrix, so the
# raw zero placeholders are no longer 0 at this point and this step likely changes
# nothing. Imputing the raw columns before scaling would be needed for it to take effect.
fill_values = SimpleImputer(missing_values = 0, strategy = "mean")

# Learn the column means from the training split only, then reuse them for the test
# split; re-fitting on the test split would leak test statistics into preprocessing.
X_train = fill_values.fit_transform(X_train)
X_test = fill_values.transform(X_test)

In [None]:
# Sanity-check the split sizes against the full feature matrix.
print(*(part.shape for part in (X, X_train, X_test)))

(768, 8) (614, 8) (154, 8)


Training Models

In [None]:
# Support-vector classifier with a linear kernel (unfitted at this point).
classifier = svm.SVC(kernel = "linear")

In [None]:
# Fit the SVM on the training split.
classifier.fit(X_train, Y_train)

SVC(kernel='linear')

Model Evaluation

In [None]:
# Accuracy of the SVM on the rows it was trained on.
X_train_prediction = classifier.predict(X_train)
train_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print("Accuracy on train data : ", train_accuracy)

Accuracy on train data :  0.7866449511400652


In [None]:
# Accuracy of the SVM on the held-out test rows.
X_test_prediction = classifier.predict(X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print("Accuracy on test data : ", test_accuracy)

Accuracy on test data :  0.7727272727272727


In [None]:
# Predict the outcome for one hand-entered record with the fitted SVM.
# Values follow the feature column order: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
input_data = (1,80,66,29,0,26.6,0.35,31)
input_array = np.asarray(input_data)
input_reshape = input_array.reshape(1,-1)  # one sample -> 2-D array with a single row
# Apply the already-fitted scaler. The result gets its own name so the full
# standardized matrix held in `standard_data` is not overwritten by this one-row array.
input_standardized = scaler.transform(input_reshape)
print(input_standardized)
prediction = classifier.predict(input_standardized)
# predict returns a length-1 array here; test its single element explicitly.
if prediction[0] == 0:
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[[-0.84488505 -1.27988209 -0.16054575  0.53090156 -0.69289057 -0.68442195
  -0.3680809  -0.19067191]]
The person is not diabetic




## DECISION TREE

In [None]:
# Re-display the standardized features and labels before training the next model.
print(X, Y)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]] 0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow entropy-based decision tree. scikit-learn permutes the candidate features at
# every split, so the seed is fixed (same value as the train/test split) to make the
# fitted tree reproducible across runs.
clf = DecisionTreeClassifier(criterion = "entropy", max_depth = 3, random_state = 2)
clf.fit(X_train, Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3)

In [None]:
# Accuracy of the decision tree on the rows it was trained on.
train_predict_DT = clf.predict(X_train)
train_accuracy_DT = accuracy_score(y_true=Y_train, y_pred=train_predict_DT)
print("Accuracy on the training data : ", train_accuracy_DT)

Accuracy on the training data :  0.7801302931596091


In [None]:
# Accuracy of the decision tree on the held-out test rows.
test_predict_DT = clf.predict(X_test)
test_accuracy_DT = accuracy_score(y_true=Y_test, y_pred=test_predict_DT)
print("Accuracy on the testing data : ", test_accuracy_DT)

Accuracy on the testing data :  0.7402597402597403


In [None]:
# Predict the outcome for one hand-entered record with the fitted decision tree.
# Values follow the feature column order: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
input_data = (4,110,92,0,0,37.6,0.191,30)
input_array = np.asarray(input_data)
input_reshape = input_array.reshape(1,-1)  # one sample -> 2-D array with a single row
# Apply the already-fitted scaler. The result gets its own name so the full
# standardized matrix held in `standard_data` is not overwritten by this one-row array.
input_standardized = scaler.transform(input_reshape)
print(input_standardized)
prediction = clf.predict(input_standardized)
# predict returns a length-1 array here; test its single element explicitly.
if prediction[0] == 0:
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[[ 0.04601433 -0.34096773  1.18359575 -1.28821221 -0.69289057  0.71168975
  -0.84827977 -0.27575966]]
The person is not diabetic




## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of shallow entropy-based trees. The forest draws bootstrap samples and
# feature subsets at random, so the seed is fixed (same value as the train/test split)
# to make the reported accuracies reproducible across runs.
random_forest_classifier = RandomForestClassifier(criterion = "entropy", max_depth = 3, random_state = 2)
random_forest_classifier.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', max_depth=3)

In [None]:
# Accuracy of the random forest on the rows it was trained on.
random_predict = random_forest_classifier.predict(X_train)
random_accuracy = accuracy_score(y_true=Y_train, y_pred=random_predict)
print("Accuracy on the training data : ", random_accuracy)

Accuracy on the training data :  0.7980456026058632


In [None]:
# Accuracy of the random forest on the held-out test rows.
random_predict_test = random_forest_classifier.predict(X_test)
random_accuracy_test = accuracy_score(y_true=Y_test, y_pred=random_predict_test)
print("Accuracy on the testing data : ", random_accuracy_test)

Accuracy on the testing data :  0.7077922077922078


In [None]:
# Predict the outcome for one hand-entered record with the fitted random forest.
# Values follow the feature column order: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
input_data = (4,110,92,0,0,37.6,0.191,30)
input_array = np.asarray(input_data)
input_reshape = input_array.reshape(1,-1)  # one sample -> 2-D array with a single row
# Apply the already-fitted scaler. The result gets its own name so the full
# standardized matrix held in `standard_data` is not overwritten by this one-row array.
input_standardized = scaler.transform(input_reshape)
print(input_standardized)
prediction = random_forest_classifier.predict(input_standardized)
# predict returns a length-1 array here; test its single element explicitly.
if prediction[0] == 0:
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[[ 0.04601433 -0.34096773  1.18359575 -1.28821221 -0.69289057  0.71168975
  -0.84827977 -0.27575966]]
The person is not diabetic




## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; verbose=1 makes the solver log its progress.
reg_model = LogisticRegression(verbose = 1)
# Fit on the training split.
reg_model.fit(X_train, Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(verbose=1)

In [None]:
# Accuracy of the logistic regression model on the rows it was trained on.
reg_predict = reg_model.predict(X_train)
reg_accuracy = accuracy_score(y_true=Y_train, y_pred=reg_predict)
print("The accuracy on the training data : ", reg_accuracy)

The accuracy on the training data :  0.7850162866449512


In [None]:
# Accuracy of the logistic regression model on the held-out test rows.
reg_predict_test = reg_model.predict(X_test)
reg_accuracy_test = accuracy_score(y_true=Y_test, y_pred=reg_predict_test)
# The score is computed on X_test / Y_test, so the message must say "testing".
print("The accuracy on the testing data : ", reg_accuracy_test)

The accuracy on the training data :  0.7597402597402597


In [None]:
# Predict the outcome for one hand-entered record with the fitted logistic regression.
# Values follow the feature column order: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
input_data = (4,110,92,0,0,37.6,0.191,30)
input_array = np.asarray(input_data)
input_reshape = input_array.reshape(1,-1)  # one sample -> 2-D array with a single row
# Apply the already-fitted scaler. The result gets its own name so the full
# standardized matrix held in `standard_data` is not overwritten by this one-row array.
input_standardized = scaler.transform(input_reshape)
print(input_standardized)
prediction = reg_model.predict(input_standardized)
# predict returns a length-1 array here; test its single element explicitly.
if prediction[0] == 0:
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[[ 0.04601433 -0.34096773  1.18359575 -1.28821221 -0.69289057  0.71168975
  -0.84827977 -0.27575966]]
The person is not diabetic




## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier with default settings.
gnb = GaussianNB()
# Fit on the training split.
gnb.fit(X_train, Y_train)

GaussianNB()

In [None]:
# Accuracy of the Naive Bayes model on the rows it was trained on.
gnb_predict = gnb.predict(X_train)
gnb_accuracy = accuracy_score(y_true=Y_train, y_pred=gnb_predict)
print("Accuracy on the training data : ", gnb_accuracy)

Accuracy on the training data :  0.755700325732899


In [None]:
# Accuracy of the Naive Bayes model on the held-out test rows.
gnb_predict_test = gnb.predict(X_test)
gnb_accuracy_test = accuracy_score(y_true=Y_test, y_pred=gnb_predict_test)
# The score is computed on X_test / Y_test, so the message must say "testing".
print("Accuracy on the testing data : ", gnb_accuracy_test)

Accuracy on the training data :  0.7727272727272727


In [None]:
# Predict the outcome for one hand-entered record with the fitted Naive Bayes model.
# Values follow the feature column order: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
input_data = (1,80,66,29,0,26.6,0.35,31)
input_array = np.asarray(input_data)
input_reshape = input_array.reshape(1,-1)  # one sample -> 2-D array with a single row
# Apply the already-fitted scaler. The result gets its own name so the full
# standardized matrix held in `standard_data` is not overwritten by this one-row array.
input_standardized = scaler.transform(input_reshape)
print(input_standardized)
prediction = gnb.predict(input_standardized)
# predict returns a length-1 array here; test its single element explicitly.
if prediction[0] == 0:
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[[-0.84488505 -1.27988209 -0.16054575  0.53090156 -0.69289057 -0.68442195
  -0.3680809  -0.19067191]]
The person is not diabetic




In [None]:
import pickle
import dill

In [None]:
# Persist the fitted Naive Bayes model's bound `predict` method. Pickling the bound
# method stores the estimator it belongs to, so the loaded object is directly callable.
# The context manager guarantees the file is flushed and closed before it is read back.
with open('model_diabetes.pkl', 'wb') as model_file:
    pickle.dump(gnb.predict, model_file)

In [None]:
# Reload the pickled `predict` callable. The path matches the relative path used when
# the file was written, so the round trip does not depend on the working directory
# being /content. Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from an untrusted file.
with open("model_diabetes.pkl", "rb") as model_file:
    diabetes_predict = pickle.load(model_file)

In [None]:
# Predict with the reloaded model. The model was trained on standardized features, so
# the raw record must pass through the same fitted scaler first; feeding raw values
# gives a prediction that contradicts the scaled prediction for this same record.
gnb_predict = diabetes_predict(scaler.transform([[1, 80, 66, 29, 0, 26.6, 0.35, 31]]))
print(gnb_predict)


[1]
