In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [2]:
diabetes = pd.read_csv("diabetesdata.csv",header = None)
diabetes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
diabetes.columns = ["times_pregnant","glucose_concentration","Diastolic_blood_pressure",
                   "Triceps_skin_fold_thickness","serum_insulin","BMI","Diabetes_pedigree_function","Age","Diabetes"]

In [4]:
diabetes.head()

Unnamed: 0,times_pregnant,glucose_concentration,Diastolic_blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age,Diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
print((diabetes[["glucose_concentration","Diastolic_blood_pressure","Triceps_skin_fold_thickness","serum_insulin","BMI"]] == 0).sum())

glucose_concentration            5
Diastolic_blood_pressure        35
Triceps_skin_fold_thickness    227
serum_insulin                  374
BMI                             11
dtype: int64


In [6]:
# Replacing 0 with Nan
diabetes[["glucose_concentration","Diastolic_blood_pressure","Triceps_skin_fold_thickness","serum_insulin","BMI"]] = diabetes[["glucose_concentration","Diastolic_blood_pressure","Triceps_skin_fold_thickness","serum_insulin","BMI"]].replace(0,np.NaN)

In [7]:
print(diabetes.isnull().sum())

times_pregnant                   0
glucose_concentration            5
Diastolic_blood_pressure        35
Triceps_skin_fold_thickness    227
serum_insulin                  374
BMI                             11
Diabetes_pedigree_function       0
Age                              0
Diabetes                         0
dtype: int64


In [8]:
X = diabetes.iloc[:,:8]
y  = diabetes.iloc[:,8]

In [9]:
# create pipeline for performing multiple transformations
pipeline = Pipeline([
('imputer', IterativeImputer()),
('scaler', StandardScaler())])

In [10]:
X_transformed = pipeline.fit_transform(X)
X_transformed

array([[ 0.63994726,  0.86571155, -0.02903387, ...,  0.16831005,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.20347832, -0.52329612, ..., -0.85006189,
        -0.36506078, -0.19067191],
       [ 1.23388019,  2.01526148, -0.6880502 , ..., -1.33015152,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 , -0.02108411, -0.02903387, ..., -0.90825457,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.14313731, -1.01755837, ..., -0.34087592,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.94072405, -0.19378795, ..., -0.29723141,
        -0.47378505, -0.87137393]])

In [11]:
new_dataset = pd.DataFrame(X_transformed, columns= ["times_pregnant","glucose_concentration","Diastolic_blood_pressure",
                   "Triceps_skin_fold_thickness","serum_insulin","BMI","Diabetes_pedigree_function","Age"])
new_dataset

Unnamed: 0,times_pregnant,glucose_concentration,Diastolic_blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age
0,0.639947,0.865712,-0.029034,0.640318,0.681265,0.168310,0.468492,1.425995
1,-0.844885,-1.203478,-0.523296,0.009785,-0.846509,-0.850062,-0.365061,-0.190672
2,1.233880,2.015261,-0.688050,-0.774954,1.190927,-1.330152,0.604397,-0.105584
3,-0.844885,-1.072101,-0.523296,-0.620749,-0.602910,-0.631839,-0.920763,-1.041549
4,-1.141852,0.504424,-2.665099,0.640318,0.157791,1.550386,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.677970,0.300474,2.006473,0.281147,0.066473,-0.908682,2.532136
764,-0.547919,0.011760,-0.193788,-0.200393,0.062477,0.633852,-0.398282,-0.531023
765,0.342981,-0.021084,-0.029034,-0.620749,-0.417875,-0.908255,-0.685193,-0.275760
766,-0.844885,0.143137,-1.017558,-0.107763,0.214390,-0.340876,-0.371101,1.170732


In [None]:
new_dataset.describe()

Unnamed: 0,times_pregnant,glucose_concentration,Diastolic_blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,2.5442610000000002e-17,1.272131e-16,-2.142022e-16,4.367889e-16,2.977942e-17,2.312965e-17,2.398978e-16,1.8576e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-2.550094,-3.983132,-2.302171,-1.769736,-2.072108,-1.189553,-1.041549
25%,-0.8448851,-0.7436583,-0.6880502,-0.7131382,-0.6441946,-0.7191284,-0.6889685,-0.7862862
50%,-0.2509521,-0.1524612,-0.02903387,-0.04831913,-0.2309346,-0.04263843,-0.3001282,-0.3608474
75%,0.6399473,0.6111684,0.6299825,0.6403179,0.3839448,0.6047552,0.4662269,0.6602056
max,3.906578,2.54077,4.089818,7.366006,7.127453,5.041947,5.883565,4.063716


In [12]:
# perform train test split
X_train,X_test,y_train,y_test= train_test_split(new_dataset,y,test_size=0.2,stratify=y,random_state = 88)

In [13]:
X_train.shape,y_train.shape, X_test.shape,y_test.shape

((614, 8), (614,), (154, 8), (154,))

## Creating a Naive Bayes Model

In [14]:
# create an instance for GaussianNB
model=GaussianNB()

In [15]:
# train the model on train dataset
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
model.score(X_train,y_train)    # train data accuracy score

0.752442996742671

In [17]:
# perform predictions on the test dataset
pred_y=model.predict(X_test)
pred_y

array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0])

In [18]:
confusion_matrix(y_test, pred_y)

array([[78, 22],
       [14, 40]])

In [19]:
accuracy_score(y_test, pred_y)

0.7662337662337663