In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Data Exploration:

a. Load and explore the medical dataset using Python libraries like pandas. Describe the features, labels, and the distribution of diagnoses.


In [37]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data Preprocessing:

a. Explain the necessary data preprocessing steps for preparing the medical data. This may include handling missing values, normalizing or scaling features, and encoding categorical variables.

In [38]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [39]:
no_zeroes = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in no_zeroes:
    mean_value = df[column][df[column] != 0].mean()
    df[column] = df[column].replace(0, mean_value)
    
print(df)

     Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \
0              6    148.0           72.0       35.00000  155.548223  33.6   
1              1     85.0           66.0       29.00000  155.548223  26.6   
2              8    183.0           64.0       29.15342  155.548223  23.3   
3              1     89.0           66.0       23.00000   94.000000  28.1   
4              0    137.0           40.0       35.00000  168.000000  43.1   
..           ...      ...            ...            ...         ...   ...   
763           10    101.0           76.0       48.00000  180.000000  32.9   
764            2    122.0           70.0       27.00000  155.548223  36.8   
765            5    121.0           72.0       23.00000  112.000000  26.2   
766            1    126.0           60.0       29.15342  155.548223  30.1   
767            1     93.0           70.0       31.00000  155.548223  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627

In [40]:
x = df.drop(columns=["Outcome"])
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.00000,155.548223,33.6,0.627,50
1,1,85.0,66.0,29.00000,155.548223,26.6,0.351,31
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32
3,1,89.0,66.0,23.00000,94.000000,28.1,0.167,21
4,0,137.0,40.0,35.00000,168.000000,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.00000,180.000000,32.9,0.171,63
764,2,122.0,70.0,27.00000,155.548223,36.8,0.340,27
765,5,121.0,72.0,23.00000,112.000000,26.2,0.245,30
766,1,126.0,60.0,29.15342,155.548223,30.1,0.349,47


In [41]:
y = df[["Outcome"]]
y

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [42]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Model Building

In [43]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [44]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=0)
print("******************Training Data Shape********************")
print("Training data- X- Shape:\t",xtrain.shape)
print()
print("Training data- y- Shape:\t",ytrain.shape)
print()
print("******************Testing Data Shape********************")
print("Training data- (X-input) Shape:\t",xtest.shape)
print()
print("Training data- (y-outcome) Shape:\t",ytest.shape)



******************Training Data Shape********************
Training data- X- Shape:	 (614, 8)

Training data- y- Shape:	 (614, 1)

******************Testing Data Shape********************
Training data- (X-input) Shape:	 (154, 8)

Training data- (y-outcome) Shape:	 (154, 1)


In [45]:
gnb = GaussianNB()
print("****************Naive Bayes GaussianNB Model*******************")
print("Training Phase\n")
gnb.fit(xtrain,ytrain)
print("Training is completed")
print()
print("Testing Phase\n")
ypred = gnb.predict(xtest)
print('Predicted class labels are:\n',ypred)
print()
print("Test is also done")
print()


****************Naive Bayes GaussianNB Model*******************
Training Phase

Training is completed

Testing Phase

Predicted class labels are:
 [1 0 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1
 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0
 0 1 1 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0]

Test is also done



  y = column_or_1d(y, warn=True)


# Performance Measure

In [46]:
print("Accuracy Score:\t",accuracy_score(ytest,ypred))
print()
print("Confusion Matrix:\n",confusion_matrix(ytest,ypred))
print()
print("Classification Report:\n",classification_report(ytest,ypred))

Accuracy Score:	 0.7857142857142857

Confusion Matrix:
 [[92 15]
 [18 29]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85       107
           1       0.66      0.62      0.64        47

    accuracy                           0.79       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.78      0.79      0.78       154



# Scaling the data

In [47]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc_x_train = sc.fit_transform(xtrain)
sc_x_test = sc.transform(xtest)

In [48]:
sc_x_train

array([[ 0.90832902,  0.93627156,  0.44607305, ...,  0.36780137,
         0.67740401,  1.69955804],
       [ 0.03644676, -0.81645845, -1.05366073, ..., -0.63382702,
        -0.07049698, -0.96569189],
       [-1.12606292,  1.43232723,  1.44589558, ...,  2.81463643,
        -0.11855487, -0.88240283],
       ...,
       [ 0.03644676, -0.91566959, -0.63706802, ..., -1.13464121,
        -0.95656442, -1.04898095],
       [ 2.0708387 , -1.21330299,  0.11279888, ..., -0.36195646,
        -0.50001442,  0.11706589],
       [ 0.32707418,  0.47328628,  0.77934723, ..., -0.02462752,
         0.52121586,  2.94889395]])

In [49]:
sc_x_test

array([[-0.8354355 ,  2.55672007,  0.27943597, ...,  1.46959259,
         2.78594417, -0.96569189],
       [-0.54480808, -0.48575468,  0.11279888, ...,  0.13885774,
        -0.1876381 , -0.88240283],
       [ 0.03644676, -1.51093639, -0.88702365, ...,  0.19609364,
        -0.22668514, -0.71582471],
       ...,
       [ 0.03644676,  0.67170854,  1.1126214 , ...,  1.62699134,
         0.53623395, -0.96569189],
       [-0.25418066, -0.18812128,  0.11279888, ..., -0.90569758,
        -1.07971278, -0.79911377],
       [-0.8354355 , -0.48575468, -0.05383821, ..., -0.26179362,
         1.06487079, -0.79911377]])

# Building the model after scaling

In [50]:
gnb_sc = GaussianNB()
print("****************Naive Bayes GaussianNB Model*******************")
print("Training Phase\n")
gnb_sc.fit(sc_x_train,ytrain)
print("Training is completed")
print()
print("Testing Phase\n")
ypred_sc = gnb_sc.predict(sc_x_test)
print('Predicted class labels are:\n',ypred_sc)
print()
print("Test is also done")
print()

****************Naive Bayes GaussianNB Model*******************
Training Phase

Training is completed

Testing Phase

Predicted class labels are:
 [1 0 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1
 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0
 0 1 1 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0]

Test is also done



  y = column_or_1d(y, warn=True)


# Performance Metrics

In [51]:
print("Accuracy Score:\t",accuracy_score(ytest,ypred_sc))
print()
print("Confusion Matrix:\n",confusion_matrix(ytest,ypred_sc))
print()
print("Classification Report:\n",classification_report(ytest,ypred_sc))

Accuracy Score:	 0.7857142857142857

Confusion Matrix:
 [[92 15]
 [18 29]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85       107
           1       0.66      0.62      0.64        47

    accuracy                           0.79       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.78      0.79      0.78       154

