In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Location of the diabetes CSV. This is an absolute path, so it must be
# adjusted when the notebook runs somewhere the file lives elsewhere.
DATA_PATH = '/content/diabetes.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Print the plain-text representation of the loaded frame as a quick check
# of its columns and row count.
print(data)

     Glucose  Insulin   BMI  Age  Outcome
0        148        0  33.6   50        1
1         85        0  26.6   31        0
2        183        0  23.3   32        1
3         89       94  28.1   21        0
4        137      168  43.1   33        1
..       ...      ...   ...  ...      ...
763      101      180  32.9   63        0
764      122        0  36.8   27        0
765      121      112  26.2   30        0
766      126        0  30.1   47        1
767       93        0  30.4   23        0

[768 rows x 5 columns]


In [None]:
# Per-column summary statistics for the whole dataset (features and Outcome).
data.describe()

Unnamed: 0,Glucose,Insulin,BMI,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0
mean,120.894531,79.799479,31.992578,33.240885,0.348958
std,31.972618,115.244002,7.88416,11.760232,0.476951
min,0.0,0.0,0.0,21.0,0.0
25%,99.0,0.0,27.3,24.0,0.0
50%,117.0,30.5,32.0,29.0,0.0
75%,140.25,127.25,36.6,41.0,1.0
max,199.0,846.0,67.1,81.0,1.0


In [None]:
# Separate the label column from the predictors: Y_data is the 'Outcome'
# series, X_data is every remaining column.
Y_data = data['Outcome']
X_data = data.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Wrap the scaled arrays back into DataFrames. The scaler returns plain NumPy
# arrays, so both the column names and the original row labels are reattached
# here; keeping the index means these frames share row labels with
# X_train / y_train and X_test / y_test instead of a fresh 0..n-1 range.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled,
    index=X_test.index,
    columns=X_test.columns,
)


In [None]:
# Summary statistics of the unscaled training features.
X_train.describe()

Unnamed: 0,Glucose,Insulin,BMI,Age
count,614.0,614.0,614.0,614.0
mean,120.855049,81.438111,31.983388,32.907166
std,32.035057,116.234835,7.740625,11.503437
min,0.0,0.0,0.0,21.0
25%,100.0,0.0,27.1,24.0
50%,117.0,42.5,32.0,29.0
75%,139.0,129.75,36.375,40.0
max,199.0,846.0,67.1,81.0


In [None]:
# Summary statistics of the training features after scaling; the scaler was
# fitted on this same split.
X_train_scaled_df.describe()

Unnamed: 0,Glucose,Insulin,BMI,Age
count,614.0,614.0,614.0,614.0
mean,-2.1698170000000003e-17,-2.6037800000000002e-17,-5.699386e-16,-3.7610160000000004e-17
std,1.000815,1.000815,1.000815,1.000815
min,-3.775663,-0.7012055,-4.135256,-1.03594
25%,-0.6515379,-0.7012055,-0.6313921,-0.7749361
50%,-0.1204366,-0.3352683,0.002147879,-0.339929
75%,0.566871,0.4159792,0.5678085,0.6170866
max,2.441346,6.583098,4.540363,4.184145


In [None]:
# Summary statistics of the unscaled test features.
X_test.describe()

Unnamed: 0,Glucose,Insulin,BMI,Age
count,154.0,154.0,154.0,154.0
mean,121.051948,73.266234,32.029221,34.571429
std,31.825873,111.334275,8.458727,12.685155
min,62.0,0.0,0.0,21.0
25%,96.0,0.0,28.225,24.0
50%,112.0,0.0,32.85,29.5
75%,146.0,125.75,36.975,42.0
max,197.0,579.0,53.2,67.0


In [None]:
# Summary statistics of the test features after applying the scaler that was
# fitted on the training split (so mean/std need not be exactly 0/1 here).
X_test_scaled_df.describe()

Unnamed: 0,Glucose,Insulin,BMI,Age
count,154.0,154.0,154.0,154.0
mean,0.006151,-0.070362,0.005926,0.144793
std,0.99428,0.95862,1.093661,1.103627
min,-1.838706,-0.701206,-4.135256,-1.03594
25%,-0.776503,-0.701206,-0.485936,-0.774936
50%,-0.276643,-0.701206,0.112048,-0.296428
75%,0.78556,0.381538,0.645385,0.791089
max,2.378864,4.284151,2.743178,2.966125


In [None]:
# Fit Gaussian Naive Bayes on the scaled training split (fit returns the
# estimator itself), then predict labels for the scaled test split.
naive_bayes_model = GaussianNB().fit(X_train_scaled, y_train)
y_pred_nb = naive_bayes_model.predict(X_test_scaled)

In [None]:
# Score the test-set predictions with each metric in turn; every metric
# function takes (true labels, predicted labels) in that order.
accuracy, precision, recall, f1 = (
    metric_fn(y_test, y_pred_nb)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the four hold-out metrics, one per line.
print(
    f"Accuracy of Naive Bayes Model is: {accuracy}",
    f"Precision of Naive Bayes Model is: {precision}",
    f"Recall of Naive Bayes Model is: {recall}",
    f"F1 Score of Naive Bayes Model is: {f1}",
    sep="\n",
)

Accuracy of Naive Bayes Model is: 0.7467532467532467
Precision of Naive Bayes Model is: 0.6481481481481481
Recall of Naive Bayes Model is: 0.6363636363636364
F1 Score of Naive Bayes Model is: 0.6422018348623854


In [None]:
# Persist the fitted classifier to disk; the context manager closes the
# file even if serialization fails.
with open('naive_bayes_model.pkl', mode='wb') as model_file:
    pickle.dump(naive_bayes_model, model_file)

In [None]:
# Per-feature means the scaler learned when it was fitted on X_train.
scaler.mean_

array([120.85504886,  81.43811075,  31.98338762,  32.90716612])

In [None]:
# Per-feature scaling factors the scaler learned when it was fitted on X_train.
scaler.scale_

array([ 32.00895893, 116.14014299,   7.73431907,  11.49406506])

In [None]:
# Persist the fitted scaler alongside the model so the same transform can be
# applied to new inputs later.
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Display the held-out feature rows (unscaled values).
X_test

Unnamed: 0,Glucose,Insulin,BMI,Age
668,98,190,34.0,43
324,112,0,35.7,21
624,108,0,30.8,21
690,107,0,24.6,34
473,136,0,29.9,50
...,...,...,...,...
355,165,0,30.4,49
534,77,56,33.3,24
344,95,0,36.8,57
296,146,360,28.0,29


In [None]:
# Display the true labels of the held-out rows.
y_test


Unnamed: 0,Outcome
668,0
324,0
624,0
690,0
473,0
...,...
355,1
534,0
344,0
296,1


In [None]:
# Pair each held-out label with the model's prediction. y_test supplies the
# frame's index, and the prediction array is matched to it by position.
results = pd.DataFrame({'True Labels': y_test, 'Predictions': y_pred_nb})
results['Correct'] = results['True Labels'] == results['Predictions']

# 'Correct' is already boolean, so it serves directly as the row mask.
correct_predictions = results[results['Correct']]
incorrect_predictions = results[~results['Correct']]

# Show every row, but lift the row limit only inside this block so the
# global pandas display settings are unchanged for the rest of the notebook.
with pd.option_context('display.max_rows', None):
    print("Correct Predictions:")
    print(correct_predictions)

    print("\nIncorrect Predictions:")
    print(incorrect_predictions)

# Counts of correct and incorrect predictions, in that order.
print(len(correct_predictions))
print(len(incorrect_predictions))

Correct Predictions:
     True Labels  Predictions  Correct
668            0            0     True
324            0            0     True
624            0            0     True
690            0            0     True
473            0            0     True
97             0            0     True
336            0            0     True
199            1            1     True
265            0            0     True
760            0            0     True
501            0            0     True
457            0            0     True
604            1            1     True
636            0            0     True
544            0            0     True
86             0            0     True
208            0            0     True
281            0            0     True
209            1            1     True
581            0            0     True
639            0            0     True
431            0            0     True
120            1            1     True
363            1            1     True
425 

In [None]:
# Split the full dataset by class label and report the size of each class
# (label 0 first, then label 1).
zero_label = data.loc[data['Outcome'].eq(0)]
one_label = data.loc[data['Outcome'].eq(1)]
print(len(zero_label), len(one_label), sep='\n')

500
268


K-fold Cross-Validation

In [None]:
# Five folds, shuffled before splitting; the fixed seed keeps the fold
# assignment reproducible.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate the same preprocessing + classifier combination that was
# used for the hold-out evaluation. Putting the scaler and the classifier in
# one pipeline means the scaler is re-fitted on the training part of every
# fold, so no statistics from a validation fold reach training.
# cross_val_score fits clones of the estimator it is given, so the
# already-fitted naive_bayes_model is not modified.
cv_pipeline = make_pipeline(StandardScaler(), naive_bayes_model)

# One accuracy value per fold; pass a different `scoring` name to evaluate
# another metric.
cv_scores = cross_val_score(cv_pipeline, X_data, Y_data, cv=kfold, scoring='accuracy')

In [None]:
print("Cross-validation scores:", cv_scores)
print("Average accuracy:", cv_scores.mean())

Cross-validation scores: [0.74675325 0.74025974 0.74025974 0.81045752 0.73202614]
Average accuracy: 0.7539512774806892
