In [4]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

from tabulate import tabulate

import warnings
# Silence warnings through the standard filter API instead of replacing
# `warnings.warn` with a no-op. Monkey-patching the function disables the
# whole warnings machinery for every library in the kernel (including
# `warnings.catch_warnings(record=True)` and "error" filters); a filter keeps
# the notebook output quiet while remaining overridable.
warnings.filterwarnings("ignore")

In [5]:
# Read the diabetes dataset from the parent directory and preview the first
# five rows to sanity-check the parse.
df = pd.read_csv(filepath_or_buffer="../diabetes.csv")
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
# Count how many rows fall into each target class.
df["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

The classes are imbalanced (500 negative vs. 268 positive samples), so I am going to use a stratified shuffle split to keep the same class proportions in the train and test datasets.

But before that, I'll standardize all the features (zero mean, unit variance) so that they are on a consistent scale.

In [14]:
# Separate the feature columns from the target column.
X = df.drop('Outcome', axis=1)
y = df.Outcome

# Standardize every feature with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split is made, so test-set statistics influence the scaling. Consider
# fitting on the training rows only — confirm whether this matters for the
# reported numbers.
scaler = StandardScaler()

# Keep the original column names and index: wrapping the bare ndarray in a
# DataFrame would otherwise relabel the features 0..N-1 and discard the index.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [15]:
# One stratified shuffle split holding out 30% of the rows for testing;
# the fixed random_state makes the split repeatable.
strat_shuf_split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=.3,
    random_state=42,
)
# The splitter yields (train indices, test indices) pairs; take the only one.
train_idx, test_idx = next(strat_shuf_split.split(X=X_scaled, y=y))

In [17]:
# Select the train/test rows positionally using the indices from the split.
X_train, X_test = X_scaled.iloc[train_idx], X_scaled.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [24]:
# Class proportions in the training target, to check the split kept the class balance.
y_train.value_counts(normalize=True)

Outcome
0    0.651769
1    0.348231
Name: proportion, dtype: float64

In [25]:
# Class proportions in the test target, for comparison with the training proportions.
y_test.value_counts(normalize=True)

Outcome
0    0.649351
1    0.350649
Name: proportion, dtype: float64

In [26]:
# Fit a Gaussian Naive Bayes classifier on the training rows and score it on
# the held-out test rows.
nb = GaussianNB()

nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# accuracy_score is the fraction of correctly classified samples; it is not
# an R2 (coefficient of determination), so label it accordingly.
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score}")

R2 score: 0.7445887445887446


In [28]:
# Evaluate Gaussian Naive Bayes across test fractions of 10%, 20%, ..., 90%.
# Percentages are kept as integers so the train/test labels are exact:
# deriving them from floats (e.g. ceil((1 - 0.7) * 100)) rounds
# 30.000000000000004 up to 31.
test_percentages = range(10, 100, 10)
splits = [pct / 100 for pct in test_percentages]
scores = []   # full rows: summary + classification report + confusion matrix
results = []  # summary rows only (written to diabetes_result.csv later on)
nb = GaussianNB()

for i, (test_pct, split) in enumerate(zip(test_percentages, splits), start=1):
    strat_shuf_split = StratifiedShuffleSplit(n_splits=1, test_size=split, random_state=24)
    train_idx, test_idx = next(strat_shuf_split.split(X_scaled, y))

    # Index the standardized features, i.e. the same frame the split indices
    # were generated from.
    X_train = X_scaled.iloc[train_idx]
    y_train = y.iloc[train_idx]

    X_test = X_scaled.iloc[test_idx]
    y_test = y.iloc[test_idx]

    # fit() re-estimates the model from scratch, so reusing `nb` is safe.
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    # Labels keep the "90.0%" float-style format.
    train_label = f"{float(100 - test_pct)}%"
    test_label = f"{float(test_pct)}%"
    summary_row = [i, train_label, test_label, score]

    results.append(summary_row)
    scores.append(summary_row + [cr, cm])

headers = ['SNo.', 'Train %', 'Test %', 'Accuracy', 'Report', 'Confusion Matrix']
new_headers = ['SNo.', 'Train %', 'Test %', 'Accuracy']
print(tabulate(scores, headers=headers, tablefmt='grid'))

+--------+-----------+----------+------------+-------------------------------------------------------+--------------------+
|   SNo. | Train %   | Test %   |   Accuracy | Report                                                | Confusion Matrix   |
|      1 | 90.0%     | 10.0%    |   0.753247 | precision    recall  f1-score   support               | [[43  7]           |
|        |           |          |            |                                                       |  [12 15]]          |
|        |           |          |            |            0       0.78      0.86      0.82        50 |                    |
|        |           |          |            |            1       0.68      0.56      0.61        27 |                    |
|        |           |          |            |                                                       |                    |
|        |           |          |            |     accuracy                           0.75        77 |                    |
|       

In [30]:
import csv

# Write the compact accuracy table to disk: the header row first, followed by
# one row per evaluated split. newline='' lets the csv module control line
# endings itself.
with open("diabetes_result.csv", 'w', newline='') as out_file:
    csv.writer(out_file).writerows([new_headers, *results])