# Diabetes Prediction using Machine Learning

Diabetes is a group of metabolic disorders characterized by high blood sugar levels over a prolonged period. 
Symptoms of high blood sugar include frequent urination, increased thirst, and increased hunger. 
If left untreated, diabetes can cause many complications such as cardiovascular disease, stroke, kidney disease, 
foot ulcers, and eye damage.

This dataset is from the National Institute of Diabetes and Digestive and Kidney Diseases. 
The objective is to diagnostically predict whether or not a patient has diabetes based on certain diagnostic measurements.

**Objective**: Build a machine learning model to accurately predict whether or not patients have diabetes.

**Dataset Details:**
- Pregnancies
- Glucose
- BloodPressure
- SkinThickness
- Insulin
- BMI
- DiabetesPedigreeFunction
- Age
- Outcome (0 or 1)


In [None]:
# Import required libraries
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.simplefilter(action = "ignore") 


In [None]:
# Load the diabetes CSV into `df` and preview its first five rows
df = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv"
)
df.head(n=5)

In [None]:
# Shape, schema, summary statistics and class balance.
# A notebook cell only renders its LAST expression, so the summary
# statistics are printed explicitly instead of being silently discarded.
print(df.shape)
df.info()
print(df.describe().T)
df['Outcome'].value_counts()


In [None]:
# Histogram plots
# Stand-alone look at the age distribution
df['Age'].hist(edgecolor="black")
plt.show()

# One 20-bin histogram per feature, filling the 4x2 grid row by row
hist_columns = [
    "Age", "Pregnancies",
    "Glucose", "BloodPressure",
    "SkinThickness", "Insulin",
    "DiabetesPedigreeFunction", "BMI",
]
fig, ax = plt.subplots(4, 2, figsize=(16, 16))
for panel, column in zip(ax.flat, hist_columns):
    sns.histplot(df[column], bins=20, ax=panel)
plt.show()


In [None]:
# Correlation heatmap: pairwise correlations of all columns, annotated to 2 decimals
corr_matrix = df.corr()
f, ax = plt.subplots(figsize=[20, 15])
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="magma", ax=ax)
plt.show()

In [None]:
# Treat zeros in these measurement columns as missing readings (NaN).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.isnull().sum()

In [None]:
# Fill missing values with median by class
def median_target(var):
    """Return the median of column ``var`` for each ``Outcome`` class.

    Reads the module-level ``df``; rows where ``var`` is missing are
    excluded before grouping.

    Returns a DataFrame with the columns ``Outcome`` and ``var``, one row
    per Outcome value that has at least one observed ``var``.
    """
    observed = df.loc[df[var].notnull(), [var, 'Outcome']]
    class_medians = observed.groupby(['Outcome'])[[var]].median()
    return class_medians.reset_index()

# Fill each feature's missing values with the median of the row's own
# Outcome class. The medians are computed once per column and looked up by
# Outcome VALUE (not by row position), so the fill stays correct even if a
# class has no observed values for a column (those rows simply stay NaN).
columns = df.columns.drop("Outcome")
for col in columns:
    missing = df[col].isnull()
    if not missing.any():
        continue  # nothing to impute for this feature
    class_medians = median_target(col).set_index('Outcome')[col]
    df.loc[missing, col] = df.loc[missing, 'Outcome'].map(class_medians)

df.isnull().sum()


In [None]:
# Outlier detection using Local Outlier Factor
from sklearn.neighbors import LocalOutlierFactor

# NOTE(review): LOF is fit on the whole frame, including the Outcome column.
lof = LocalOutlierFactor(n_neighbors=10)
lof.fit(df)
df_scores = lof.negative_outlier_factor_  # lower (more negative) = more anomalous

# The 8th-lowest score is the cut-off; only rows scoring strictly above it
# are kept, i.e. the 8 most anomalous rows are dropped (barring ties).
threshold = np.sort(df_scores)[7]
inlier_mask = df_scores > threshold

# .copy() gives an independent frame, so the new columns added in later
# cells do not write into a slice of the original data.
df = df[inlier_mask].copy()
df.shape


In [None]:
# Feature engineering
# BMI category from right-closed bins: (0, 18.5], (18.5, 24.9], ..., (39.9, 100]
bmi_edges = [0, 18.5, 24.9, 29.9, 34.9, 39.9, 100]
bmi_names = ["Underweight", "Normal", "Overweight", "Obesity1", "Obesity2", "Obesity3"]
df['NewBMI'] = pd.cut(df['BMI'], bins=bmi_edges, labels=bmi_names)

def set_insulin(row):
    """Classify a record's insulin reading.

    ``row`` is any mapping-like object exposing an ``"Insulin"`` value.
    Returns ``"Normal"`` when the value lies in the closed range [16, 166],
    otherwise ``"Abnormal"``.
    """
    within_range = 16 <= row["Insulin"] <= 166
    return "Normal" if within_range else "Abnormal"
# Row-wise insulin flag ("Normal" / "Abnormal") from set_insulin
df['NewInsulinScore'] = df.apply(set_insulin, axis=1)

# Glucose bands from right-closed bins: (0, 70], (70, 99], (99, 126], (126, inf).
# The (99, 126] band was previously labelled "Overweight", which is a BMI
# term; it is now "Prediabetes". The open upper edge keeps readings above
# 200 in the "High" band instead of turning them into NaN.
df['NewGlucose'] = pd.cut(
    df['Glucose'],
    bins=[0, 70, 99, 126, np.inf],
    labels=["Low", "Normal", "Prediabetes", "High"],
)
df.head()


In [None]:
# One hot encoding categorical features, then robust scaling of the inputs
from sklearn.preprocessing import RobustScaler

df = pd.get_dummies(df, columns=["NewBMI", "NewInsulinScore", "NewGlucose"], drop_first=True)
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

transformer = RobustScaler().fit(X)
# Keep the original row index: rows were dropped earlier, so a fresh
# RangeIndex on X would no longer line up with the labels in y (or df).
X = pd.DataFrame(transformer.transform(X), columns=X.columns, index=X.index)


In [None]:
# Base models comparison: 10-fold cross-validated accuracy per classifier
models = [
    ('LR', LogisticRegression(random_state=12345)),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=12345)),
    ('RF', RandomForestClassifier(random_state=12345)),
    ('SVM', SVC(gamma='auto', random_state=12345)),
    # scikit-learn gradient boosting; it was mislabelled "XGB" (XGBoost) before
    ('GBM', GradientBoostingClassifier(random_state=12345)),
    ("LightGBM", LGBMClassifier(random_state=12345)),
]

results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")

# One box per model showing the spread of its fold accuracies
plt.figure(figsize=(15, 10))
plt.boxplot(results)
plt.xticks(range(1, len(names) + 1), names)
plt.title("Algorithm Comparison")
plt.show()


In [None]:
# NOTE: The Gaussian Process Classifier (sklearn.gaussian_process.GaussianProcessClassifier)
# and the fuzzy-logic classification (scikit-fuzzy) that were planned here
# are implemented in the cells below.


In [None]:

# --- Gaussian Process Classifiers: RBF vs Matern ---
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel as C, WhiteKernel
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, brier_score_loss
import numpy as np

# Uses the preprocessed X, y from earlier cells; bail out with a hint if they are missing.
try:
    X.shape
except NameError:
    print("X not found - please run previous cells to prepare X and y")
else:
    import pandas as pd
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.preprocessing import StandardScaler

    yg = y.values if hasattr(y, "values") else y

    # Split FIRST, then fit the selector and scaler on the training rows only,
    # so nothing about the held-out rows leaks into preprocessing.
    X_train_raw, X_test_raw, yg_train, yg_test = train_test_split(
        X, yg, test_size=0.2, stratify=yg, random_state=42
    )

    # Keep at most 10 features, ranked by ANOVA F-score against the label,
    # to limit the cost of fitting the GP kernels.
    selector = SelectKBest(f_classif, k=min(10, X.shape[1])).fit(X_train_raw, yg_train)
    scaler_gp = StandardScaler().fit(selector.transform(X_train_raw))
    Xg_train = scaler_gp.transform(selector.transform(X_train_raw))
    Xg_test = scaler_gp.transform(selector.transform(X_test_raw))

    kernels = {
        "RBF": C(1.0)*RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0),
        "Matern": C(1.0)*Matern(length_scale=1.0, nu=1.5) + WhiteKernel(noise_level=1.0)
    }

    # Fit one GPC per kernel and score it on the held-out split
    gp_results = {}
    for name, kernel in kernels.items():
        print(f"Training GPC with kernel: {name}")
        gpc = GaussianProcessClassifier(kernel=kernel, max_iter_predict=100)
        gpc.fit(Xg_train, yg_train)
        prob = gpc.predict_proba(Xg_test)[:, 1]
        pred = (prob >= 0.5).astype(int)
        auc = roc_auc_score(yg_test, prob)
        acc = accuracy_score(yg_test, pred)
        brier = brier_score_loss(yg_test, prob)
        print(f"{name} - AUC: {auc:.4f}, Acc: {acc:.4f}, Brier: {brier:.4f}")
        gp_results[name] = {"model": gpc, "auc": auc, "acc": acc, "brier": brier, "prob": prob, "pred": pred}

    # Compare to Logistic Regression baseline on same features
    lr = LogisticRegression(max_iter=1000)
    lr.fit(Xg_train, yg_train)
    prob_lr = lr.predict_proba(Xg_test)[:, 1]
    auc_lr = roc_auc_score(yg_test, prob_lr)
    acc_lr = accuracy_score(yg_test, (prob_lr >= 0.5).astype(int))
    print(f"LogisticRegression - AUC: {auc_lr:.4f}, Acc: {acc_lr:.4f}")

    # Summary table, best AUC first
    summary = pd.DataFrame([
        {"method": "LogisticRegression", "auc": auc_lr, "acc": acc_lr, "brier": brier_score_loss(yg_test, prob_lr)}
    ] + [{"method": k, "auc": v["auc"], "acc": v["acc"], "brier": v["brier"]} for k, v in gp_results.items()])
    display(summary.sort_values("auc", ascending=False))


In [None]:

# --- Gaussian Process Regression: Predict Glucose ---
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel as C, WhiteKernel
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Prepare regression dataset: predict Glucose from other clinical features
if 'df' not in globals():
    print("Dataframe df not found. Please run earlier cells.")
else:
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    reg_features = ['Pregnancies', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    # No NaNs are expected here (they should have been imputed earlier)
    Xr = df[reg_features].copy()
    yr = df['Glucose'].copy()
    Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)

    # Scaler is fit on the training rows only and reused for the test rows
    scaler_r = StandardScaler().fit(Xr_train)
    Xr_train_s = scaler_r.transform(Xr_train)
    Xr_test_s = scaler_r.transform(Xr_test)

    # One length scale per feature. The WhiteKernel term lets the optimizer
    # estimate observation noise; without it the GP (alpha=1e-6) is forced to
    # pass almost exactly through every training target.
    n_features = Xr_train_s.shape[1]
    kernels_r = {
        "RBF": C(1.0)*RBF(length_scale=np.ones(n_features)) + WhiteKernel(noise_level=1.0),
        "Matern": C(1.0)*Matern(length_scale=np.ones(n_features), nu=1.5) + WhiteKernel(noise_level=1.0)
    }

    gpr_results = {}
    for name, kernel in kernels_r.items():
        print(f"Training GPR with kernel: {name}")
        # alpha stays as a small diagonal jitter for numerical stability
        gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-6, n_restarts_optimizer=3, normalize_y=True, random_state=42)
        gpr.fit(Xr_train_s, yr_train)
        ypred, ystd = gpr.predict(Xr_test_s, return_std=True)
        rmse = np.sqrt(mean_squared_error(yr_test, ypred))
        r2 = r2_score(yr_test, ypred)
        print(f"{name} - RMSE: {rmse:.4f}, R2: {r2:.4f}")
        gpr_results[name] = {"model": gpr, "rmse": rmse, "r2": r2, "ypred": ypred, "ystd": ystd}

    # Plot predicted (with +/- 1 std error bars) vs true for the lowest-RMSE model
    best_name = min(gpr_results, key=lambda k: gpr_results[k]['rmse'])
    best = gpr_results[best_name]
    positions = np.arange(len(best['ypred']))
    plt.figure(figsize=(8, 5))
    plt.errorbar(positions, best['ypred'], yerr=best['ystd'], fmt='x', label='pred')
    plt.scatter(positions, yr_test.values, marker='o', facecolors='none', edgecolors='r', label='true')
    plt.legend(); plt.title(f'GPR predictions ({best_name})'); plt.show()


In [None]:

# --- Fuzzy Logic: Mamdani baseline and simple tuning ---
import numpy as np
import pandas as pd
import skfuzzy as fuzz
from skfuzzy import control as ctrl
from sklearn.metrics import roc_auc_score, accuracy_score

# Prepare inputs: use BMI and Age as core inputs (from df)
if 'df' not in globals():
    print("df not present. Run earlier cells.")
else:
    from scipy.stats import pearsonr

    bmi_vals = df['BMI'].values
    age_vals = df['Age'].values
    outcomes = df['Outcome'].values

    # Universes of discourse for the two inputs and the risk output
    bmi_univ = np.arange(10, 51, 1)
    age_univ = np.arange(10, 101, 1)
    risk_univ = np.arange(0, 101, 1)

    def build_mamdani(bmi_thresh_obese=30, age_thresh_senior=55):
        """Build a Mamdani simulation mapping (bmi, age) to a 0-100 risk score.

        Parameters
        ----------
        bmi_thresh_obese : BMI at which the 'obese' membership reaches 1.
        age_thresh_senior : age at which the 'senior' membership reaches 1.

        Returns
        -------
        A ``ctrl.ControlSystemSimulation`` with inputs 'bmi' and 'age' and
        output 'risk'.

        The membership functions cover their whole universe and the rule
        table has an entry for every BMI x Age combination, so any input
        inside the universes activates at least one rule. (With gaps or
        missing combinations ``compute()`` cannot defuzzify and fails.)
        """
        bmi = ctrl.Antecedent(bmi_univ, 'bmi')
        age = ctrl.Antecedent(age_univ, 'age')
        risk = ctrl.Consequent(risk_univ, 'risk')

        bmi_lo, bmi_hi = bmi_univ[0], bmi_univ[-1]
        age_lo, age_hi = age_univ[0], age_univ[-1]

        # BMI membership. The outer sets are shoulders (trapezoids that stay
        # at 1 out to the universe edge) so extreme values remain fully
        # 'under' / 'obese' instead of fading back to 0. 'over' ends where
        # 'obese' peaks, so the two always overlap while the threshold is tuned.
        bmi['under'] = fuzz.trapmf(bmi.universe, [bmi_lo, bmi_lo, 15, 18.5])
        bmi['normal'] = fuzz.trimf(bmi.universe, [17, 22.5, 25])
        bmi['over'] = fuzz.trimf(bmi.universe, [24, 27.5, max(bmi_thresh_obese, 27.5)])
        bmi['obese'] = fuzz.trapmf(bmi.universe, [bmi_thresh_obese - 2, bmi_thresh_obese, bmi_hi, bmi_hi])

        # Age membership, built the same way ('adult' ends where 'senior' peaks)
        age['young'] = fuzz.trapmf(age.universe, [age_lo, age_lo, 20, 30])
        age['adult'] = fuzz.trimf(age.universe, [25, 40, max(age_thresh_senior, 40)])
        age['senior'] = fuzz.trapmf(age.universe, [age_thresh_senior - 10, age_thresh_senior, age_hi, age_hi])

        # Risk membership
        risk['low'] = fuzz.trimf(risk.universe, [0, 10, 40])
        risk['medium'] = fuzz.trimf(risk.universe, [30, 50, 70])
        risk['high'] = fuzz.trimf(risk.universe, [60, 80, 100])

        # Complete rule table: (bmi term, age term) -> risk term.
        # Entries marked "added" fill combinations that previously had no
        # rule; they were chosen so risk never decreases with BMI or age.
        # NOTE(review): confirm the added entries with a domain expert.
        rule_table = {
            ('under', 'young'): 'low',
            ('under', 'adult'): 'low',        # added
            ('under', 'senior'): 'medium',    # added
            ('normal', 'young'): 'low',       # added
            ('normal', 'adult'): 'medium',
            ('normal', 'senior'): 'medium',   # added
            ('over', 'young'): 'medium',
            ('over', 'adult'): 'medium',      # added
            ('over', 'senior'): 'high',
            ('obese', 'young'): 'medium',     # added
            ('obese', 'adult'): 'high',
            ('obese', 'senior'): 'high',
        }
        rules = [
            ctrl.Rule(bmi[bmi_term] & age[age_term], risk[risk_term])
            for (bmi_term, age_term), risk_term in rule_table.items()
        ]

        system = ctrl.ControlSystem(rules)
        sim = ctrl.ControlSystemSimulation(system)
        return sim

    def _score_records(sim):
        """Return the risk score of every (BMI, Age) record as an array."""
        # Clip to the universes so out-of-range measurements are scored as
        # the nearest supported value rather than falling outside every set.
        bmi_in = np.clip(bmi_vals, bmi_univ[0], bmi_univ[-1])
        age_in = np.clip(age_vals, age_univ[0], age_univ[-1])
        scores = []
        for bi, ag in zip(bmi_in, age_in):
            sim.input['bmi'] = bi
            sim.input['age'] = ag
            sim.compute()
            scores.append(sim.output['risk'])
        return np.array(scores)

    def _risk_auc(scores):
        """Return the ROC AUC of risk/100 vs Outcome, or None if undefined."""
        try:
            return roc_auc_score(outcomes, scores / 100.0)
        except ValueError:
            # roc_auc_score rejects e.g. labels that contain a single class
            return None

    # Baseline Mamdani
    sim_base = build_mamdani()
    fuzz_scores = _score_records(sim_base)

    # Evaluate baseline by AUC of the continuous risk score (no threshold involved)
    auc_base = _risk_auc(fuzz_scores)
    print("Baseline Mamdani approx AUC (risk normalized):", auc_base)

    # Simple tuning: grid search over bmi_thresh_obese and age_thresh_senior to maximize AUC.
    # NOTE: tuning and evaluation use the same rows, so the tuned AUC is optimistic.
    best_auc = -1
    best_params = None
    # Fall back to the baseline if no grid point yields a valid AUC
    sim_tuned, tuned_scores = sim_base, fuzz_scores
    params_tested = []
    for bmi_t in range(28, 35):
        for age_t in range(50, 66, 5):
            sim_t = build_mamdani(bmi_thresh_obese=bmi_t, age_thresh_senior=age_t)
            scores = _score_records(sim_t)
            auc = _risk_auc(scores)
            if auc is None:
                auc = -1
            params_tested.append((bmi_t, age_t, auc))
            if auc > best_auc:
                best_auc = auc
                best_params = (bmi_t, age_t)
                # Keep the winning simulation and its scores; no need to recompute them
                sim_tuned, tuned_scores = sim_t, scores
    print("Best tuned Mamdani params:", best_params, "AUC:", best_auc)

    # Compare baseline and tuned by AUC and correlation with Outcome
    corr_base = pearsonr(fuzz_scores, outcomes)[0]
    corr_tuned = pearsonr(tuned_scores, outcomes)[0]
    print("Correlation base:", corr_base, "tuned:", corr_tuned)
    print("AUC base:", auc_base, "AUC tuned:", best_auc)



In [None]:

# --- Summary: Compare GP, GPR, and Fuzzy ---
print('GP Classifier results and comparison table should appear above.')
try:
    display(summary)
except NameError:
    # `summary` (GP cell not run) or `display` (not running under IPython) is unavailable
    pass

# '\n' (not '\\n') so a blank line is printed rather than a literal backslash-n
print('\nGPR regression results (Glucose) - review RMSE/R2 above.')
print('\nFuzzy logic: baseline and tuned AUC and correlations printed above.')

# Save tuned fuzzy scores and best GP probabilities to CSV for further reporting if available
try:
    df_out = df.copy()
    df_out['fuzzy_tuned'] = np.asarray(tuned_scores)
    # If the GP cell ran, add the probabilities of the GP with the best test AUC
    if 'gp_results' in globals() and gp_results:
        best_gp_name = max(gp_results, key=lambda k: gp_results[k]['auc'])
        # Scored on ALL rows, including the ones the GP was trained on, so these
        # probabilities are for reporting only and are not a held-out estimate.
        Xg_all = scaler_gp.transform(selector.transform(X))
        df_out['gp_prob_' + best_gp_name] = gp_results[best_gp_name]['model'].predict_proba(Xg_all)[:, 1]
    df_out.to_csv('diabetes_with_fuzzy_gp.csv', index=False)
    print('Saved diabetes_with_fuzzy_gp.csv')
except Exception as e:
    # Best-effort export: report the reason instead of aborting the notebook
    print('Could not save output CSV:', e)
