# Diabetes Prediction using Machine Learning



In [None]:
# Import required libraries
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.simplefilter(action = "ignore") 


In [None]:
# Load the diabetes CSV into a DataFrame and preview its first rows
df = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv",
)
df.head(n=5)

In [None]:
# Dataset overview: dimensions, column dtypes / non-null counts, summary
# statistics, and the class balance of the target column.
print(df.shape)
df.info()
# Only the last expression of a cell is echoed automatically, so the summary
# table must be printed explicitly or its result is silently discarded.
print(df.describe().T)
df['Outcome'].value_counts()


In [None]:
# Histogram plots
# Age on its own first, drawn with the pandas plotting helper.
df['Age'].hist(edgecolor="black")
plt.show()

# Then one 20-bin histogram per feature on a 4x2 grid, filled row by row.
hist_columns = [
    'Age', 'Pregnancies',
    'Glucose', 'BloodPressure',
    'SkinThickness', 'Insulin',
    'DiabetesPedigreeFunction', 'BMI',
]
fig, ax = plt.subplots(4, 2, figsize=(16, 16))
for panel, column in zip(ax.flat, hist_columns):
    sns.histplot(df[column], bins=20, ax=panel)
plt.show()


In [None]:
# Correlation heatmap
# Pairwise correlations of every column, annotated to two decimals.
corr_matrix = df.corr()
f, ax = plt.subplots(figsize=[20, 15])
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="magma",
    ax=ax,
)
plt.show()

In [None]:
# Replace 0 with NaN in some columns
# Zeros in these columns are treated as missing values so that they show up in
# the null counts and can be imputed afterwards.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.isnull().sum()

In [None]:
# Fill missing values with median by class
def median_target(var, data=None):
    """Return the per-class median of one column.

    Parameters
    ----------
    var : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame holding ``var`` and an ``'Outcome'`` column. Defaults to the
        module-level ``df`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per ``Outcome`` value with the columns ``'Outcome'`` and
        ``var``. Rows where ``var`` is null are excluded before the median
        is taken.
    """
    frame = df if data is None else data
    observed = frame[frame[var].notnull()]
    per_class = observed[[var, 'Outcome']].groupby(['Outcome'])[[var]].median()
    return per_class.reset_index()

# Impute each feature's missing values with the median of the rows that share
# the same Outcome class.
columns = df.columns.drop("Outcome")
for i in columns:
    # Compute the class medians once per column (instead of once per class)
    # and look them up by class label rather than by row position.
    class_medians = median_target(i).set_index('Outcome')[i]
    for outcome_class in (0, 1):
        needs_fill = (df['Outcome'] == outcome_class) & df[i].isnull()
        df.loc[needs_fill, i] = class_medians.loc[outcome_class]

df.isnull().sum()


In [None]:
# Outlier detection using Local Outlier Factor
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=10)
lof.fit_predict(df)
df_scores = lof.negative_outlier_factor_
# The 8th-lowest score is the cut-off: rows scoring at or below it are dropped.
threshold = np.sort(df_scores)[7]
# NOTE: despite its name, this mask is True for the rows that are KEPT.
outlier = df_scores > threshold
# .copy() makes the filtered frame independent of the original, so the new
# columns added to it afterwards are not written onto a slice.
df = df[outlier].copy()
df.shape


In [None]:
# Feature engineering
# Bucket BMI into six labelled intervals.
bmi_edges = [0, 18.5, 24.9, 29.9, 34.9, 39.9, 100]
bmi_names = ["Underweight", "Normal", "Overweight", "Obesity1", "Obesity2", "Obesity3"]
df['NewBMI'] = pd.cut(df['BMI'], bins=bmi_edges, labels=bmi_names)

def set_insulin(row):
    """Label a record's insulin reading as "Normal" or "Abnormal".

    ``row`` is any record that supports ``row["Insulin"]``. Readings from 16
    to 166 inclusive are "Normal"; everything else (including NaN, which
    fails both comparisons) is "Abnormal".
    """
    reading = row["Insulin"]
    in_range = 16 <= reading and reading <= 166
    return "Normal" if in_range else "Abnormal"
# Label every row's insulin reading via set_insulin.
df['NewInsulinScore'] = df.apply(set_insulin, axis="columns")

# Bucket glucose into four labelled intervals.
# NOTE(review): "Overweight" is an odd name for a glucose band and looks copied
# from the BMI labels; confirm before renaming, since the label ends up in a
# one-hot column name.
glucose_edges = [0, 70, 99, 126, 200]
glucose_names = ["Low", "Normal", "Overweight", "High"]
df['NewGlucose'] = pd.cut(df['Glucose'], bins=glucose_edges, labels=glucose_names)
df.head()


In [None]:
# One hot encoding categorical features
df = pd.get_dummies(df, columns=["NewBMI","NewInsulinScore","NewGlucose"], drop_first=True)
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

# Scale the features with RobustScaler.
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X)
# Keep the original row labels. Rows were filtered out earlier, so without
# index=X.index the rebuilt frame would get a fresh 0..n-1 index that no longer
# lines up with y for any label-based operation.
X = pd.DataFrame(transformer.transform(X), columns=X.columns, index=X.index)


In [None]:
# Base models comparison
# Each entry is (short label, unfitted estimator). The gradient-boosting entry
# is scikit-learn's GradientBoostingClassifier, so it is labelled 'GBM'; the
# earlier 'XGB' tag wrongly implied XGBoost.
models = [
    ('LR', LogisticRegression(random_state=12345)),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=12345)),
    ('RF', RandomForestClassifier(random_state=12345)),
    ('SVM', SVC(gamma='auto', random_state=12345)),
    ('GBM', GradientBoostingClassifier(random_state=12345)),
    ("LightGBM", LGBMClassifier(random_state=12345)),
]

# 10-fold cross-validated accuracy per model; report mean and std per model.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")

# One box per model showing the spread of its fold accuracies.
plt.figure(figsize=(15,10))
plt.boxplot(results)
plt.xticks(range(1,len(names)+1), names)
plt.title("Algorithm Comparison")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.pipeline import make_pipeline

X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Standardised copy of the full feature matrix. It is no longer used for the
# cross-validation below, because fitting the scaler on all rows lets the
# held-out folds influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Baseline RF
# Scaling is done inside a pipeline so the scaler is refit on the training
# part of every fold. cross_val_score clones the pipeline, so `rf` itself
# stays unfitted.
rf = RandomForestClassifier(random_state=42)
rf_scores = cross_val_score(make_pipeline(StandardScaler(), rf), X, y, cv=5, scoring='accuracy')
print('RF Accuracy:', rf_scores.mean())

# Gaussian Process Classifier with RBF kernel
gpc = GaussianProcessClassifier(1.0*RBF(), random_state=42)
gpc_scores = cross_val_score(make_pipeline(StandardScaler(), gpc), X, y, cv=5, scoring='accuracy')
print('GPC Accuracy:', gpc_scores.mean())

In [None]:
#Task 2: Fuzzy Logic Controller
!pip install scikit-fuzzy deap

In [None]:
import skfuzzy as fuzz
import numpy as np
import matplotlib.pyplot as plt

# Universe of discourse for each variable (integer grids)
temp = np.arange(15, 31)
light = np.arange(101)

# Triangular membership functions over the temperature universe
temp_cold = fuzz.trimf(temp, [15, 15, 22])
temp_warm = fuzz.trimf(temp, [20, 25, 30])
temp_hot = fuzz.trimf(temp, [25, 30, 30])

# Draw the three membership curves on one set of axes.
for membership, curve_label in (
    (temp_cold, 'Cold'),
    (temp_warm, 'Warm'),
    (temp_hot, 'Hot'),
):
    plt.plot(temp, membership, label=curve_label)
plt.legend();

In [None]:
# Genetic Algorithm Optimization Example
from deap import base, creator, tools, algorithms
import random

# Register the two DEAP types used by the example GA:
#   - FitnessMin: a single-objective fitness whose weight of -1.0 means
#     "minimise"
#   - Individual: a plain list carrying a FitnessMin fitness attribute
minimise_weights = (-1.0,)
creator.create('FitnessMin', base.Fitness, weights=minimise_weights)
creator.create('Individual', list, fitness=creator.FitnessMin)

# Example problem: minimize sum of squares of 5 variables
def eval_func(ind):
    """Return the sum of squares of ``ind`` as a one-element tuple.

    The value is wrapped in a tuple (note the trailing comma) rather than
    returned as a bare number.
    """
    total = 0
    for gene in ind:
        total += gene ** 2
    return (total,)

IND_SIZE = 5

# Seed the stdlib RNG that the initialiser and the DEAP operators draw from,
# so repeated runs give the same result (the models above likewise use a
# fixed random_state).
random.seed(42)

toolbox = base.Toolbox()
# Individuals are IND_SIZE floats drawn uniformly from [-5, 5].
toolbox.register('attr_float', random.uniform, -5, 5)
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_float, n=IND_SIZE)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

# Variation, selection and evaluation operators used by eaSimple.
toolbox.register('mate', tools.cxBlend, alpha=0.5)
toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=1, indpb=0.2)
toolbox.register('select', tools.selTournament, tournsize=3)
toolbox.register('evaluate', eval_func)

pop = toolbox.population(n=20)
# eaSimple returns (final population, logbook). Capture both instead of
# discarding them, then report the best individual found.
pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=True)
best = tools.selBest(pop, k=1)[0]
print('Best individual:', best, 'fitness:', best.fitness.values[0])