# Import Basic Libraries 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Load the dataset into dataframe

In [2]:
from pathlib import Path

# Check the CSV location up front so that a missing file is reported with the
# absolute path that was actually searched. The bare relative name alone gives
# no hint about which working directory the kernel was started in.
csv_path = Path("diabetes.csv")
if not csv_path.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{csv_path.resolve()}'. Place diabetes.csv in the "
        "working directory of the notebook or point csv_path at its location."
    )

# Load the dataset into a DataFrame; every later cell reads from `data`.
data = pd.read_csv(csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
data.columns

# Exploratory Data Analysis

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

In [None]:
# Count the missing values in each column (isnull is pandas' alias of isna).
data.isnull().sum()

In [None]:
# Summary statistics, transposed so that each original column becomes a row.
data.describe().transpose()

In [None]:
# Print the frame's summary (column dtypes, non-null counts, memory usage).
data.info()

In [None]:
# Pairwise correlation of the columns; Pearson is the pandas default, spelled out here.
data.corr(method="pearson")

In [None]:
# Pairwise column correlations, rendered as an annotated heatmap.
correlation_matrix = data.corr()

# Create the 8x8 inch figure and its axes explicitly, then draw onto those axes.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Number of fully duplicated rows. The raw boolean mask from duplicated() is
# one entry per row and gets truncated on display, so it cannot answer whether
# any duplicates exist; summing it does (True counts as 1).
data.duplicated().sum()

# Data Visualization

In [None]:
# Box plot of Glucose, one box per Outcome value.
sns.boxplot(data=data, x='Outcome', y='Glucose')
plt.show()


In [None]:
# Bar plot of Glucose aggregated per Outcome value (seaborn's default estimator).
sns.barplot(data=data, x='Outcome', y='Glucose')
plt.show()

In [None]:
# Histogram of the Pregnancies column.
data["Pregnancies"].hist()

In [None]:
# Histogram of the Glucose column.
data["Glucose"].hist()

In [None]:
# Target vector: the Outcome column. Feature matrix: every remaining column.
y = data["Outcome"]
X = data.drop(columns=["Outcome"])

# Split and scale the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation.
# - random_state pins the shuffle, so the metrics computed from this split are
#   the same on every run (the unseeded split changed them each time).
# - stratify=y keeps the Outcome class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, so the scaling
# parameters never see the test rows.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the training-derived scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Now, X_train_scaled and X_test_scaled contain the scaled features


# Build and Train the model 

### Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest, trained on the scaled training features
# (fit returns the estimator itself, so construction and training chain).
RFC = RandomForestClassifier(random_state=0).fit(X_train_scaled, y_train)

# Predict the held-out split; accuracy is kept as a percentage with two decimals.
pred_rfc = RFC.predict(X_test_scaled)
RFC_Accuracy = round(accuracy_score(y_test, pred_rfc) * 100, 2)

print("Accuracy score for RandomForestClassifier is: ", RFC_Accuracy)
print(classification_report(y_test, pred_rfc))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree can come out differently (and score differently) on each
# run. random_state=0 matches the seed used for the other seeded estimators.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(X_train_scaled, y_train)

# Predict the held-out split; accuracy is kept as a percentage with two decimals.
pred_dt = DT.predict(X_test_scaled)
DT_Accuracy = accuracy_score(y_test, pred_dt)
DT_Accuracy = round(DT_Accuracy * 100, 2)
print("Accuracy score for Decission tree classifier is: ", DT_Accuracy )
print(classification_report(y_test, pred_dt))

### XGBoost classifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default settings,
# trained on the scaled training features.
XGB = XGBClassifier()
XGB.fit(X_train_scaled, y_train)

# Predict the held-out split; accuracy is kept as a percentage with two decimals.
pred_xgb = XGB.predict(X_test_scaled)
XGB_Accuracy = round(accuracy_score(y_test, pred_xgb) * 100, 2)

print("Accuracy score for XGBoost classifier is:", XGB_Accuracy)
print(classification_report(y_test, pred_xgb))


### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier fitted on the scaled training features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict the held-out split and express accuracy as a percentage (two decimals).
pred_knn = knn.predict(X_test_scaled)
knn_accuracy = round(accuracy_score(y_test, pred_knn) * 100, 2)
print("Accuracy score for k-Nearest Neighbors classifier is:", knn_accuracy)

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, pred_knn))


In [None]:
from sklearn.linear_model import LogisticRegression

# Seeded logistic regression fitted on the scaled training features.
logreg = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)

# Predict the held-out split and express accuracy as a percentage (two decimals).
pred_logreg = logreg.predict(X_test_scaled)
logreg_accuracy = round(accuracy_score(y_test, pred_logreg) * 100, 2)
print("Accuracy score for Logistic Regression is:", logreg_accuracy)

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, pred_logreg))


# Summary of the Algorithms' Accuracy

In [None]:
# Collect every model's test-set accuracy (in percent) into one table.
# The dict is deliberately not called `data`: that name holds the dataset
# DataFrame, and rebinding it here would break any earlier cell that is
# re-run afterwards (data.head(), data.corr(), the X / y split, ...).
summary_data = {
    'Algorithm': [
        'Random Forest Classifier',
        'Decision Tree Classifier',
        'XGBoost Classifier',
        'KNeighborsClassifier',
        'Logistic Regression',
    ],
    'Accuracy': [RFC_Accuracy, DT_Accuracy, XGB_Accuracy, knn_accuracy, logreg_accuracy],
}

summary = pd.DataFrame(summary_data)

summary.head()

In [None]:
import numpy as np

def plot_confusion_matrix_percentage(cm, title, class_names):
    """Draw a confusion matrix as a heatmap of row-wise percentages.

    Every cell is divided by the total of its row, so the cells of one row
    add up to 100 %. A row whose total is zero is shown as 0 % rather than
    NaN. The annotation grid follows the shape of ``cm`` instead of assuming
    a 2x2 matrix.

    Parameters
    ----------
    cm : array-like, two-dimensional
        Confusion-matrix counts. Nested lists are accepted as well as arrays.
    title : str
        Title placed above the heatmap.
    class_names : iterable of str
        One annotation label per cell, in row-major order. At least
        ``cm.size`` labels are required; any extra labels are ignored.

    Raises
    ------
    ValueError
        If ``cm`` is not two-dimensional, or if ``class_names`` has fewer
        labels than ``cm`` has cells.
    """
    counts = np.asarray(cm, dtype=float)
    if counts.ndim != 2:
        raise ValueError(f"cm must be two-dimensional, got {counts.ndim} dimension(s)")

    # Row-normalise. `where` skips rows whose total is zero, leaving the
    # pre-filled 0.0 from `out` in place instead of producing NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals != 0,
    ) * 100

    cell_names = list(class_names)
    if len(cell_names) < cm_percent.size:
        raise ValueError(
            f"class_names must provide at least {cm_percent.size} labels "
            f"(one per cell), got {len(cell_names)}"
        )

    # zip stops at the shorter sequence, so surplus labels are simply ignored.
    labels = np.array(
        [f"{desc}\n{percent:.2f}%" for desc, percent in zip(cell_names, cm_percent.ravel())]
    ).reshape(cm_percent.shape)

    # fmt='' because the annotations are already fully formatted strings.
    sns.heatmap(cm_percent, annot=labels, fmt='', cmap='Blues', cbar=False)

    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


# One label per confusion-matrix cell, listed in row-major order.
class_names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']

# Confusion matrix of every classifier on the held-out split.
cm_rfc = confusion_matrix(y_test, pred_rfc)
cm_dt = confusion_matrix(y_test, pred_dt)
cm_xgb = confusion_matrix(y_test, pred_xgb)
cm_knn = confusion_matrix(y_test, pred_knn)
cm_logreg = confusion_matrix(y_test, pred_logreg)

# Plot them one after another, in the same model order as above.
for matrix, heading in (
    (cm_rfc, 'Confusion Matrix - Random Forest Classifier'),
    (cm_dt, 'Confusion Matrix - Decision Tree Classifier'),
    (cm_xgb, 'Confusion Matrix - XGBoost Classifier'),
    (cm_knn, 'Confusion Matrix - KNeighbors Classifier'),
    (cm_logreg, 'Confusion Matrix - Logistic Regression'),
):
    plot_confusion_matrix_percentage(matrix, heading, class_names)


In [None]:
from sklearn.metrics import roc_curve, auc

# Class-probability estimates of every fitted model on the scaled test split,
# unpacked in the same order as the models are listed.
(
    pred_proba_rfc,
    pred_proba_dt,
    pred_proba_xgb,
    pred_proba_knn,
    pred_proba_logreg,
) = (model.predict_proba(X_test_scaled) for model in (RFC, DT, XGB, knn, logreg))


def plot_roc_curve(fpr, tpr, label, auc_score):
    """Add one ROC curve to the current matplotlib axes.

    The legend entry is ``label`` followed by the AUC rounded to two decimals.
    ``fpr`` and ``tpr`` are the x and y values of the curve.
    """
    legend_entry = f'{label} (AUC = {auc_score:.2f})'
    plt.plot(fpr, tpr, label=legend_entry)

# ROC curve of every classifier, computed from the probability in column
# index 1 of its predict_proba output (the second class).
fpr_rfc, tpr_rfc, _ = roc_curve(y_test, pred_proba_rfc[:, 1])
fpr_dt, tpr_dt, _ = roc_curve(y_test, pred_proba_dt[:, 1])
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, pred_proba_xgb[:, 1])
fpr_knn, tpr_knn, _ = roc_curve(y_test, pred_proba_knn[:, 1])
fpr_logreg, tpr_logreg, _ = roc_curve(y_test, pred_proba_logreg[:, 1])

# Area under each of those curves.
auc_rfc = auc(fpr_rfc, tpr_rfc)
auc_dt = auc(fpr_dt, tpr_dt)
auc_xgb = auc(fpr_xgb, tpr_xgb)
auc_knn = auc(fpr_knn, tpr_knn)
auc_logreg = auc(fpr_logreg, tpr_logreg)

# Draw the curves in a fixed order so the legend order stays the same.
for curve_fpr, curve_tpr, curve_label, curve_auc in (
    (fpr_rfc, tpr_rfc, 'Random Forest Classifier', auc_rfc),
    (fpr_dt, tpr_dt, 'Decision Tree Classifier', auc_dt),
    (fpr_xgb, tpr_xgb, 'XGBoost Classifier', auc_xgb),
    (fpr_knn, tpr_knn, 'KNeighbors Classifier', auc_knn),
    (fpr_logreg, tpr_logreg, 'Logistic Regression', auc_logreg),
):
    plot_roc_curve(curve_fpr, curve_tpr, curve_label, curve_auc)

# Dashed diagonal as the reference line, then axis labels, title and legend.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Baseline')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Different Classifiers')
plt.legend()
plt.show()
