# Some basic Exploratory Data Analysis

In [3]:
import plotly.express as px
import plotly.io as pio
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes

url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

corr = df.drop("Outcome", axis=1).corr()
top_features = corr.abs().mean().sort_values(ascending=False).head(5).index
corr_subset = df[top_features].corr()

fig_corr = px.imshow(
    corr_subset,
    text_auto=True,
    color_continuous_scale="RdBu",
    title="Correlation Heatmap (Top 5 Features)",
    template="plotly_dark"
)
pio.write_json(fig_corr, "eda_correlation.json")

fig_dist = px.histogram(
    df,
    x="Glucose",
    color="Outcome",
    barmode="overlay",
    nbins=40,
    title="Distribution of Glucose Feature by Outcome Class",
    template="plotly_dark"
)
pio.write_json(fig_dist, "eda_glucose_hist.json")

df["Outcome_str"] = df["Outcome"].astype(str)

fig_scatter = px.scatter(
    df,
    x="Insulin",
    y="Pregnancies",
    color="Outcome_str",
    title="Insulin vs Pregnancies by Outcome Class",
    template="plotly_dark"
)
fig_scatter.update_layout(legend_title_text='Outcome')
pio.write_json(fig_scatter, "eda_scatter.json")


fig_box = px.box(
    df,
    x="Outcome",
    y="Age",
    points="all",
    color="Outcome",
    title="Age by Outcome Class",
    template="plotly_dark"
)
pio.write_json(fig_box, "eda_box_age.json")


fig_corr.show()
fig_dist.show()
fig_scatter.show()
fig_box.show()

# Model Training and Performance Evaluation

In [2]:
import pickle
import json
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from xgboost import XGBClassifier


url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

feature_names = df.drop("Outcome", axis=1).columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_scaled, y_train)

xgb_model = XGBClassifier(
    use_label_encoder=False, eval_metric="logloss", random_state=42
)
xgb_model.fit(X_train, y_train)

mlp = MLPClassifier(
    hidden_layer_sizes=(32, 16),
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train_scaled, y_train)


metrics = {}

def evaluate_model(name, model, X_test_in, y_test_in):
    y_pred = model.predict(X_test_in)
    y_proba = model.predict_proba(X_test_in)[:, 1]

    metrics[name] = {
        "accuracy": accuracy_score(y_test_in, y_pred),
        "f1": f1_score(y_test_in, y_pred),
        "auc": roc_auc_score(y_test_in, y_proba),
    }
    return y_pred

y_pred_logreg = evaluate_model("Logistic Regression", logreg, X_test_scaled, y_test)
y_pred_xgb = evaluate_model("XGBoost", xgb_model, X_test, y_test)
y_pred_mlp = evaluate_model("Neural Network", mlp, X_test_scaled, y_test)


models = list(metrics.keys())
accuracies = [metrics[m]["accuracy"] for m in models]
f1s = [metrics[m]["f1"] for m in models]
aucs = [metrics[m]["auc"] for m in models]

metrics_fig = go.Figure()
metrics_fig.add_trace(go.Bar(x=models, y=accuracies, name="Accuracy"))
metrics_fig.add_trace(go.Bar(x=models, y=f1s, name="F1 Score"))
metrics_fig.add_trace(go.Bar(x=models, y=aucs, name="ROC AUC"))

metrics_fig.update_layout(
    barmode="group",
    yaxis=dict(title="Score"),
    template="plotly_dark",
)
pio.write_json(metrics_fig, "metrics_bar.json")


cm_logreg = confusion_matrix(y_test, y_pred_logreg)

cm_fig_logreg = px.imshow(
    cm_logreg,
    text_auto=True,
    color_continuous_scale="Blues",
    labels=dict(x="Predicted", y="Actual"),
    title="Logistic Regression Confusion Matrix",
    template="plotly_dark"
)
pio.write_json(cm_fig_logreg, "cm_logreg.json")

y_pred_xgb = xgb_model.predict(X_test)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
cm_fig_xgb = px.imshow(
    cm_xgb,
    text_auto=True,
    color_continuous_scale="Blues",
    labels=dict(x="Predicted", y="Actual"),
    title="XGBoost Confusion Matrix",
    template="plotly_dark"
)
pio.write_json(cm_fig_xgb, "cm_xgb.json")

y_pred_mlp = mlp.predict(X_test_scaled)
cm_mlp = confusion_matrix(y_test, y_pred_mlp)
cm_fig_mlp = px.imshow(
    cm_mlp,
    text_auto=True,
    color_continuous_scale="Blues",
    labels=dict(x="Predicted", y="Actual"),
    title="Neural Network Confusion Matrix",
    template="plotly_dark"
)
pio.write_json(cm_fig_mlp, "cm_mlp.json")

lr_importances = np.abs(logreg.coef_[0])
lr_top_idx = np.argsort(lr_importances)[-10:][::-1]
lr_top_features = feature_names[lr_top_idx]
lr_top_values = lr_importances[lr_top_idx]

lr_fig = go.Figure()
lr_fig.add_trace(go.Bar(x=lr_top_features, y=lr_top_values, name="LR Coefficients"))
lr_fig.update_layout(
    title="Logistic Regression Feature Importance",
    xaxis=dict(title="Features", tickangle=45),
    yaxis=dict(title="Absolute Coefficient"),
    template="plotly_dark"
)
pio.write_json(lr_fig, "feature_importance_lr.json")


xgb_importances = xgb_model.feature_importances_
xgb_top_idx = np.argsort(xgb_importances)[-10:][::-1]
xgb_top_features = feature_names[xgb_top_idx]
xgb_top_values = xgb_importances[xgb_top_idx]

xgb_fig = go.Figure()
xgb_fig.add_trace(go.Bar(x=xgb_top_features, y=xgb_top_values, name="XGB Importances"))
xgb_fig.update_layout(
    title="XGBoost Feature Importance",
    xaxis=dict(title="Features", tickangle=45),
    yaxis=dict(title="Importance"),
    template="plotly_dark"
)
pio.write_json(xgb_fig, "feature_importance_xgb.json")


with open("logreg.pkl", "wb") as f:
    pickle.dump(logreg, f)

with open("xgb.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

with open("mlp.pkl", "wb") as f:
    pickle.dump(mlp, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)


metrics_fig.show()
lr_fig.show()
xgb_fig.show()

cm_fig_logreg.show()
cm_fig_mlp.show()
cm_fig_xgb.show()


Parameters: { "use_label_encoder" } are not used.



Stochastic Optimizer: Maximum iterations (500) reached and the optimization hasn't converged yet.

