# Diabetes Analysis Dashboard with Streamlit

This dashboard lets you interactively explore the diabetes dataset: summary statistics, per-feature distributions, a correlation heatmap, and an accuracy comparison of several classification models.

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

st.title('Diabetes Data Analysis Dashboard')

# Load data
# The path is relative, so it is resolved against the app's working directory.
# A missing file would otherwise surface as a raw traceback in the page; show
# a readable message and halt this script run instead.
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    st.error("Could not find 'diabetes.csv' in the app's working directory.")
    st.stop()

# Data cleaning
def clean_data(df):
    """Return a copy of ``df`` with placeholder zeros imputed by column medians.

    In the columns listed below, a value of 0 is treated as a missing
    measurement: it is replaced with NaN and then filled with the median of
    the remaining values in that column.

    Args:
        df: DataFrame containing at least the columns "Glucose",
            "BloodPressure", "SkinThickness", "Insulin" and "BMI".

    Returns:
        A new DataFrame with the imputed columns; the caller's ``df`` is left
        unmodified.

    Raises:
        KeyError: If any of the expected columns is missing from ``df``.
    """
    cols_with_invalid_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
    # Work on a copy so the caller's frame is not mutated as a side effect.
    cleaned = df.copy()
    masked = cleaned[cols_with_invalid_zeros].replace(0, np.nan)
    # median() skips NaN, so each median is taken over the non-zero readings only.
    cleaned[cols_with_invalid_zeros] = masked.fillna(masked.median())
    return cleaned

df = clean_data(df)

# NOTE(review): clean_data fills values with medians computed over the whole
# dataset, before the train/test split in the 'Model Comparison' branch, so
# test rows influence the imputed training values. Consider imputing after
# the split if the reported accuracies need to be leak-free.

# Sidebar for navigation
option = st.sidebar.selectbox('Select Analysis', ['Data Overview', 'Feature Distributions', 'Correlation Heatmap', 'Model Comparison'])

if option == 'Data Overview':
    # Summary statistics followed by the first rows of the cleaned data.
    st.header('Data Overview')
    st.write(df.describe())
    st.write(df.head())

elif option == 'Feature Distributions':
    st.header('Feature Distributions')
    # Every column except the last one is offered
    # (presumably the last column is the 'Outcome' target — verify against the CSV).
    feature = st.selectbox('Select Feature', df.columns[:-1])

    # Histogram with a KDE overlay for the selected feature.
    fig, ax = plt.subplots()
    sns.histplot(df[feature], kde=True, ax=ax)
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it once it
    # has been rendered so figures do not pile up across Streamlit reruns.
    plt.close(fig)

    # Box plot of the same feature.
    fig2, ax2 = plt.subplots()
    sns.boxplot(y=df[feature], ax=ax2)
    st.pyplot(fig2)
    plt.close(fig2)

elif option == 'Correlation Heatmap':
    st.header('Correlation Heatmap')
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

elif option == 'Model Comparison':
    st.header('Model Comparison')
    X = df.drop('Outcome', axis=1)
    y = df['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Insertion order determines the row order of the results table and which
    # model is reported when several share the best accuracy (max() returns
    # the first maximum it encounters).
    models = {
        'Logistic Regression': LogisticRegression(),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    }

    # Train each model on the scaled training data and score it on the test split.
    results = {}
    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train)
        results[model_name] = accuracy_score(y_test, model.predict(X_test_scaled))

    st.write(pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy']))
    best_model = max(results, key=results.get)
    st.success(f'Best Model: {best_model} (Accuracy: {results[best_model]:.4f})')