# Comprehensive Diabetes Analysis and Prediction

This notebook provides a detailed analysis of the diabetes dataset and implements multiple machine learning models for prediction.

## Contents:
1. Data Loading and Initial Exploration
2. Exploratory Data Analysis (EDA)
3. Feature Engineering
4. Model Development and Comparison
5. Advanced Analysis and Insights
6. Risk Analysis and Recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from scipy import stats

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so plt.style.use('seaborn') fails
# on a current install. Use whichever name this Matplotlib actually ships so
# the notebook runs on both old and new versions.
SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(SEABORN_STYLE)
sns.set_palette("husl")

# Display settings: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Data Loading and Initial Exploration

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('diabetes_dataset.csv')

# Display basic information
print("=== Dataset Info ===")
# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() appended a stray "None" line to the output.
df.info()

print("\n=== First Few Rows ===")
display(df.head())

print("\n=== Basic Statistics ===")
display(df.describe())

print("\n=== Missing Values ===")
display(df.isnull().sum())

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Class balance: number of records per diagnosis value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Diabetes_Diagnosis', ax=ax)
ax.set_title('Distribution of Diabetes Cases')
plt.show()

# Kernel density of Age, one curve per diagnosis value
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=df, x='Age', hue='Diabetes_Diagnosis', ax=ax)
ax.set_title('Age Distribution by Diabetes Status')
plt.show()

# BMI against glucose level, points coloured by diagnosis value
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='BMI',
    y='Glucose_Level',
    hue='Diabetes_Diagnosis',
    alpha=0.6,
    ax=ax,
)
ax.set_title('BMI vs Glucose Level')
plt.show()

In [None]:
# Correlation analysis
# include='number' selects every numeric dtype. The previous explicit
# ['float64', 'int64'] list silently dropped columns stored with another width
# (int32, float32, ...) or with a nullable numeric dtype.
numerical_cols = df.select_dtypes(include='number').columns
correlation_matrix = df[numerical_cols].corr()

plt.figure(figsize=(15, 10))
# center=0 pins the neutral colour of the diverging map to zero correlation
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Variables')
plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Create new features

# BMI bands use left-closed bins so each threshold belongs to the higher band:
# [0, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, inf) Obese.
# The previous right-closed bins with 24.9 / 29.9 edges put 18.5 in
# "Underweight" and values such as 24.95 in "Overweight", and turned any BMI
# above 100 into NaN. Non-positive BMI values are masked first so they stay
# NaN, as they did before.
df['BMI_Category'] = pd.cut(
    df['BMI'].where(df['BMI'] > 0),
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

# Boundaries and closedness are unchanged (right-closed, matching the
# `> 140` test in Risk_Score below); only the top bin is now open-ended so
# readings above 200 are labelled "Stage 2" instead of becoming NaN.
# NOTE(review): clinical guidelines usually count 120/140/160 in the higher
# band - confirm whether right-closed bins are intended here.
df['BP_Category'] = pd.cut(
    df['Blood_Pressure'],
    bins=[0, 120, 140, 160, np.inf],
    labels=['Normal', 'Prehypertension', 'Stage 1', 'Stage 2'],
)

# Same boundaries as before; the open-ended top bin keeps ages above 100
# in "Elderly" rather than NaN.
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=[0, 30, 45, 60, 75, np.inf],
    labels=['Young Adult', 'Adult', 'Middle Age', 'Senior', 'Elderly'],
)

# Calculate risk score: one point per threshold exceeded, plus family history.
# Assumes Family_History_Diabetes is already numeric (e.g. 0/1) - TODO confirm;
# a text column would make this addition fail.
df['Risk_Score'] = (
    (df['BMI'] > 30).astype(int) +
    (df['Blood_Pressure'] > 140).astype(int) +
    (df['Glucose_Level'] > 126).astype(int) +
    (df['Age'] > 45).astype(int) +
    df['Family_History_Diabetes']
)

## 4. Model Development and Comparison

In [None]:
# Prepare data for modeling
def prepare_data(df, test_size=0.2, random_state=42):
    """Build scaled train/test matrices for the diabetes classifiers.

    Steps: drop the engineered categorical columns, split off the
    'Diabetes_Diagnosis' target, integer-encode any remaining object-dtype
    feature columns, split into train/test, then standardise the features
    with statistics learned from the training portion only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Diabetes_Diagnosis' and the feature columns. It is
        not modified; all work happens on copies.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` for a repeatable split.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The two feature
        matrices are NumPy arrays (column names are lost in scaling); the
        targets are pandas Series.
    """
    # errors='ignore' lets the function also accept a frame on which the
    # feature-engineering cell has not been run.
    engineered_categoricals = ['BMI_Category', 'BP_Category', 'Age_Group']
    model_frame = df.drop(columns=engineered_categoricals, errors='ignore')

    # Separate features and target
    X = model_frame.drop(columns='Diabetes_Diagnosis')
    y = model_frame['Diabetes_Diagnosis']

    # Handle categorical variables: each object column gets its own encoder
    categorical_cols = X.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Stratify on the target so train and test keep the same class
    # proportions. Stratification needs at least two rows per class, so fall
    # back to a plain random split when that does not hold.
    stratify_on = y if y.value_counts().min() >= 2 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_on,
    )

    # Scale features: fit on train only so no test statistics leak in
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

X_train_scaled, X_test_scaled, y_train, y_test = prepare_data(df)

In [None]:
# Candidate classifiers, each seeded with the same random_state
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
}

# Fit each model on the training split and keep its test-set outputs,
# keyed by model name, for the comparison cells that follow.
results = {}
for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")
    estimator.fit(X_train_scaled, y_train)
    predicted_labels = estimator.predict(X_test_scaled)

    print(f"\n{model_name} Classification Report:")
    report_text = classification_report(y_test, predicted_labels)
    print(report_text)

    # Column 1 of predict_proba is the score for the second class
    second_class_scores = estimator.predict_proba(X_test_scaled)[:, 1]
    results[model_name] = dict(
        predictions=predicted_labels,
        probabilities=second_class_scores,
    )

## 5. Advanced Analysis and Insights

In [None]:
# Overlay one ROC curve per trained model on a shared axis
fig, ax = plt.subplots(figsize=(10, 8))
for model_name, model_output in results.items():
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(
        y_test, model_output['probabilities']
    )
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        label=f'{model_name} (AUC = {area_under_curve:.2f})',
    )

# Dashed diagonal as the visual baseline
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend()
plt.show()

In [None]:
# Feature names in the order the models were trained on: every column of df
# except the target and the three engineered categorical columns.
excluded_columns = ['Diabetes_Diagnosis', 'BMI_Category', 'BP_Category', 'Age_Group']
feature_names = df.columns.drop(excluded_columns)

# Pair each feature with the fitted Random Forest's importance, highest first
random_forest = models['Random Forest']
rf_importance = pd.DataFrame(
    {
        'feature': feature_names,
        'importance': random_forest.feature_importances_,
    }
).sort_values('importance', ascending=False)

# Horizontal bars for the 15 highest-ranked features
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=rf_importance.head(15), x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Most Important Features (Random Forest)')
fig.tight_layout()
plt.show()

## 6. Risk Analysis and Recommendations

In [None]:
# Analyze risk factors: pair each row's risk score with its diagnosis
risk_analysis = pd.DataFrame({
    'Risk_Score': df['Risk_Score'],
    'Diabetes': df['Diabetes_Diagnosis']
})

plt.figure(figsize=(10, 6))
# One box per diagnosis value showing the spread of Risk_Score, as the title
# says. The axes were previously swapped; the target is numeric (its mean is
# taken below), so seaborn drew one box of the diagnosis per risk score.
sns.boxplot(data=df, x='Diabetes_Diagnosis', y='Risk_Score')
plt.title('Risk Score Distribution by Diabetes Status')
plt.show()

# Calculate risk probability by score: the mean of the target within each
# score is the observed diabetes rate, assuming a 0/1 coding - TODO confirm.
risk_prob = risk_analysis.groupby('Risk_Score')['Diabetes'].mean()
print("\nProbability of Diabetes by Risk Score:")
print(risk_prob)