# Advanced Diabetes Analysis and Prediction

## Contents:
1. Data Loading and Initial Analysis
2. Interactive Data Visualization
3. Advanced Feature Engineering
4. Statistical Analysis
5. Multiple Model Comparison
6. Risk Factor Analysis
7. Patient Risk Profiling

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import xgboost as xgb
import lightgbm as lgb
from scipy import stats

# Set display options
pd.set_option('display.max_columns', None)
# Matplotlib renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' in 3.6
# and removed the old name in 3.8, so use whichever name this install provides.
# If neither is available the default style is left untouched.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

## 1. Data Loading and Initial Analysis

In [None]:
# Load the dataset
df = pd.read_csv('diabetes_dataset.csv')

# Display basic information
print("Dataset Info:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" to the output.
df.info()

print("\nSample Data:")
display(df.head())

print("\nBasic Statistics:")
display(df.describe())

## 2. Interactive Data Visualization

In [None]:
def create_interactive_plots(df):
    """Show three interactive Plotly figures describing the dataset.

    The figures are, in order: an age histogram coloured by diagnosis with a
    marginal box plot, a BMI-vs-glucose scatter with an OLS trendline, and a
    2x2 grid of violin plots (one per health metric) split by diagnosis.
    Each figure is displayed via ``.show()``; nothing is returned.
    """
    diagnosis_col = 'Diabetes_Diagnosis'

    # Figure 1: age distribution within each diagnosis group
    age_hist = px.histogram(
        df,
        x='Age',
        color=diagnosis_col,
        title='Age Distribution by Diabetes Status',
        marginal='box',
    )
    age_hist.show()

    # Figure 2: BMI against glucose level, coloured by diagnosis
    bmi_glucose = px.scatter(
        df,
        x='BMI',
        y='Glucose_Level',
        color=diagnosis_col,
        title='BMI vs Glucose Level',
        trendline='ols',
    )
    bmi_glucose.show()

    # Figure 3: one violin subplot per health metric
    metrics = ['Blood_Pressure', 'Cholesterol', 'HbA1c', 'BMI']
    violin_grid = make_subplots(rows=2, cols=2, subplot_titles=metrics)

    # The x-axis labels are the same for every subplot, so build them once
    status_labels = df[diagnosis_col].map({0: 'No Diabetes', 1: 'Diabetes'})

    for position, metric in enumerate(metrics):
        # Fill the grid row by row; Plotly subplot indices start at 1
        grid_row, grid_col = divmod(position, 2)
        violin = go.Violin(
            x=status_labels,
            y=df[metric],
            name=metric,
            box_visible=True,
        )
        violin_grid.add_trace(violin, row=grid_row + 1, col=grid_col + 1)

    violin_grid.update_layout(
        height=800,
        title_text='Health Metrics Distribution by Diabetes Status',
    )
    violin_grid.show()


create_interactive_plots(df)

## 3. Advanced Feature Engineering

In [None]:
def create_advanced_features(df):
    """Return a copy of ``df`` extended with engineered risk features.

    The input frame is not modified. It must contain the columns 'BMI',
    'Blood_Pressure', 'Glucose_Level', 'Cholesterol', 'Age',
    'Exercise_Hours_Per_Week', 'Fast_Food_Intake_Per_Week',
    'Processed_Food_Intake_Per_Week' and 'Alcohol_Consumption_Per_Week';
    a missing column raises ``KeyError``.

    Added columns:
        BMI_Risk, BP_Risk, Age_Group -- categorical bins of BMI, blood
            pressure and age.
        Metabolic_Score -- count (0-4) of thresholds exceeded among BMI > 30,
            blood pressure > 130, glucose > 100 and cholesterol > 200.
        Lifestyle_Score -- exercise hours weighted x2, minus fast-food and
            processed-food intake, minus half the alcohol consumption.
        BMI_X_Age, Glucose_X_BMI -- pairwise products scaled down by 100.
    """
    df_new = df.copy()

    # Health Risk Indicators.
    # The last bin of each scale is open-ended (np.inf) so that extreme
    # readings land in the highest category instead of becoming NaN.
    # Bins are right-closed, so a value of exactly 0 (or below) is still NaN.
    df_new['BMI_Risk'] = pd.cut(
        df_new['BMI'],
        bins=[0, 18.5, 24.9, 29.9, 34.9, np.inf],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese', 'Severely_Obese'],
    )

    df_new['BP_Risk'] = pd.cut(
        df_new['Blood_Pressure'],
        bins=[0, 120, 130, 140, 180, np.inf],
        labels=['Normal', 'Elevated', 'Stage1', 'Stage2', 'Crisis'],
    )

    # Metabolic Syndrome Score: one point per exceeded threshold
    df_new['Metabolic_Score'] = (
        (df_new['BMI'] > 30).astype(int) +
        (df_new['Blood_Pressure'] > 130).astype(int) +
        (df_new['Glucose_Level'] > 100).astype(int) +
        (df_new['Cholesterol'] > 200).astype(int)
    )

    # Lifestyle Score: higher is healthier
    df_new['Lifestyle_Score'] = (
        df_new['Exercise_Hours_Per_Week'] * 2 -
        df_new['Fast_Food_Intake_Per_Week'] -
        df_new['Processed_Food_Intake_Per_Week'] -
        df_new['Alcohol_Consumption_Per_Week'] * 0.5
    )

    # Age-related risk
    df_new['Age_Group'] = pd.cut(
        df_new['Age'],
        bins=[0, 30, 45, 60, 75, np.inf],
        labels=['Young', 'Adult', 'Middle_Age', 'Senior', 'Elderly'],
    )

    # Interaction features
    df_new['BMI_X_Age'] = df_new['BMI'] * df_new['Age'] / 100
    df_new['Glucose_X_BMI'] = df_new['Glucose_Level'] * df_new['BMI'] / 100

    return df_new

# Build the feature-enriched copy of the raw dataset
df_enhanced = create_advanced_features(df)

# Preview only the columns that feature engineering added
print("New Features Created:")
display(df_enhanced.loc[:, ~df_enhanced.columns.isin(df.columns)].head())

## 4. Statistical Analysis

In [None]:
def perform_statistical_analysis(df, categorical_vars=None, numerical_vars=None,
                                 target='Diabetes_Diagnosis'):
    """Test each variable for association with the diagnosis column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``target`` and every tested variable.
    categorical_vars : list of str, optional
        Columns tested with a chi-square test of independence. Defaults to
        'Gender', 'Ethnicity', 'Smoking_Status', 'Physical_Activity_Level'.
    numerical_vars : list of str, optional
        Columns tested with an independent two-sample t-test. Defaults to
        'Age', 'BMI', 'Blood_Pressure', 'Glucose_Level', 'HbA1c'.
    target : str, optional
        Diagnosis column; rows equal to 1 form one t-test group and rows
        equal to 0 the other.

    Returns
    -------
    tuple of (dict, dict)
        ``chi_square_results`` maps variable -> {'chi2', 'p_value'} and
        ``t_test_results`` maps variable -> {'t_stat', 'p_value'}.
    """
    if categorical_vars is None:
        categorical_vars = ['Gender', 'Ethnicity', 'Smoking_Status', 'Physical_Activity_Level']
    if numerical_vars is None:
        numerical_vars = ['Age', 'BMI', 'Blood_Pressure', 'Glucose_Level', 'HbA1c']

    # Chi-square test for categorical variables
    chi_square_results = {}
    for var in categorical_vars:
        contingency = pd.crosstab(df[var], df[target])
        chi2, p_value = stats.chi2_contingency(contingency)[:2]
        chi_square_results[var] = {'chi2': chi2, 'p_value': p_value}

    # T-tests for numerical variables
    diabetic_rows = df[target] == 1
    non_diabetic_rows = df[target] == 0
    t_test_results = {}
    for var in numerical_vars:
        # Drop missing values per group: a single NaN would otherwise make the
        # whole test for this variable come back as NaN.
        diabetic = df.loc[diabetic_rows, var].dropna()
        non_diabetic = df.loc[non_diabetic_rows, var].dropna()
        t_stat, p_value = stats.ttest_ind(diabetic, non_diabetic)
        t_test_results[var] = {'t_stat': t_stat, 'p_value': p_value}

    return chi_square_results, t_test_results

# Run the chi-square and t-tests on the raw dataset
chi_square_results, t_test_results = perform_statistical_analysis(df)

# Each result dict is keyed by variable; transposing puts one variable per row
print("Chi-square Test Results:")
display(pd.DataFrame(chi_square_results).transpose())

print("\nT-test Results:")
display(pd.DataFrame(t_test_results).transpose())

## 5. Multiple Model Comparison

In [None]:
def prepare_data_for_modeling(df):
    """Encode, split and scale ``df`` for model training.

    Steps:
        1. 'category' columns (such as the bins produced by ``pd.cut``) are
           replaced by their integer category codes; missing values get -1.
        2. 'object' columns are label-encoded with a fresh ``LabelEncoder``
           per column.
        3. 'Diabetes_Diagnosis' is split off as the target.
        4. An 80/20 train/test split is made with ``random_state=42``.
        5. A ``StandardScaler`` is fitted on the training features only and
           applied to both splits.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The fitted
        scaler and encoders are not returned.
    """
    df_encoded = df.copy()

    # Categorical bins hold string labels; left as-is they reach the scaler
    # as strings and make it fail. Their codes follow the declared label order.
    for col in df_encoded.select_dtypes(include=['category']).columns:
        df_encoded[col] = df_encoded[col].cat.codes

    # Encode free-form categorical (object) variables
    for col in df_encoded.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])

    # Split features and target
    X = df_encoded.drop('Diabetes_Diagnosis', axis=1)
    y = df_encoded['Diabetes_Diagnosis']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features; fit on the training split only to avoid leaking
    # test-set statistics into the model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

# Build the scaled train/test split from the feature-enriched dataset
X_train_scaled, X_test_scaled, y_train, y_test = prepare_data_for_modeling(df_enhanced)

# Candidate classifiers, each seeded with random_state=42
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        random_state=42,
    ),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Fit every candidate and keep its test-set predictions, positive-class
# probabilities and classification report under the model's name
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the second class
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    results[name] = dict(
        predictions=y_pred,
        probabilities=y_pred_proba,
        report=classification_report(y_test, y_pred),
    )

    print(f"\n{name} Performance:")
    print(results[name]['report'])

## 6. Risk Factor Analysis

In [None]:
def analyze_risk_factors(df, model, feature_names):
    """Rank the model's feature importances and plot the 15 largest.

    ``df`` is accepted but not used. ``feature_names`` must line up
    one-to-one with ``model.feature_importances_``.

    Returns a DataFrame with columns 'feature' and 'importance', sorted by
    importance in descending order, or ``None`` when the model exposes no
    ``feature_importances_`` attribute (in which case nothing is plotted).
    """
    # Models without importances cannot be ranked
    if not hasattr(model, 'feature_importances_'):
        return None

    ranking = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    ranking = ranking.sort_values('importance', ascending=False)

    # Horizontal bar chart of the 15 strongest features
    top_features = ranking.head(15)
    plt.figure(figsize=(12, 6))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title('Top 15 Risk Factors for Diabetes')
    plt.tight_layout()
    plt.show()

    return ranking

# Rank risk factors with the Random Forest model; the feature names are every
# df_enhanced column except the target
feature_importance = analyze_risk_factors(
    df_enhanced,
    models['Random Forest'],
    df_enhanced.drop(columns='Diabetes_Diagnosis').columns,
)
print("\nTop 10 Risk Factors:")
display(feature_importance.head(10))

## 7. Patient Risk Profiling

In [None]:
def create_risk_profile(patient_data, model, feature_names, scaler=None, encoders=None):
    """
    Create a comprehensive risk profile for a patient.

    Parameters
    ----------
    patient_data : dict
        Raw patient fields keyed by column name. Must contain every column
        read by create_advanced_features() as well as 'BMI',
        'Blood_Pressure', 'Glucose_Level' and 'Exercise_Hours_Per_Week'.
    model : object
        Fitted classifier exposing ``predict_proba``.
    feature_names : sequence of str or None
        Feature columns in the order the model was trained on. The target
        column 'Diabetes_Diagnosis' is ignored if present. ``None`` skips
        the alignment and uses the patient columns as they are.
    scaler : object, optional
        Scaler already fitted on the training features; its ``transform`` is
        applied to the patient row.
    encoders : dict, optional
        Mapping of column name to the fitted encoder used during training;
        each encoder's ``transform`` is applied to the matching column.

    Returns
    -------
    dict
        Keys 'risk_probability', 'risk_level' ('High' above 0.7, 'Medium'
        above 0.3, otherwise 'Low'), 'key_factors' and 'recommendations'.

    Raises
    ------
    ValueError
        If a name in ``feature_names`` is missing from the patient's columns.
    """
    import warnings

    # Create DataFrame from patient data and apply feature engineering
    patient_df = create_advanced_features(pd.DataFrame([patient_data]))

    # Binned features carry a fixed, ordered label set, so their integer
    # codes do not depend on which rows are present.
    for col in patient_df.select_dtypes(include=['category']).columns:
        patient_df[col] = patient_df[col].cat.codes

    unreliable_steps = []

    # Encode free-form categorical columns
    for col in patient_df.select_dtypes(include=['object']).columns:
        if encoders is not None and col in encoders:
            patient_df[col] = encoders[col].transform(patient_df[col])
        else:
            # Legacy fallback: an encoder fitted on one row maps every
            # value to 0, whatever the category is.
            patient_df[col] = LabelEncoder().fit_transform(patient_df[col])
            unreliable_steps.append(f"label encoding of '{col}'")

    # The model sees features by position, so put the columns in training order
    if feature_names is not None:
        expected = [name for name in feature_names if name != 'Diabetes_Diagnosis']
        missing = [name for name in expected if name not in patient_df.columns]
        if missing:
            raise ValueError(f"patient_data is missing required features: {missing}")
        patient_df = patient_df[expected]

    # Scale features
    if scaler is not None:
        patient_scaled = scaler.transform(patient_df)
    else:
        # Legacy fallback: a scaler fitted on a single row subtracts that
        # row's own values, turning every feature into 0.
        patient_scaled = StandardScaler().fit_transform(patient_df)
        unreliable_steps.append('feature scaling')

    if unreliable_steps:
        warnings.warn(
            "create_risk_profile fitted preprocessing on the single patient row ("
            + ", ".join(unreliable_steps)
            + "); pass the scaler/encoders fitted on the training data for a "
            "meaningful risk_probability.",
            stacklevel=2,
        )

    # Make prediction: probability of the second class for the single row
    risk_probability = model.predict_proba(patient_scaled)[0][1]

    # Create risk profile
    risk_profile = {
        'risk_probability': risk_probability,
        'risk_level': 'High' if risk_probability > 0.7 else 'Medium' if risk_probability > 0.3 else 'Low',
        'key_factors': [],
        'recommendations': []
    }

    # Add risk factors (rule-based, taken from the raw patient values)
    if patient_data['BMI'] > 30:
        risk_profile['key_factors'].append('High BMI')
        risk_profile['recommendations'].append('Consider weight management program')

    if patient_data['Blood_Pressure'] > 140:
        risk_profile['key_factors'].append('High Blood Pressure')
        risk_profile['recommendations'].append('Regular blood pressure monitoring')

    if patient_data['Glucose_Level'] > 126:
        risk_profile['key_factors'].append('High Glucose')
        risk_profile['recommendations'].append('Regular glucose monitoring')

    # Low activity adds a recommendation only, not a key factor
    if patient_data['Exercise_Hours_Per_Week'] < 2.5:
        risk_profile['recommendations'].append('Increase physical activity')

    return risk_profile

# Example patient
example_patient = {
    'Age': 45,
    'Gender': 'Male',
    'BMI': 28.5,
    'Blood_Pressure': 130,
    'Glucose_Level': 100,
    'HbA1c': 5.7,
    'Exercise_Hours_Per_Week': 3,
    'Physical_Activity_Level': 'Moderate',
    # Illustrative values for the other inputs create_advanced_features()
    # reads; without them it raises KeyError.
    'Cholesterol': 190,
    'Fast_Food_Intake_Per_Week': 2,
    'Processed_Food_Intake_Per_Week': 3,
    'Alcohol_Consumption_Per_Week': 1,
    # Add other required fields...
    # NOTE(review): the model expects every feature column of the training
    # data; any remaining dataset columns still have to be filled in here.
}

# Create risk profile
# The model was trained without the target column, so exclude it from the
# feature list as well.
risk_profile = create_risk_profile(
    example_patient,
    models['Random Forest'],
    df_enhanced.drop('Diabetes_Diagnosis', axis=1).columns,
)
print("Patient Risk Profile:")
display(risk_profile)