# Diabetes Risk Analysis and Model Development
## By Fahad

This notebook contains the exploratory data analysis and model development for the DiabetesGuard Pro application.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the bare 'seaborn' name, so use
# whichever of the two this installation actually provides. If neither is
# present the default Matplotlib style is left in place.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

## 1. Data Loading and Initial Exploration

In [None]:
# Read the diabetes records from the CSV file (path is relative to the notebook)
df = pd.read_csv('../data/diabetes_dataset.csv')

# Report the frame's dimensions and column names in one go, then let the
# cell's last expression render the first rows as a rich table.
print(
    f"Dataset Shape: {df.shape}\n"
    f"\nColumns: {df.columns.tolist()}\n"
    "\nSample Data:"
)
df.head()

## 2. Data Preprocessing

In [None]:
# Count the null entries in every column (shown as the cell's output)
print("Missing Values:")
df.isna().sum()

## 3. Feature Engineering

In [None]:
# Create interaction features: element-wise products of two columns.
df['Age_BMI'] = df['Age'] * df['BMI']
df['BMI_Glucose'] = df['BMI'] * df['Glucose_Level']

# Create risk categories.
# Age bands are right-closed: [0, 30], (30, 45], (45, 60], (60, inf).
# include_lowest keeps an age of exactly 0 in the first band, and the
# open-ended top edge labels ages above 100 instead of turning them into NaN.
df['Age_Risk'] = pd.cut(
    df['Age'],
    bins=[0, 30, 45, 60, np.inf],
    labels=['Low', 'Moderate', 'High', 'Very High'],
    include_lowest=True,
)

# BMI bands are left-closed to follow the usual cut-offs:
# [0, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, inf) Obese.
# With right-closed bins a BMI of exactly 18.5, 25 or 30 would land one
# category too low.
# NOTE(review): a BMI of exactly 0 now falls in 'Underweight' rather than NaN;
# if 0 encodes "missing" in this dataset, clean it before binning.
df['BMI_Risk'] = pd.cut(
    df['BMI'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False,
)

## 4. Exploratory Data Analysis

In [None]:
# Distribution of key features: one histogram (with a marginal box plot)
# per feature, coloured by diagnosis.
for column, title in (
    ('Age', 'Age Distribution by Diabetes Status'),
    ('BMI', 'BMI Distribution by Diabetes Status'),
):
    fig = px.histogram(
        df,
        x=column,
        color='Diabetes_Diagnosis',
        marginal='box',
        title=title,
    )
    fig.show()

## 5. Model Development

In [None]:
# Prepare features and target
X = df[['Age', 'BMI', 'Blood_Pressure', 'Glucose_Level', 'Exercise_Hours_Per_Week']]
y = df['Diabetes_Diagnosis']

# Split data. Stratifying on the target keeps the class proportions the same
# in the train and test sets, so the evaluation is not distorted by an
# unlucky split of an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fill missing feature values with the per-column median. The imputer is
# fitted on the training split only so no information from the test split
# leaks into preprocessing. Results are wrapped back into DataFrames so the
# scaler below is still fitted with column names.
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Scale features (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate model on the held-out split
y_pred = model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))

## 6. Feature Importance Analysis

In [None]:
# Pair each input column with the importance the fitted forest assigned to it,
# ordered from most to least important.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance).set_title(
    'Feature Importance in Diabetes Prediction'
)
plt.show()