# Diabetes Prediction EDA (Playground Series S5E12)

This notebook performs Exploratory Data Analysis (EDA) on the Playground Series S5E12 dataset and the external diabetes dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot appearance: seaborn's grid theme plus a wide default
# figure size for every figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Read the three competition CSVs (train and test are indexed by 'id').
# A missing file is reported rather than raised; any frame that comes after
# the missing file in the sequence below is left unassigned.
try:
    tr_00 = pd.read_csv('../input/playground-series-s5e12/train.csv', index_col='id')
    ts_00 = pd.read_csv('../input/playground-series-s5e12/test.csv', index_col='id')
    sb_00 = pd.read_csv('../input/playground-series-s5e12/sample_submission.csv')
except FileNotFoundError as e:
    print(f"❌ Error loading Playground data: {e}")
else:
    # Only reached when all three reads succeeded.
    print("✅ Playground datasets loaded successfully.")
    print(f"Train shape: {tr_00.shape}, Test shape: {ts_00.shape}")

# Load External Dataset
# The CSV is parsed exactly once. When the training frame is available the
# external frame is narrowed to the training columns; if any of those columns
# is absent, the already-loaded full frame is kept instead of re-reading the
# file from disk.
try:
    or_00 = pd.read_csv('../input/diabetes_dataset.csv')
except FileNotFoundError as e:
    print(f"❌ Error loading External data: {e}")
else:
    # Note: filtering by tr_00.columns requires tr_00 to be loaded.
    if 'tr_00' in locals():
        try:
            # On KeyError the assignment never happens, so or_00 still
            # refers to the full, unfiltered frame.
            or_00 = or_00[tr_00.columns]
        except KeyError as e:
            print(f"❌ Column mismatch error: {e}. Loading full external dataset instead.")
        else:
            print("✅ External dataset loaded and filtered successfully.")
    else:
        print("✅ External dataset loaded (full columns as tr_00 missing).")
    print(f"External shape: {or_00.shape}")

## Data Overview

In [None]:
# Summarise the training frame: column dtypes / non-null counts first, then
# descriptive statistics. Skipped when the training data was never loaded.
if 'tr_00' in locals():
    print("--- Train Data Info ---")
    # DataFrame.info() prints its report itself and returns None, so passing
    # its result to display() would only append a stray "None" to the output.
    tr_00.info()
    print("\n--- Train Data Statistics ---")
    display(tr_00.describe())

In [None]:
# Preview the first five rows of the training frame (skipped if it never loaded).
if 'tr_00' in locals():
    display(tr_00.head(n=5))

## Visualizations

In [None]:
# Heatmap of pairwise correlations between the numeric training columns.
if 'tr_00' in locals():
    # Restrict to numeric dtypes before computing the correlation matrix.
    numeric_cols = tr_00.select_dtypes(include=[np.number]).columns
    corr_matrix = tr_00[numeric_cols].corr()

    fig, ax = plt.subplots(figsize=(16, 12))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix (Train Data)')
    plt.show()

In [None]:
# Plot the class balance of the target. The target name is not hard-coded:
# the first of a few commonly used names found in the training frame is used.
if 'tr_00' in locals():
    possible_targets = ['diagnosed_diabetes', 'Outcome', 'Diabetes_012', 'target']

    target_col = None
    for candidate in possible_targets:
        if candidate in tr_00.columns:
            target_col = candidate
            break

    if not target_col:
        print("Target column not automatically identified from common names.")
    else:
        plt.figure(figsize=(10, 6))
        sns.countplot(data=tr_00, x=target_col)
        plt.title(f'Target Distribution: {target_col}')
        plt.show()

In [None]:
# Compare Distributions (Train vs External)
# Overlays a KDE of each shared column from both frames so differences between
# the two sources are visible. Runs only when both frames were loaded.
if 'tr_00' in locals() and 'or_00' in locals():
    # A column must be numeric in BOTH frames: kdeplot cannot estimate a
    # density for non-numeric data, so a column that is numeric in train but
    # not in the external file is left out instead of aborting the loop.
    common_cols = [
        c for c in tr_00.columns
        if c in or_00.columns
        and pd.api.types.is_numeric_dtype(tr_00[c])
        and pd.api.types.is_numeric_dtype(or_00[c])
    ]

    for col in common_cols[:5]:  # Plot first 5 common numeric columns
        plt.figure(figsize=(10, 5))
        sns.kdeplot(tr_00[col], label='Train', fill=True, alpha=0.3)
        sns.kdeplot(or_00[col], label='External', fill=True, alpha=0.3)
        plt.title(f'Distribution Comparison: {col}')
        plt.legend()
        plt.show()