# In [None]:
# Diabetes Data Preprocessing - Complete 5 Phase Pipeline

# ==================== IMPORTS & SETUP ====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
import os
import requests
import io
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Set style for better visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

print("‚úÖ All libraries imported successfully!")

# ==================== FILE HANDLING ====================

# Alternative column names (alias -> standard name) that are normalised so the
# rest of the pipeline can rely on one naming scheme.
_COLUMN_ALIASES = {
    'Diastolic_BP': 'BloodPressure',
    'Skin_Fold': 'SkinThickness',
    'Serum_Insulin': 'Insulin',
    'Diabetes_Pedigree': 'DiabetesPedigreeFunction',
    'Class': 'Outcome',
}


def _standardize_columns(df):
    """Rename known alias columns to their standard names.

    An alias is renamed only when the standard name is absent, so a file that
    already has the standard column is never clobbered.

    Returns:
        (df, column_mapping): the (possibly renamed) frame and the mapping
        that was applied (empty when nothing needed renaming).
    """
    column_mapping = {
        alias: standard
        for alias, standard in _COLUMN_ALIASES.items()
        if standard not in df.columns and alias in df.columns
    }
    if column_mapping:
        df = df.rename(columns=column_mapping)
    return df, column_mapping


def find_and_load_dataset():
    """Find and load the diabetes dataset with multiple fallback options.

    Resolution order:
      1. The first existing file among a fixed list of local candidate paths.
      2. A download of the Pima Indians diabetes CSV, cached as 'diabetes.csv'.
      3. A randomly generated 768-row sample (seed 42), also saved as
         'diabetes.csv'.

    Returns:
        (df, path): the loaded DataFrame and the path it was read from or
        saved to.
    """
    # Possible file names and paths, checked in order
    possible_paths = [
        'diabetes.csv',
        'Diabetes Missing Data.csv',
        '../diabetes.csv',
        '../Diabetes Missing Data.csv',
        './diabetes.csv',
        './Diabetes Missing Data.csv',
        'data/diabetes.csv',
        '../data/diabetes.csv',
    ]

    for path in possible_paths:
        if not os.path.exists(path):
            continue

        df = pd.read_csv(path)
        print(f"‚úÖ Dataset found at: {path}")

        df, column_mapping = _standardize_columns(df)
        if column_mapping:
            print(f"‚úÖ Renamed columns: {column_mapping}")

        return df, path

    # If no local file found, download from web
    print(" No local file found. Downloading from web...")
    try:
        url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
        column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status() stops an HTTP error page (404/500
        # body) from being parsed and cached as if it were the dataset.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        df = pd.read_csv(io.StringIO(response.text), names=column_names)

        # Save for future use
        df.to_csv('diabetes.csv', index=False)
        print(" Dataset downloaded and saved as 'diabetes.csv'")
        return df, 'diabetes.csv'

    except Exception as e:
        # Deliberate best-effort boundary: any download/parse/save failure is
        # reported and the demo falls back to synthetic data.
        print(f" Download failed: {e}")
        print(" Creating sample dataset for demonstration...")

        # Create sample data (the order of the random draws is kept fixed so
        # the seeded output stays reproducible)
        np.random.seed(42)
        n_samples = 768

        data = {
            'Pregnancies': np.random.randint(0, 15, n_samples),
            'Glucose': np.random.randint(50, 200, n_samples),
            'BloodPressure': np.random.randint(50, 110, n_samples),
            'SkinThickness': np.random.randint(10, 50, n_samples),
            'Insulin': np.random.randint(0, 200, n_samples),
            'BMI': np.random.uniform(20, 45, n_samples),
            'DiabetesPedigreeFunction': np.random.uniform(0.1, 2.5, n_samples),
            'Age': np.random.randint(20, 70, n_samples),
            'Outcome': np.random.randint(0, 2, n_samples)
        }

        df = pd.DataFrame(data)
        # NOTE(review): the synthetic sample is written to 'diabetes.csv',
        # which is the first path checked above, so later runs will load it
        # as if it were the real dataset. Delete the file to retry the download.
        df.to_csv('diabetes.csv', index=False)
        print("‚úÖ Sample dataset created and saved as 'diabetes.csv'")
        return df, 'diabetes.csv'

# Load the dataset
df, file_path = find_and_load_dataset()
print(f"üìä Dataset shape: {df.shape}")
print(f"üìù Actual columns: {df.columns.tolist()}")

# ==================== PHASE 1: DATA COLLECTION & UNDERSTANDING ====================
# Prints structural information about the raw frame, counts zeros in the
# biological measurements (treated later as missing values), and reports and
# plots the class balance of the target column.

print("=" * 70)
print("PHASE 1: DATA COLLECTION & UNDERSTANDING")
print("=" * 70)

print("‚úÖ Dataset loaded successfully!")
print(f" Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Display basic information
print("\n" + "‚îÄ" * 50)
print(" DATASET INFORMATION")
print("‚îÄ" * 50)
df.info()

# Basic Statistics
print("\n" + "‚îÄ" * 50)
print(" BASIC STATISTICS")
print("‚îÄ" * 50)
print(df.describe())

# First few rows
print("\n" + "‚îÄ" * 50)
print(" FIRST 5 ROWS")
print("‚îÄ" * 50)
print(df.head())

# Data Quality Assessment
print("\n" + "=" * 50)
print(" DATA QUALITY ASSESSMENT")
print("=" * 50)

# Biological features where zero is impossible - check which ones exist
biological_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
available_bio_features = [feature for feature in biological_features if feature in df.columns]

print(" Zeros in Biological Features (Potential Missing Values):")
zero_summary = {}
for feature in available_bio_features:
    zero_count = (df[feature] == 0).sum()
    percentage = (zero_count / len(df)) * 100
    zero_summary[feature] = zero_count
    print(f"   üî∏ {feature}: {zero_count} zeros ({percentage:.2f}%)")

# Check class distribution - find target column (first match wins)
target_col = next(
    (candidate for candidate in ['Outcome', 'Class'] if candidate in df.columns),
    None,
)

if target_col:
    print(f"\n Target Variable Distribution ({target_col}):")
    class_dist = df[target_col].value_counts()
    print(class_dist)

    # value_counts() orders by frequency, not by label. Sorting by label makes
    # the ratio and the pie labels refer to the right class regardless of
    # which class is the majority, and positional access avoids a KeyError
    # when the labels are not literally 0 and 1.
    class_by_label = class_dist.sort_index()
    if len(class_by_label) > 1:
        imbalance_ratio = class_by_label.iloc[0] / class_by_label.iloc[1]
        print(f" Imbalance Ratio: {imbalance_ratio:.2f}:1")
    else:
        print(" Only one class found in target variable")

    # Visualize class distribution
    class_names = {0: 'Non-diabetic', 1: 'Diabetic'}
    pie_labels = [class_names.get(label, str(label)) for label in class_by_label.index]

    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    sns.countplot(data=df, x=target_col, palette=['skyblue', 'salmon'])
    plt.title('Class Distribution\n(0 = Non-diabetic, 1 = Diabetic)')
    plt.xlabel(target_col)
    plt.ylabel('Count')

    plt.subplot(1, 2, 2)
    plt.pie(class_by_label, labels=pie_labels, autopct='%1.1f%%',
            colors=['lightblue', 'lightcoral'], startangle=90)
    plt.title('Class Distribution (%)')

    plt.tight_layout()
    plt.show()
else:
    print(" No target variable found in dataset")

print(f"\n Duplicate rows: {df.duplicated().sum()}")
print(f" Data types are consistent: {all(df.dtypes != 'object')}")
print("\n PHASE 1 COMPLETED - Ready for Data Cleaning!")

# ==================== PHASE 2: DATA CLEANING ====================
# Works on a copy of the raw frame: zeros in the biological measurements are
# turned into NaN and imputed (median or KNN), then every numeric feature is
# capped to its 1.5*IQR fences.

print("\n" + "=" * 70)
print("PHASE 2: DATA CLEANING")
print("=" * 70)

df_cleaned = df.copy()
print(" Created working copy of the dataset")

# 1. Handle Missing Values
print("\n" + "‚îÄ" * 50)
print("1. üõ†Ô∏è HANDLING MISSING VALUES")
print("‚îÄ" * 50)

print(" Replacing impossible zeros with NaN...")
for feature in available_bio_features:
    zero_count = (df_cleaned[feature] == 0).sum()
    if zero_count > 0:
        df_cleaned[feature] = df_cleaned[feature].replace(0, np.nan)
        print(f"   ‚úÖ {feature}: {zero_count} zeros replaced with NaN")

# Check missing values after replacement
print("\n Missing values after zero replacement:")
missing_after = df_cleaned.isnull().sum()
print(missing_after[missing_after > 0])

# Visualize missing values. msno.matrix creates its own figure, so the size is
# passed to it directly instead of opening a separate (empty) figure first.
msno.matrix(df_cleaned, figsize=(12, 6))
plt.title('Missing Values Pattern After Zero Replacement')
plt.show()

# Apply imputation strategy
print("\nüîß Applying imputation strategy...")

# For glucose, blood pressure, BMI - use median
median_features = ['Glucose', 'BloodPressure', 'BMI']
for feature in median_features:
    if feature in df_cleaned.columns:
        median_val = df_cleaned[feature].median()
        # Assign the result back: an in-place fillna on the column selection
        # is a chained assignment and does not update the frame under pandas
        # Copy-on-Write.
        df_cleaned[feature] = df_cleaned[feature].fillna(median_val)
        print(f"    {feature}: Imputed with median ({median_val:.2f})")

# For skin thickness and insulin - use KNN imputer
# (neighbours are computed from these two columns only)
knn_features = ['SkinThickness', 'Insulin']
available_knn = [feature for feature in knn_features if feature in df_cleaned.columns]

if available_knn:
    knn_imputer = KNNImputer(n_neighbors=5)
    df_cleaned[available_knn] = knn_imputer.fit_transform(df_cleaned[available_knn])
    print(f"    {available_knn}: Imputed using KNN")

# 2. Handle Outliers
print("\n" + "‚îÄ" * 50)
print("2. üìè HANDLING OUTLIERS")
print("‚îÄ" * 50)

print(" Detecting and treating outliers using IQR method...")
numerical_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                      'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
available_numerical = [feature for feature in numerical_features if feature in df_cleaned.columns]

outlier_count = 0
for feature in available_numerical:
    Q1 = df_cleaned[feature].quantile(0.25)
    Q3 = df_cleaned[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count outliers before treatment
    outside_fences = (df_cleaned[feature] < lower_bound) | (df_cleaned[feature] > upper_bound)
    outliers_before = outside_fences.sum()
    outlier_count += outliers_before

    # Cap outliers to the fences rather than dropping rows
    df_cleaned[feature] = np.clip(df_cleaned[feature], lower_bound, upper_bound)

print(f"‚úÖ Treated {outlier_count} outliers using IQR capping method")

# Show outlier treatment comparison: raw histograms on the top row, cleaned
# histograms directly below them.
plt.figure(figsize=(15, 10))
for i, feature in enumerate(['Glucose', 'BMI', 'Insulin'], 1):
    if feature in df_cleaned.columns:
        plt.subplot(2, 3, i)
        plt.hist(df[feature], bins=30, alpha=0.7, color='blue', label='Original')
        plt.title(f'Original {feature}')
        plt.xlabel(feature)

        plt.subplot(2, 3, i + 3)
        plt.hist(df_cleaned[feature], bins=30, alpha=0.7, color='green', label='Cleaned')
        plt.title(f'Cleaned {feature}')
        plt.xlabel(feature)

plt.tight_layout()
plt.show()

print(f"\n Cleaned dataset shape: {df_cleaned.shape}")
print("PHASE 2 COMPLETED - Data cleaned successfully!")

# ==================== PHASE 3: DATA TRANSFORMATION ====================
# Derives binned categorical features from Age/BMI/Glucose, encodes them as
# integers, and standardises the numeric features (a MinMax-scaled copy is
# built only for the comparison plot).

print("\n" + "=" * 70)
print("PHASE 3: DATA TRANSFORMATION")
print("=" * 70)

df_transformed = df_cleaned.copy()

# 1. Feature Engineering
print("\n" + "‚îÄ" * 50)
print("1. üîß FEATURE ENGINEERING")
print("‚îÄ" * 50)

print(" Creating new features...")

# Age groups
if 'Age' in df_transformed.columns:
    bins = [0, 30, 45, 60, 100]
    labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']
    df_transformed['Age_Group'] = pd.cut(df_transformed['Age'], bins=bins, labels=labels)
    print("    Created Age_Group feature")

# BMI categories
if 'BMI' in df_transformed.columns:
    bmi_bins = [0, 18.5, 25, 30, 100]
    bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
    df_transformed['BMI_Category'] = pd.cut(df_transformed['BMI'], bins=bmi_bins, labels=bmi_labels)
    print("    Created BMI_Category feature")

# Glucose categories
if 'Glucose' in df_transformed.columns:
    glucose_bins = [0, 70, 99, 125, 200, 300]
    glucose_labels = ['Low', 'Normal', 'Prediabetic', 'Diabetic', 'High Diabetic']
    df_transformed['Glucose_Category'] = pd.cut(df_transformed['Glucose'], bins=glucose_bins, labels=glucose_labels)
    print("   Created Glucose_Category feature")

print(" Feature engineering completed!")
new_features = [col for col in df_transformed.columns if col not in df_cleaned.columns]
print(f" New features: {new_features}")

# 2. Encoding
print("\n" + "‚îÄ" * 50)
print("2.  ENCODING CATEGORICAL FEATURES")
print("‚îÄ" * 50)

print(" Encoding categorical features...")
categorical_features = ['Age_Group', 'BMI_Category', 'Glucose_Category']
available_categorical = [feature for feature in categorical_features if feature in df_transformed.columns]

for feature in available_categorical:
    # pd.cut produces an ordered categorical, so its integer codes follow the
    # bin order (e.g. Young=0 < Middle-aged=1 < Senior=2 < Elderly=3).
    # LabelEncoder would instead number the labels alphabetically and lose
    # that ordering. Values outside every bin get code -1.
    df_transformed[f'{feature}_Encoded'] = df_transformed[feature].cat.codes.astype(int)
    print(f"    Encoded {feature}")

print(" All categorical features encoded!")

# 3. Feature Scaling
print("\n" + "‚îÄ" * 50)
print("3. ‚öñÔ∏è FEATURE SCALING")
print("‚îÄ" * 50)

print(" Comparing StandardScaler vs MinMaxScaler...")

scaler_standard = StandardScaler()
df_standard = df_transformed.copy()

scaler_minmax = MinMaxScaler()
df_minmax = df_transformed.copy()

# Both scalers work column-wise, so fitting once on all numeric columns gives
# the same values as refitting per column, and leaves each scaler fitted on
# the full feature set rather than only on the last column.
if available_numerical:
    df_standard[available_numerical] = scaler_standard.fit_transform(df_standard[available_numerical])
    df_minmax[available_numerical] = scaler_minmax.fit_transform(df_minmax[available_numerical])

print(" Both scaling methods applied for comparison")

# Show scaling comparison
if 'Glucose' in df_transformed.columns:
    plt.figure(figsize=(15, 5))

    comparison_panels = [
        (df_transformed, 'blue', 'Original Glucose'),
        (df_standard, 'green', 'StandardScaler Glucose'),
        (df_minmax, 'red', 'MinMaxScaler Glucose'),
    ]
    for position, (frame, color, title) in enumerate(comparison_panels, 1):
        plt.subplot(1, 3, position)
        plt.hist(frame['Glucose'], bins=30, alpha=0.7, color=color)
        plt.title(title)
        plt.xlabel('Glucose')

    plt.tight_layout()
    plt.show()

# Choose StandardScaler (better for algorithms assuming normal distribution)
df_transformed = df_standard
print("\n Selected StandardScaler for final dataset")

print(f"\n Transformed dataset shape: {df_transformed.shape}")
print("PHASE 3 COMPLETED - Data transformed successfully!")

# ==================== PHASE 4: DATA REDUCTION ====================
# Ranks features by mutual information with the target, keeps the top six,
# and projects them onto the principal components that explain 95% of the
# variance. Without a target column the phase is skipped and df_reduced is
# simply the transformed frame.

print("\n" + "=" * 70)
print("PHASE 4: DATA REDUCTION")
print("=" * 70)

# 1. Feature Selection
print("\n" + "‚îÄ" * 50)
print("1. üîç FEATURE SELECTION")
print("‚îÄ" * 50)

# Prepare features and target
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
                   'Age_Group_Encoded', 'BMI_Category_Encoded', 'Glucose_Category_Encoded']
available_features = [col for col in feature_columns if col in df_transformed.columns]

X = df_transformed[available_features]
y = df_transformed[target_col] if target_col else None

if y is not None:
    # Score every feature with mutual information. The estimator is
    # randomised, so the seed is fixed to keep the ranking (and therefore the
    # selected features) reproducible between runs.
    selector = SelectKBest(
        score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
        k='all',
    )
    selector.fit(X, y)

    # Feature importance scores
    feature_scores = pd.DataFrame({
        'Feature': available_features,
        'Score': selector.scores_
    }).sort_values('Score', ascending=False)

    print(" Feature Importance Scores:")
    print(feature_scores)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_scores, x='Score', y='Feature', palette='viridis')
    plt.title('Feature Importance Scores (Mutual Information)')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()

    # Select top 6 features and actually restrict the data to them, so the
    # PCA below operates on the selected subset rather than on every feature.
    top_features = feature_scores.head(6)['Feature'].tolist()
    print(f"\nüéØ Selected top 6 features: {top_features}")
    X_selected = X[top_features].to_numpy()

    # 2. Dimensionality Reduction
    print("\n" + "‚îÄ" * 50)
    print("2.  DIMENSIONALITY REDUCTION (PCA)")
    print("‚îÄ" * 50)

    # Apply PCA with all components to inspect the variance profile
    pca = PCA()
    X_pca = pca.fit_transform(X_selected)

    variance_ratio = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(variance_ratio)
    component_numbers = range(1, len(variance_ratio) + 1)

    # Plot explained variance
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.bar(component_numbers, variance_ratio)
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.title('Individual Explained Variance')

    plt.subplot(1, 2, 2)
    plt.plot(component_numbers, cumulative_variance, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Cumulative Explained Variance')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    print(" Explained variance ratio:", variance_ratio)
    print(" Cumulative explained variance:", cumulative_variance)

    # Find optimal number of components: argmax returns the first index at
    # which the cumulative variance reaches 95%.
    n_components = int(np.argmax(cumulative_variance >= 0.95)) + 1
    print(f"\nüéØ Recommended number of components: {n_components} (95% variance explained)")

    # Create reduced dataset
    pca_reduced = PCA(n_components=n_components)
    X_reduced = pca_reduced.fit_transform(X_selected)

    # Create DataFrame with reduced features; the target is attached by
    # position because the PCA output carries a fresh default index.
    reduced_columns = [f'PC{i+1}' for i in range(n_components)]
    df_reduced = pd.DataFrame(X_reduced, columns=reduced_columns)
    df_reduced[target_col] = y.values

    print(f"\n Reduced dataset shape: {df_reduced.shape}")
    print(" PHASE 4 COMPLETED - Data reduced successfully!")
else:
    print(" Cannot perform feature selection without target variable")
    df_reduced = df_transformed

# ==================== PHASE 5: DATA IMBALANCE HANDLING ====================
# Reports the class balance of the target and oversamples the minority class
# of the reduced dataset with SMOTE. Without a target column the phase is
# skipped and df_balanced falls back to the previous phase's frame.

print("\n" + "=" * 70)
print("PHASE 5: DATA IMBALANCE HANDLING")
print("=" * 70)

if target_col and y is not None:
    class_names = {0: 'Non-diabetic', 1: 'Diabetic'}

    # 1. Class Distribution Analysis
    print("\n" + "‚îÄ" * 50)
    print("1.  CLASS DISTRIBUTION ANALYSIS")
    print("‚îÄ" * 50)

    print("Original class distribution:")
    original_dist = df_transformed[target_col].value_counts()
    print(original_dist)

    # value_counts() is ordered by frequency; sort by label so the ratio and
    # the pie labels always refer to the right class, and use positional
    # access so targets not labelled exactly 0/1 do not raise a KeyError.
    original_by_label = original_dist.sort_index()
    if len(original_by_label) > 1:
        imbalance_ratio = original_by_label.iloc[0] / original_by_label.iloc[1]
        print(f" Imbalance Ratio: {imbalance_ratio:.2f}:1")
    else:
        print(" Only one class found in target variable")

    # Visualize original distribution
    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    sns.countplot(data=df_transformed, x=target_col, palette=['skyblue', 'salmon'])
    plt.title('Original Class Distribution')
    plt.xlabel('Class')
    plt.ylabel('Count')

    plt.subplot(1, 2, 2)
    plt.pie(original_by_label,
            labels=[class_names.get(label, str(label)) for label in original_by_label.index],
            autopct='%1.1f%%', colors=['lightblue', 'lightcoral'], startangle=90)
    plt.title('Original Class Distribution (%)')

    plt.tight_layout()
    plt.show()

    # 2. Balancing Techniques
    print("\n" + "‚îÄ" * 50)
    print("2. ‚öñÔ∏è APPLYING SMOTE FOR CLASS BALANCING")
    print("‚îÄ" * 50)

    print(" Applying SMOTE (Synthetic Minority Over-sampling Technique)...")

    # Prepare features for balancing. df_reduced is always defined by Phase 4
    # and, in this branch, holds the principal components plus the target.
    X_balance = df_reduced.drop(columns=[target_col])
    y_balance = df_transformed[target_col]

    # Apply SMOTE
    # NOTE(review): the whole dataset is resampled here. If the output is
    # later split into train/test sets, synthetic points derived from test
    # rows can leak into training; consider resampling the training fold only.
    smote = SMOTE(random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X_balance, y_balance)

    print(" SMOTE applied successfully!")

    # Create balanced dataset
    df_balanced = pd.DataFrame(X_balanced, columns=X_balance.columns)
    df_balanced[target_col] = y_balanced

    print("\nBalanced class distribution:")
    balanced_dist = df_balanced[target_col].value_counts()
    print(balanced_dist)

    # Visualize balanced distribution
    balanced_by_label = balanced_dist.sort_index()

    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    sns.countplot(data=df_balanced, x=target_col, palette=['lightgreen', 'lightcoral'])
    plt.title('Balanced Class Distribution (After SMOTE)')
    plt.xlabel('Class')
    plt.ylabel('Count')

    plt.subplot(1, 2, 2)
    plt.pie(balanced_by_label,
            labels=[class_names.get(label, str(label)) for label in balanced_by_label.index],
            autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'], startangle=90)
    plt.title('Balanced Class Distribution (%)')

    plt.tight_layout()
    plt.show()

    print(f"\n Balanced dataset shape: {df_balanced.shape}")
    print(" PHASE 5 COMPLETED - Class imbalance handled successfully!")
else:
    print(" Cannot perform imbalance handling without target variable")
    df_balanced = df_reduced if 'df_reduced' in locals() else df_transformed

# ==================== FINAL OUTPUT & SUMMARY ====================
# Writes every intermediate frame to CSV and prints a summary of the run:
# shapes per phase, target distribution, derived features, a preview of the
# final frame, and a short data dictionary.

print("\n" + "=" * 70)
print("FINAL OUTPUT & SUMMARY")
print("=" * 70)

# Save all processed datasets
print(" Saving processed datasets...")

df_cleaned.to_csv('diabetes_cleaned.csv', index=False)
df_transformed.to_csv('diabetes_transformed.csv', index=False)

# df_reduced and df_balanced are assigned on every path through Phases 4 and
# 5 (they fall back to the previous frame when a phase is skipped), so they
# can be used directly.
df_reduced.to_csv('diabetes_reduced.csv', index=False)
df_balanced.to_csv('diabetes_balanced_final.csv', index=False)
final_df = df_balanced

print(" All datasets saved successfully!")

# Generate comprehensive summary
print("\n" + "‚îÄ" * 50)
print(" COMPREHENSIVE PROCESSING SUMMARY")
print("‚îÄ" * 50)

print("DATASET TRANSFORMATION JOURNEY:")
print(f"   Phase 1 - Original: {df.shape}")
print(f"   Phase 2 - Cleaned: {df_cleaned.shape}")
print(f"   Phase 3 - Transformed: {df_transformed.shape}")
print(f"   Phase 4 - Reduced: {df_reduced.shape}")
print(f"   Phase 5 - Balanced: {df_balanced.shape}")

if target_col:
    print("\nTARGET VARIABLE TRANSFORMATION:")
    print(f"   Original: {df[target_col].value_counts().to_dict()}")
    print(f"   Final Balanced: {df_balanced[target_col].value_counts().to_dict()}")

print("\nFEATURES CREATED:")
new_features = [col for col in final_df.columns if col not in df.columns]
for feature in new_features:
    print(f"   - {feature}")

print("\nDATA QUALITY IMPROVEMENTS:")
print("   ‚úÖ Missing values (zeros) identified and handled")
print("   ‚úÖ Outliers detected and treated")
print("   ‚úÖ New informative features created")
print("   ‚úÖ Features scaled for machine learning")
if target_col:
    # Feature selection, PCA and SMOTE all require the target column and are
    # skipped without it, so only claim them when they actually ran.
    print("   ‚úÖ Feature selection performed")
    print("   ‚úÖ Dimensionality reduction applied")
    print("   ‚úÖ Class imbalance addressed with SMOTE")

# Display final dataset preview
print("\n" + "‚îÄ" * 50)
print("üîç FINAL PROCESSED DATASET PREVIEW")
print("‚îÄ" * 50)

print("First 5 rows of final processed data:")
print(final_df.head())

print("\nFinal dataset info:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line.
final_df.info()

# Data Dictionary
print("\n" + "‚îÄ" * 50)
print(" DATA DICTIONARY")
print("‚îÄ" * 50)

data_dict = {
    'Pregnancies': 'Number of times pregnant',
    'Glucose': 'Plasma glucose concentration (mg/dL)',
    'BloodPressure': 'Diastolic blood pressure (mm Hg)',
    'SkinThickness': 'Triceps skin fold thickness (mm)',
    'Insulin': '2-Hour serum insulin (mu U/ml)',
    'BMI': 'Body mass index (kg/m¬≤)',
    'DiabetesPedigreeFunction': 'Diabetes pedigree function',
    'Age': 'Age in years',
    'Outcome': 'Target variable (0 = non-diabetic, 1 = diabetic)',
    'Age_Group': 'Categorical age groups (Young, Middle-aged, Senior, Elderly)',
    'BMI_Category': 'BMI classification (Underweight, Normal, Overweight, Obese)',
    'Glucose_Category': 'Glucose level classification'
}

# Only describe columns that survive into the final frame
for feature, description in data_dict.items():
    if feature in final_df.columns:
        print(f"{feature}: {description}")

print("\n Final Dataset Ready for Machine Learning!")
print(" Output files:")
print("   - diabetes_cleaned.csv (Phase 2 output)")
print("   - diabetes_transformed.csv (Phase 3 output)")
print("   - diabetes_reduced.csv (Phase 4 output)")
print("   - diabetes_balanced_final.csv (Phase 5 output - Recommended)")

print(f"Final shape: {final_df.shape}")
if target_col:
    print(f"Target variable: '{target_col}' (0=Non-diabetic, 1=Diabetic)")

print("\n" + "" * 20)
print(" 5-PHASE DATA PROCESSING COMPLETED SUCCESSFULLY!")
print(" " * 20)
