In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Machine Learning
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)

import warnings
warnings.filterwarnings('ignore')

# Reproducibility: NumPy and TensorFlow both get the same fixed seed
np.random.seed(42)
tf.random.set_seed(42)

# Plot defaults: seaborn white grid plus a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Environment report (the GPU entry is an empty list when TensorFlow sees no GPU device)
gpu_devices = tf.config.list_physical_devices('GPU')
print(
    f"TensorFlow version: {tf.__version__}",
    f"GPU available: {gpu_devices}",
    "Libraries imported successfully",
    sep="\n",
)

## 1. Load Preprocessed Data

In [None]:
# Restore the six objects stored by the previous notebook, in the order they were saved.
# NOTE: joblib.load unpickles the file, so only point it at artifacts you produced yourself.
(
    X_train_scaled, X_val_scaled, X_test_scaled,
    y_train, y_val, y_test,
) = joblib.load('../data/processed_datasets.pkl')

# Shape report for the three feature splits
print(
    f"Train set: {X_train_scaled.shape}",
    f"Validation set: {X_val_scaled.shape}",
    f"Test set: {X_test_scaled.shape}",
    sep="\n",
)
print(f"\nNumber of features: {X_train_scaled.shape[1]}")

# Convert to numpy arrays for TensorFlow (same order: train, validation, test)
X_train_np, X_val_np, X_test_np = (
    feature_frame.values
    for feature_frame in (X_train_scaled, X_val_scaled, X_test_scaled)
)

print("\nData converted to numpy arrays")

## 2. Build Autoencoder Architecture

The autoencoder will compress the input features into a lower-dimensional latent space,
learning meaningful representations that capture the essential patterns in the data.

In [None]:
# Network sizes: the input width comes from the data, the bottleneck width is fixed
input_dim = X_train_np.shape[1]
encoding_dim = 10  # Latent dimension

print(f"Input dimension: {input_dim}")
print(f"Encoding dimension: {encoding_dim}")


def _relu_block(tensor, units, layer_name):
    """Stack Dense(relu) -> BatchNormalization -> Dropout(0.2) on top of ``tensor``.

    Only the Dense layer is given an explicit name; the other two keep the
    names Keras generates for them.
    """
    tensor = layers.Dense(units, activation='relu', name=layer_name)(tensor)
    tensor = layers.BatchNormalization()(tensor)
    return layers.Dropout(0.2)(tensor)


# Encoder: input -> 32 -> 16 -> latent
input_layer = layers.Input(shape=(input_dim,), name='input')
encoded = input_layer
for block_units, block_name in ((32, 'encoder_layer1'), (16, 'encoder_layer2')):
    encoded = _relu_block(encoded, block_units, block_name)
latent = layers.Dense(encoding_dim, activation='relu', name='latent_space')(encoded)

# Decoder: latent -> 16 -> 32 -> linear reconstruction of the input width
decoded = latent
for block_units, block_name in ((16, 'decoder_layer1'), (32, 'decoder_layer2')):
    decoded = _relu_block(decoded, block_units, block_name)
output_layer = layers.Dense(input_dim, activation='linear', name='output')(decoded)

# Two models over the same layer graph: the full autoencoder (input -> reconstruction)
# and the encoder alone (input -> latent), used for extracting latent features
autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder')
encoder = Model(inputs=input_layer, outputs=latent, name='encoder')

print("\nAutoencoder architecture created")
autoencoder.summary()

## 3. Train the Autoencoder

In [None]:
# Optimisation setup: Adam minimising MSE, with MAE tracked as an extra metric
adam_optimizer = keras.optimizers.Adam(learning_rate=0.001)
autoencoder.compile(optimizer=adam_optimizer, loss='mse', metrics=['mae'])

# Both callbacks watch the validation loss; the LR scheduler's patience (10)
# is shorter than the early-stopping patience (20)
watched_quantity = 'val_loss'
early_stopping = EarlyStopping(
    monitor=watched_quantity, patience=20, restore_best_weights=True, verbose=1,
)
reduce_lr = ReduceLROnPlateau(
    monitor=watched_quantity, factor=0.5, patience=10, min_lr=1e-6, verbose=1,
)
training_callbacks = [early_stopping, reduce_lr]

print("Training autoencoder...\n")

# The network is trained to reproduce its own input, so the features are passed
# as both inputs and targets (for the training and the validation data alike)
history = autoencoder.fit(
    X_train_np,
    X_train_np,
    validation_data=(X_val_np, X_val_np),
    epochs=200,
    batch_size=32,
    callbacks=training_callbacks,
    verbose=0,
)

print("\n✓ Autoencoder training completed")

In [None]:
# Plot training history: one panel per tracked quantity (loss on the left, MAE on the right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

history_panels = (
    # (history key, legend suffix, y-axis label, panel title)
    ('loss', 'Loss', 'Loss (MSE)', 'Autoencoder Training Loss'),
    ('mae', 'MAE', 'MAE', 'Autoencoder Training MAE'),
)
for ax, (metric_key, legend_suffix, y_label, panel_title) in zip(axes, history_panels):
    ax.plot(history.history[metric_key], label=f'Train {legend_suffix}', linewidth=2)
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {legend_suffix}', linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/autoencoder_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

# Last-epoch values, as recorded in the history
val_loss_history = history.history['val_loss']
print(f"\nFinal train loss: {history.history['loss'][-1]:.4f}")
print(f"Final validation loss: {val_loss_history[-1]:.4f}")

# The training cell sets restore_best_weights=True on EarlyStopping, so the last
# epoch is not necessarily the one whose weights the model keeps. Report the best
# validation epoch as well, so the summary is not read off a discarded epoch.
best_epoch = int(np.argmin(val_loss_history))
print(
    f"Best validation loss: {val_loss_history[best_epoch]:.4f} "
    f"(epoch {best_epoch + 1} of {len(val_loss_history)})"
)

## 4. Extract Latent Features

In [None]:
# Run every split through the encoder (train, validation, test, in that order)
X_train_encoded, X_val_encoded, X_test_encoded = (
    encoder.predict(split_array, verbose=0)
    for split_array in (X_train_np, X_val_np, X_test_np)
)

# Compare the width of the original and the latent feature space
n_original_features = X_train_np.shape[1]
n_latent_features = X_train_encoded.shape[1]
print(f"Original feature space: {n_original_features} dimensions")
print(f"Encoded feature space: {n_latent_features} dimensions")
print(f"Dimensionality reduction: {(1 - n_latent_features/n_original_features)*100:.1f}%")

# Wrap the latent arrays in DataFrames with columns latent_1 ... latent_<encoding_dim>
encoded_columns = [f'latent_{position}' for position in range(1, encoding_dim + 1)]
X_train_encoded_df, X_val_encoded_df, X_test_encoded_df = (
    pd.DataFrame(latent_array, columns=encoded_columns)
    for latent_array in (X_train_encoded, X_val_encoded, X_test_encoded)
)

print("\nLatent features extracted successfully")

In [None]:
# Scatter the training samples over the first two latent dimensions, twice:
# once coloured through a colormap of the label, once as one series per class
latent_x = X_train_encoded[:, 0]
latent_y = X_train_encoded[:, 1]

plt.figure(figsize=(12, 5))

# Left panel: label mapped through a colormap, with a colorbar
plt.subplot(1, 2, 1)
scatter = plt.scatter(latent_x, latent_y, c=y_train, cmap='RdYlGn_r', alpha=0.6, s=50)
plt.colorbar(scatter, label='Diabetes (0=Healthy, 1=Diabetic)')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.title('Latent Space Visualization (First 2 Dimensions)', fontsize=12, fontweight='bold')
plt.grid(alpha=0.3)

# Right panel: one scatter series per class so the legend can name them
plt.subplot(1, 2, 2)
for class_value, class_label, class_color in ((0, 'Healthy', 'green'), (1, 'Diabetic', 'red')):
    plt.scatter(X_train_encoded[y_train==class_value, 0], X_train_encoded[y_train==class_value, 1],
                alpha=0.6, label=class_label, s=50, color=class_color)
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.title('Latent Space by Class', fontsize=12, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/latent_space_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Train XGBoost on Latent Features (Hybrid Model)

In [None]:
# Train XGBoost on encoded features
print("Training Hybrid Model (Autoencoder + XGBoost)...\n")

start_time = time.time()

# Calculate scale_pos_weight for class imbalance: negatives / positives.
# Fail loudly when there are no positives instead of passing inf/NaN to XGBoost.
n_positive = (y_train == 1).sum()
if n_positive == 0:
    raise ValueError(
        "y_train contains no positive (label 1) samples; cannot compute scale_pos_weight"
    )
scale_pos_weight = (y_train == 0).sum() / n_positive

hybrid_model = xgb.XGBClassifier(
    n_estimators=200,  # upper bound on boosting rounds; early stopping may use fewer
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
    # The original cell announced early stopping but never configured it, so the
    # eval_set below was only monitored. Stop once validation logloss has not
    # improved for 20 consecutive rounds.
    # NOTE(review): constructor-level early_stopping_rounds needs xgboost >= 1.6,
    # the same API generation as the constructor-level eval_metric used here.
    early_stopping_rounds=20
)

# Train with early stopping, monitored on the validation split.
# NOTE(review): validation metrics reported later are mildly optimistic, because the
# same split selects the number of trees; the test split stays untouched.
hybrid_model.fit(
    X_train_encoded, y_train,
    eval_set=[(X_val_encoded, y_val)],
    verbose=False
)

training_time = time.time() - start_time

print(f"✓ Hybrid model training completed in {training_time:.2f} seconds")

# best_iteration is a zero-based round index; it is only reported when available
best_iteration = getattr(hybrid_model, 'best_iteration', None)
if best_iteration is not None:
    print(f"  Best boosting round on validation logloss: {best_iteration + 1} of {hybrid_model.n_estimators}")

## 6. Evaluate Hybrid Model

In [None]:
# Hard class predictions for the train, validation and test latent features (in that order)
y_train_pred_hybrid, y_val_pred_hybrid, y_test_pred_hybrid = (
    hybrid_model.predict(latent_features)
    for latent_features in (X_train_encoded, X_val_encoded, X_test_encoded)
)

# Matching probabilities: column 1 of predict_proba, used here as the positive-class score
y_train_proba_hybrid, y_val_proba_hybrid, y_test_proba_hybrid = (
    hybrid_model.predict_proba(latent_features)[:, 1]
    for latent_features in (X_train_encoded, X_val_encoded, X_test_encoded)
)

# Evaluation function
def evaluate_model(y_true, y_pred, y_proba, model_name, dataset_name):
    """Print and return binary-classification metrics for one model on one data split.

    Labels are taken to be 0 (reported as 'Healthy') and 1 (reported as 'Diabetic').

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted hard labels, aligned with ``y_true``.
        y_proba: Predicted scores for the positive class, used only for ROC-AUC.
        model_name: Model name shown in the printed header.
        dataset_name: Split name shown in the printed header (e.g. "Train").

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1' and 'roc_auc'.

    Raises:
        ValueError: propagated from ``roc_auc_score`` when ``y_true`` contains a
            single class, because ROC-AUC is undefined in that case.
    """
    # Pin the label set so the confusion matrix is always 2x2 and the report's
    # target_names always line up, even when a split or the predictions contain
    # only one of the two classes.
    class_labels = [0, 1]

    print(f"\n{'='*60}")
    print(f"{model_name} - {dataset_name} Set")
    print('='*60)

    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the score at 0.0 when a metric is undefined (e.g. no
    # positive predictions). This matches the default result but states it explicitly.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_true, y_proba)

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"ROC-AUC:   {roc_auc:.4f}")

    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    print(cm)

    print(f"\nClassification Report:")
    print(classification_report(
        y_true, y_pred,
        labels=class_labels,
        target_names=['Healthy', 'Diabetic'],
        zero_division=0,
    ))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }

# Score the hybrid model on each split; evaluation runs in train, validation, test order
hybrid_train_metrics, hybrid_val_metrics, hybrid_test_metrics = (
    evaluate_model(split_labels, split_predictions, split_probabilities, "Hybrid Model", split_name)
    for split_labels, split_predictions, split_probabilities, split_name in (
        (y_train, y_train_pred_hybrid, y_train_proba_hybrid, "Train"),
        (y_val, y_val_pred_hybrid, y_val_proba_hybrid, "Validation"),
        (y_test, y_test_pred_hybrid, y_test_proba_hybrid, "Test"),
    )
)

## 7. Compare with Baseline Models

In [None]:
# Read the baseline results table from the reports folder
baseline_comparison = pd.read_csv('../reports/baseline_models_comparison.csv')

# One row per split for the hybrid model; the metric dict supplies the remaining columns
hybrid_results = pd.DataFrame([
    {'Model': 'Hybrid (AE+XGB)', 'Set': split_name, **split_metrics}
    for split_name, split_metrics in (
        ('Train', hybrid_train_metrics),
        ('Validation', hybrid_val_metrics),
        ('Test', hybrid_test_metrics),
    )
])

# Stack baseline and hybrid rows into one table with a fresh index
full_comparison = pd.concat([baseline_comparison, hybrid_results], ignore_index=True)

banner = "="*90
print("\n" + banner)
print("COMPLETE MODEL COMPARISON (INCLUDING HYBRID)")
print(banner)
print(full_comparison.to_string(index=False))

# Write the combined table next to the baseline one
full_comparison.to_csv('../reports/all_models_comparison.csv', index=False)
print("\n✓ Comparison saved to ../reports/all_models_comparison.csv")

In [None]:
# Visualize comparison: grouped metric bars (left) and ROC curves (right) on the test set
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Filter test set results
test_results = full_comparison[full_comparison['Set'] == 'Test']

# Plot 1: Bar chart of all metrics
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
x = np.arange(len(metrics))
width = 0.25


def _test_scores(model_name):
    """Return the test-set values of ``metrics`` for ``model_name`` as a 1-D array.

    Raises ValueError with the model name when the comparison table has no such
    row, rather than an anonymous IndexError from ``.values[0]``.
    """
    rows = test_results.loc[test_results['Model'] == model_name, metrics]
    if rows.empty:
        raise ValueError(f"No test-set row for model {model_name!r} in the comparison table")
    return rows.values[0]


rf_scores = _test_scores('Random Forest')
xgb_scores = _test_scores('XGBoost')
hybrid_scores = _test_scores('Hybrid (AE+XGB)')

# One bar per model inside each metric group, shifted left / centre / right
bar_groups = (
    (-width, rf_scores, 'Random Forest'),
    (0, xgb_scores, 'XGBoost'),
    (width, hybrid_scores, 'Hybrid (AE+XGB)'),
)
for offset, model_scores, model_label in bar_groups:
    axes[0].bar(x + offset, model_scores, width, label=model_label, alpha=0.8)
axes[0].set_xlabel('Metrics', fontsize=12)
axes[0].set_ylabel('Score', fontsize=12)
axes[0].set_title('Test Set Performance: All Models', fontsize=14, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels([m.upper() for m in metrics], rotation=45)
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)
axes[0].set_ylim([0, 1.1])

# Plot 2: ROC Curves comparison
# Load baseline models (joblib.load unpickles, so these must be trusted project artifacts)
xgb_baseline = joblib.load('../data/xgboost_model.pkl')
rf_baseline = joblib.load('../data/random_forest_model.pkl')

# The baselines score the scaled original features; the hybrid probabilities were
# computed earlier from the latent features
y_test_proba_rf = rf_baseline.predict_proba(X_test_scaled)[:, 1]
y_test_proba_xgb = xgb_baseline.predict_proba(X_test_scaled)[:, 1]

# The AUC in each legend entry is computed from the same probabilities that draw the
# curve, so the label cannot drift from the plot if the CSV was written by a
# different run of the baseline notebook.
roc_series = (
    # (legend name, positive-class probabilities, line width)
    ('Random Forest', y_test_proba_rf, 2),
    ('XGBoost', y_test_proba_xgb, 2),
    ('Hybrid', y_test_proba_hybrid, 2.5),
)
for curve_label, probabilities, line_width in roc_series:
    fpr, tpr, _ = roc_curve(y_test, probabilities)
    curve_auc = roc_auc_score(y_test, probabilities)
    axes[1].plot(fpr, tpr, label=f'{curve_label} (AUC={curve_auc:.3f})', linewidth=line_width)
axes[1].plot([0, 1], [0, 1], 'k--', label='Random Guess', linewidth=1)
axes[1].set_xlabel('False Positive Rate', fontsize=12)
axes[1].set_ylabel('True Positive Rate', fontsize=12)
axes[1].set_title('ROC Curves Comparison - Test Set', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/hybrid_model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pair each latent column with the importance XGBoost reports for it, most important first
importance_table = pd.DataFrame({
    'feature': encoded_columns,
    'importance': hybrid_model.feature_importances_
})
feature_importance_hybrid = importance_table.sort_values('importance', ascending=False)

# Horizontal bars; the y-axis is flipped so the top-ranked feature sits at the top
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_hybrid['feature'], feature_importance_hybrid['importance'])
plt.gca().invert_yaxis()
plt.title('Feature Importance in Hybrid Model (Latent Features)',
          fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Latent Feature', fontsize=12)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('../reports/hybrid_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nLatent Feature Importance:")
print(feature_importance_hybrid)

## 8. Analysis: Dimensionality Reduction Benefits

In [None]:
# Reconstruction error analysis
X_train_reconstructed = autoencoder.predict(X_train_np, verbose=0)
X_test_reconstructed = autoencoder.predict(X_test_np, verbose=0)

# Per-sample error: mean over the feature axis of the squared input/reconstruction difference
train_mse = np.mean((X_train_np - X_train_reconstructed) ** 2, axis=1)
test_mse = np.mean((X_test_np - X_test_reconstructed) ** 2, axis=1)

# (label value, display name, colour) shared by the histogram and the printed statistics
class_styles = ((0, 'Healthy', 'green'), (1, 'Diabetic', 'red'))

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: Reconstruction error by class (training split)
for class_value, class_name, class_color in class_styles:
    axes[0].hist(train_mse[y_train == class_value], bins=30, alpha=0.6,
                 label=class_name, color=class_color)
axes[0].set_xlabel('Reconstruction Error (MSE)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Reconstruction Error Distribution by Class', fontsize=12, fontweight='bold')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Plot 2: Dimensionality comparison
dimensions = ['Original\n('+str(input_dim)+' features)',
              'Encoded\n('+str(encoding_dim)+' features)']
dim_values = [input_dim, encoding_dim]
colors = ['skyblue', 'orange']

bars = axes[1].bar(dimensions, dim_values, color=colors, alpha=0.7)
axes[1].set_ylabel('Number of Features', fontsize=12)
axes[1].set_title('Feature Space Dimensionality', fontsize=12, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height)}', ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/autoencoder_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nReconstruction Error Statistics (Train):")
for class_value, class_name, _ in class_styles:
    class_errors = train_mse[y_train == class_value]
    print(f"{class_name} - Mean: {class_errors.mean():.4f}, Std: {class_errors.std():.4f}")

# The test-split error was computed above but never reported. Print it next to the
# train figure so a gap between the two (autoencoder overfitting) is visible.
print(f"\nOverall Reconstruction Error (mean per-sample MSE):")
print(f"Train: {train_mse.mean():.4f}, Test: {test_mse.mean():.4f}")

## 9. Save Hybrid Model Components

In [None]:
# Persist the two Keras models to .h5 files
for keras_model, target_path in (
    (autoencoder, '../data/autoencoder_model.h5'),
    (encoder, '../data/encoder_model.h5'),
):
    keras_model.save(target_path)

# The XGBoost classifier and the latent-space splits go through joblib
joblib.dump(hybrid_model, '../data/hybrid_xgboost_model.pkl')
encoded_splits = (X_train_encoded, X_val_encoded, X_test_encoded)
joblib.dump(encoded_splits, '../data/encoded_datasets.pkl')

# Tell the reader where each artifact went
print(
    "✓ Hybrid model components saved:",
    "  - Autoencoder: ../data/autoencoder_model.h5",
    "  - Encoder: ../data/encoder_model.h5",
    "  - XGBoost: ../data/hybrid_xgboost_model.pkl",
    "  - Encoded data: ../data/encoded_datasets.pkl",
    sep="\n",
)

## Summary

### Hybrid Model Performance:
The autoencoder reduces the original feature space to a compact latent representation (`encoding_dim = 10`),
with the aim of maintaining predictive performance; the test-set comparison in Section 7 shows whether that aim was met.

### Key Benefits:
1. **Dimensionality Reduction**: Compressed features for more efficient modeling
2. **Feature Learning**: Learned meaningful representations automatically
3. **Noise Reduction**: Autoencoder filtering can reduce noise in the data
4. **Interpretability**: Latent features capture essential patterns

### Comparison with Baselines:
- Section 7 compares the hybrid model's performance with the baseline Random Forest and XGBoost models on the original features
- Trade-off between model complexity and performance
- Computational efficiency gained from reduced dimensionality

✓ Part 2.2 (Hybrid Model) completed!