# 🧠 W03 — Dual Attention BiLSTM for RUL Prediction
**Objective**: Train and evaluate the Dual-Attention BiLSTM model (Section III-C2) on M1.

**Architecture**: Input → Feature Attention → BiLSTM → Temporal Attention → Dense → RUL

**Author**: Fatima Khadija Benzine  
**Date**: 22 February 2026

---
## 0. Setup

In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch


# Resolve the repository root (the parent of the current working directory)
# and push it, then its `src/` folder, onto the front of the import path.
# Because `src/` is inserted last, it ends up ahead of the root on sys.path.
project_root = Path().resolve().parent
for import_dir in (project_root, project_root / 'src'):
    sys.path.insert(0, str(import_dir))

from data_loader import MultiDatasetLoader
from preprocessing import PreprocessingPipelineBI, create_sliding_windows, evaluate_per_unit
from attention import (
    build_dual_attention_bilstm,
    extract_attention_weights,
    save_attention_weights,
)

import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {len(tf.config.list_physical_devices('GPU')) > 0}")

plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

print("All modules imported âœ“")

TensorFlow version: 2.20.0
GPU available: False
All modules imported âœ“


---
## 1. Prepare Data

In [2]:
# Config — notebook-wide settings read by the cells below
DATASET = 'FD001'  # dataset name handed to the loader and to the BI fusion step
W = 30  # window size
FEATURE_SELECTION = 'correlation'  # <-- 'correlation', 'aficv', or 'sensor_only'

# Load the selected dataset; `ds` is later indexed with 'train' and 'test'
loader = MultiDatasetLoader()
ds = loader.load_cmapss_dataset(DATASET)

Loading FD001 dataset...
  Files: train=True, test=True, rul=True
  - Training data shape: (20631, 26)
  - Training units: 100
  - Training RUL range: [0, 361]
  - Test data shape: (13096, 26)
  - RUL values shape: (100, 1)
  - Test units found: 100 (units: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]...)
  - RUL values provided: 100
    Unit 1: max_cycle=31, base_RUL=112
    Unit 2: max_cycle=49, base_RUL=98
    Unit 3: max_cycle=126, base_RUL=69
âœ“ FD001 loaded: 20631 train, 13096 test samples


In [3]:
from bi_fusion import BIFusionPipeline, CONTINUOUS_BI_VARS
from feature_selection import BIAwareFeatureSelector
from feature_selection_aficv import AFICvFeatureSelector

# Non-feature columns: excluded from feature selection and carried through
# next to the selected features.
meta_cols = ['unit', 'cycle', 'rul']

# --- Common preprocessing (Steps 0-3b) ---
# Work on copies so the frames stored in `ds` are not modified.
train_raw = ds['train'].copy()
test_raw = ds['test'].copy()
# Cap the RUL target at 125. The test frame is only capped when it actually
# carries a 'rul' column.
train_raw['rul'] = train_raw['rul'].clip(upper=125)
if 'rul' in test_raw.columns:
    test_raw['rul'] = test_raw['rul'].clip(upper=125)

# Column groups are identified purely by name prefix.
sensor_cols = [c for c in train_raw.columns if c.startswith('sensor_')]
setting_cols = [c for c in train_raw.columns if c.startswith('setting_')]

from preprocessing import DataNormalizer
# Fit the 'minmax' normalizer on the training split only, then apply the
# fitted normalizer to the test split.
norm = DataNormalizer(method='minmax')
train_norm = norm.fit_transform(train_raw, sensor_cols + setting_cols)
test_norm = norm.transform(test_raw)

# BI fusion is run once per split; the BI column list is read from the fused
# training frame.
fusion = BIFusionPipeline()
train_fused = fusion.fuse(train_norm, DATASET, split='train', encode_categoricals=True)
test_fused = fusion.fuse(test_norm, DATASET, split='test', encode_categoricals=True)
bi_cols = fusion.get_bi_columns(train_fused)

# Second normalizer for the continuous BI variables that are present after
# fusion; again fitted on train and re-applied to test.
bi_cont = [c for c in CONTINUOUS_BI_VARS if c in train_fused.columns]
bi_norm = DataNormalizer(method='minmax')
train_fused = bi_norm.fit_transform(train_fused, bi_cont)
test_fused = bi_norm.transform(test_fused)

print(f"Fused shape: {train_fused.shape} train, {test_fused.shape} test")


=== BI Fusion: FD001 (train) ===
  Sensor data: (20631, 27)
  BI data loaded: 20631 rows, 100 units
  Fused data: (20631, 44)
  Features: 21 sensor + 17 BI

=== BI Fusion: FD001 (test) ===
  Sensor data: (13096, 27)
  BI data loaded: 20648 rows, 100 units
  Fused data: (13096, 44)
  Features: 21 sensor + 17 BI
Fused shape: (20631, 44) train, (13096, 44) test


In [4]:
# --- Feature Selection ---
# Keyword arguments that every selector call below has in common.
selection_inputs = dict(
    data=train_fused, sensor_cols=sensor_cols,
    bi_cols=bi_cols, setting_cols=setting_cols,
)
# Name prefixes of sensor / operating-setting columns; anything else counts as BI.
SENSOR_PREFIXES = ('sensor_', 'setting_')

if FEATURE_SELECTION == 'aficv':
    print("=== AFICv Stratified (90%) feature selection ===")
    selector = AFICvFeatureSelector(
        base_learner='xgboost', n_folds=5, cumulative_threshold=0.90,
    )
    feature_names = selector.select_features_stratified(
        **selection_inputs, target_col='rul', group_col='unit',
    )
    train_sel = selector.transform(train_fused, keep_cols=meta_cols)
    test_sel = selector.transform(test_fused, keep_cols=meta_cols)

elif FEATURE_SELECTION == 'correlation':
    print("=== Correlation-based feature selection ===")
    selector = BIAwareFeatureSelector(variance_threshold=0.01, correlation_threshold=0.95)
    feature_names = selector.select_features(**selection_inputs, exclude_cols=meta_cols)
    train_sel = selector.transform(train_fused, keep_cols=meta_cols)
    test_sel = selector.transform(test_fused, keep_cols=meta_cols)

elif FEATURE_SELECTION == 'sensor_only':
    print("=== Sensor-only (no BI) â€” ablation baseline ===")
    selector = BIAwareFeatureSelector(variance_threshold=0.01, correlation_threshold=0.95)
    # The selector is run only for its side effect of populating
    # `selected_features`; its return value is not used in this branch.
    selector.select_features(**selection_inputs, exclude_cols=meta_cols)
    # Keep the surviving sensor/setting columns and discard every BI column.
    feature_names = [
        name for name in selector.selected_features
        if name.startswith(SENSOR_PREFIXES)
    ]
    kept_columns = meta_cols + feature_names
    train_sel = train_fused[kept_columns].copy()
    test_sel = test_fused[kept_columns].copy()

else:
    raise ValueError(f"Unknown selection method: {FEATURE_SELECTION}")

# Break the final feature list down into sensor/setting vs. BI counts.
n_features = len(feature_names)
n_sensor = sum(name.startswith(SENSOR_PREFIXES) for name in feature_names)
n_bi = n_features - n_sensor

print(f"\nMethod: {FEATURE_SELECTION}")
print(f"Features ({n_features}): {n_sensor} sensor/setting + {n_bi} BI")
print(f"Selected: {feature_names}")

=== Correlation-based feature selection ===

=== BI-Aware Feature Selection ===
  Input: 21 sensor + 17 BI + 3 setting = 41 total
  Variance filter (sensor/settings only):
    Removed 9: ['sensor_1', 'sensor_5', 'sensor_9', 'sensor_10', 'sensor_14', 'sensor_16', 'sensor_18', 'sensor_19', 'setting_3']
    Kept 15 sensor/setting features
    BI features: 17 (all exempt, all kept)
  Correlation filter (tau=0.95):
    Removed 0:
  Final: 32 features (15 sensor/setting + 17 BI)

Method: correlation
Features (32): 15 sensor/setting + 17 BI
Selected: ['sensor_2', 'sensor_3', 'sensor_4', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_15', 'sensor_17', 'sensor_20', 'sensor_21', 'setting_1', 'setting_2', 'pm_cost', 'cm_cost', 'labor_rate_standard', 'labor_rate_overtime', 'downtime_penalty', 'revenue_per_hour', 'spare_parts_available', 'spare_parts_lead_time', 'technician_available', 'maintenance_window', 'contract_penalty_active', 'production_priority_0', 'pro

In [5]:
# Sliding windows
PAD = False  # <-- SWITCH: True (all samples, zero-padded) or False (skip first W-1)

# Train and test are windowed with the same settings; only the input frame differs.
window_kwargs = dict(window_size=W, feature_cols=feature_names, target_col='rul', pad=PAD)
X_train, y_train = create_sliding_windows(train_sel, **window_kwargs)
X_test, y_test = create_sliding_windows(test_sel, **window_kwargs)

# Shape check for both splits.
print(f"\nX_train: {X_train.shape}  y_train: {y_train.shape}")
print(f"X_test:  {X_test.shape}  y_test:  {y_test.shape}")



[Sliding Window] W=30, features=32
  Units: 100 total, 0 padded, 0 excluded
  Output: X=(17731, 30, 32), y=(17731,)

[Sliding Window] W=30, features=32
  Units: 100 total, 0 padded, 0 excluded
  Output: X=(10196, 30, 32), y=(10196,)

X_train: (17731, 30, 32)  y_train: (17731,)
X_test:  (10196, 30, 32)  y_test:  (10196,)


---
## 2. Build Model

In [None]:
# Hyperparameters of the dual-attention BiLSTM, collected in one mapping so
# they can be inspected or logged alongside the results.
model_config = {
    'window_size': W,
    'n_features': n_features,
    'lstm_units': 64,
    'feature_attention_dim': 32,
    'temporal_attention_dim': 64,
    'dropout_rate': 0.3,
    'dense_units': 32,
    'learning_rate': 0.001,
}

# The builder returns two objects: `model` is trained and used for prediction,
# `attn_model` is later handed to extract_attention_weights().
model, attn_model = build_dual_attention_bilstm(**model_config)

model.summary()

---
## 3. Train

In [None]:
# Callbacks
# Stop after 10 epochs without a better validation loss and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)
# Halve the learning rate after 5 stagnant epochs, never going below 1e-6.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6
)
callbacks = [early_stopping, lr_on_plateau]

# 20% of the training windows are held out for validation by Keras itself.
fit_options = dict(
    epochs=100,
    batch_size=256,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Training curves
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per tracked quantity: (history key, y-axis label, panel title).
# The validation series is stored under the same key prefixed with 'val_'.
curve_panels = [
    ('loss', 'MSE Loss', 'Loss'),
    ('mae', 'MAE', 'Mean Absolute Error'),
]
for panel_ax, (metric_key, y_label, panel_title) in zip(axes, curve_panels):
    panel_ax.plot(history.history[metric_key], label='Train')
    panel_ax.plot(history.history['val_' + metric_key], label='Validation')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.legend()

plt.suptitle('Dual-Attention BiLSTM Training â€” M1', fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

---
## 4. Evaluate

In [None]:

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Run the trained network over every test window and flatten the result into
# a 1-D prediction vector.
raw_predictions = model.predict(X_test, batch_size=256)
y_pred = raw_predictions.flatten()

# Per-unit standard evaluation
eval_args = dict(y_true=y_test, y_pred=y_pred, df=test_sel, window_size=W, pad=PAD)
results = evaluate_per_unit(**eval_args)


In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_scatter, ax_errors = axes

# Scatter (per-unit last prediction)
ax_scatter.scatter(results['true_last'], results['preds_last'], alpha=0.6, s=20)
# Diagonal reference line; 125 matches the cap applied to the RUL target
# during preprocessing.
ax_scatter.plot([0, 125], [0, 125], 'r--', linewidth=1.5, label='Perfect')
ax_scatter.set_xlabel('True RUL')
ax_scatter.set_ylabel('Predicted RUL')
ax_scatter.set_title(f"Last-window (RMSE={results['rmse_last']:.2f}, Score={results['score_last']:.0f})", fontweight='bold')
ax_scatter.legend()
ax_scatter.set_xlim(0, 130)
ax_scatter.set_ylim(0, 130)

# Error distribution
errors = results['preds_last'] - results['true_last']
# Report the number of units actually evaluated instead of a hard-coded 100,
# so the title stays correct for other datasets or when units are excluded.
n_units_evaluated = len(errors)
ax_errors.hist(errors, bins=20, alpha=0.7, edgecolor='black')
ax_errors.axvline(x=0, color='red', linestyle='--')
ax_errors.set_xlabel('Prediction Error (pred - true)')
ax_errors.set_ylabel('Count')
ax_errors.set_title(f'Error Distribution ({n_units_evaluated} units)', fontweight='bold')

plt.suptitle(f'Dual-Attention BiLSTM â€” M1 ({FEATURE_SELECTION}, pad={PAD})', fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

---
## 5. Attention Weight Analysis

In [None]:
# Extract attention weights
# `weights` is a mapping; the keys read in this notebook are
# 'feature_importance', 'per_sample_importance' and 'temporal_weights'.
weights = extract_attention_weights(attn_model, X_test, feature_names, batch_size=256)

importance_table = weights['feature_importance']
print("=== Global Feature Importance (Attention Weights) ===")
print(importance_table.to_string(index=False))

In [None]:
# Feature importance bar chart
imp = weights['feature_importance']
type_colors = {'sensor': '#1f77b4', 'setting': '#ff7f0e', 'BI': '#2ca02c'}

# Reverse the row order of the table before drawing the horizontal bars.
data_plot = imp.iloc[::-1]
colors = list(map(type_colors.__getitem__, data_plot['type']))
bar_positions = range(len(data_plot))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(
    bar_positions,
    data_plot['mean_attention_weight'],
    color=colors,
    alpha=0.85,
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(data_plot['feature'], fontsize=9)
ax.set_xlabel('Mean Attention Weight')
ax.set_title('Feature Attention â€” Learned Importance (M1)', fontweight='bold')

# Build the legend by hand: one colour patch per feature type.
legend_elements = []
for type_name, type_color in type_colors.items():
    legend_elements.append(Patch(facecolor=type_color, label=type_name))
ax.legend(handles=legend_elements, loc='lower right')
plt.tight_layout()
plt.show()

In [None]:
# Temporal attention: average weights across test set
# The result is plotted against range(W), so it is expected to hold one value
# per time step of the window.
mean_temporal = np.mean(weights['temporal_weights'], axis=0)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(range(W), mean_temporal, color='#457b9d', alpha=0.8)
# Derive the index of the newest step from W rather than hard-coding 29, so
# the label stays correct when the window size is changed.
ax.set_xlabel(f'Time Step in Window (0 = oldest, {W - 1} = most recent)')
ax.set_ylabel('Mean Attention Weight')
ax.set_title('Temporal Attention â€” Which Time Steps Matter? (M1)', fontweight='bold')
plt.tight_layout()
plt.show()

---
## 6. Attention Dynamics: How Weights Change with Degradation

In [None]:
# Split test samples by RUL range
rul_bins = {
    'Healthy (RUL > 100)': y_test > 100,
    'Mid-life (40 < RUL â‰¤ 100)': (y_test > 40) & (y_test <= 100),
    'Near failure (RUL â‰¤ 40)': y_test <= 40,
}

# Feature attention by degradation phase
per_sample = weights['per_sample_importance']  # (n_samples, n_features)

# Average the per-sample feature weights inside each phase; a phase with no
# test samples is left out.
phase_importance = {
    phase: np.mean(per_sample[mask], axis=0)
    for phase, mask in rul_bins.items()
    if mask.any()
}

phase_df = pd.DataFrame(phase_importance, index=feature_names)

# Heatmap
fig, ax = plt.subplots(figsize=(8, 10))
sns.heatmap(
    phase_df,
    annot=True,
    fmt='.3f',
    cmap='YlOrRd',
    linewidths=0.5,
    ax=ax,
    cbar_kws={'label': 'Attention Weight'},
)
ax.set_title('Feature Attention Across Degradation Phases (M1)', fontweight='bold')
ax.set_ylabel('')
plt.tight_layout()
plt.show()

# Positions of sensor/setting features vs. everything else (BI), computed once.
sensor_like = ('sensor_', 'setting_')
sensor_idx = [i for i, name in enumerate(feature_names) if name.startswith(sensor_like)]
bi_idx = [i for i, name in enumerate(feature_names) if not name.startswith(sensor_like)]

print("\nKey insight: do BI features get more/less attention near failure?")
for phase, phase_vector in phase_importance.items():
    bi_weight = sum(phase_vector[i] for i in bi_idx)
    sensor_weight = sum(phase_vector[i] for i in sensor_idx)
    print(f"  {phase}: sensor={sensor_weight:.3f}, BI={bi_weight:.3f}")

In [None]:
# Temporal attention by degradation phase
fig, ax = plt.subplots(figsize=(10, 5))

# One colour per phase, paired with the phases in the order they appear in rul_bins.
colors_phase = ['#66c2a5', '#fc8d62', '#e63946']
time_steps = range(W)
for phase_color, (phase, mask) in zip(colors_phase, rul_bins.items()):
    if mask.sum() == 0:
        # Nothing to average for a phase without samples.
        continue
    mean_t = np.mean(weights['temporal_weights'][mask], axis=0)
    ax.plot(
        time_steps, mean_t, 'o-',
        label=phase, color=phase_color, markersize=4, alpha=0.8,
    )

ax.set_xlabel('Time Step in Window')
ax.set_ylabel('Mean Temporal Attention')
ax.set_title('Temporal Attention by Degradation Phase (M1)', fontweight='bold')
ax.legend()
plt.tight_layout()
plt.show()

---
## 7. Save Weights for Recommendation System

In [None]:
# Save attention weights for later use by the recommendation module
save_dir = project_root / 'results' / 'attention_weights'

save_attention_weights(
    weights_dict=weights,
    save_dir=str(save_dir),
    dataset_name=f'M1_{FEATURE_SELECTION}_pad{PAD}',
    prefix='attn',
)


# Also save the trained model
model_dir = project_root / 'results' / 'models'
model_dir.mkdir(parents=True, exist_ok=True)
# Build the target path once so the file that is written and the path that is
# reported cannot drift apart (the message used to omit the `_pad{PAD}` suffix).
model_path = model_dir / f'dual_attention_bilstm_M1_{FEATURE_SELECTION}_pad{PAD}.keras'
model.save(model_path)
print(f"\nModel saved to {model_path}")

---
## 8. Summary

| Metric | Value |
|:---|:---|
| RMSE | ... |
| MAE | ... |
| NASA Score | ... |

### Saved for recommendation system:
- `results/attention_weights/attn_M1_feature_importance.csv` — global feature ranking
- `results/attention_weights/attn_M1_per_sample.npy` — per-sample weights for context-aware recommendations
- `results/attention_weights/attn_M1_temporal.npy` — temporal weights
- `results/attention_weights/attn_M1_predictions.csv` — RUL predictions

### Next steps:
- ML branch (XGBoost) + hybrid fusion
- Repeat on M2–M4
- SHAP comparison with attention weights

In [6]:
from ml_branch import MLBranch, HybridPredictor

# --- ML Branch ---
# Tunable choices for the classical-ML branch:
#   model_type:       'xgboost' or 'random_forest'
#   flatten_strategy: 'flatten' or 'statistics'
ml_branch_options = dict(model_type='random_forest', flatten_strategy='flatten')
ml = MLBranch(**ml_branch_options)
ml.fit(X_train, y_train, feature_names=feature_names)

# Predictions of the ML branch for every test window.
y_pred_ml = ml.predict(X_test)


# ML standalone evaluation
ml_eval_args = dict(y_true=y_test, y_pred=y_pred_ml, df=test_sel, window_size=W, pad=PAD)
results_ml = evaluate_per_unit(**ml_eval_args)



[ML Branch] random_forest â€” flatten
  Input: (17731, 30, 32) â†’ Flattened: (17731, 960)
  Training complete âœ“

=== Per-Unit Evaluation (100/100 units) ===
  Last window:  RMSE=15.46  MAE=11.56  Score=388.29
  Mean window:  RMSE=42.92  Score=35143.31


In [None]:

# --- Hybrid fusion ---
# Compute the deep-learning predictions first: optimize_alpha() consumes them,
# so they must exist before it is called (previously this assignment came
# after the call, raising NameError on a fresh kernel).
y_pred_dl = model.predict(X_test, batch_size=256).flatten()

hybrid = HybridPredictor()
# NOTE(review): alpha is tuned against y_test, the same labels used for the
# final evaluation below, which makes the hybrid score optimistic. Consider
# tuning on a held-out validation split instead — confirm with the author.
hybrid.optimize_alpha(y_pred_dl, y_pred_ml, y_test, metric='rmse')

# Blend the two branches with the tuned alpha and score the result per unit.
y_pred_hybrid = hybrid.predict(y_pred_dl, y_pred_ml)
results_hybrid = evaluate_per_unit(
    y_true=y_test, y_pred=y_pred_hybrid,
    df=test_sel, window_size=W, pad=PAD,
)