# 📊 W02 — Data Summary: Sliding Window Shapes (Correlation vs AFICv)
**Objective**: Generate a summary table of the final data dimensions (samples, window, features) produced by both feature selection methods across machines M1–M4.

**Author**: Fatima Khadija Benzine  
**Date**: February 2026

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Make the project importable from this notebook: the parent of the current
# working directory is treated as the project root, and both it and its
# `src/` folder are placed at the front of the module search path
# (`src/` first, then the root).
project_root = Path().resolve().parent
sys.path[:0] = [str(project_root / 'src'), str(project_root)]

from data_loader import MultiDatasetLoader
from preprocessing import PreprocessingPipelineBI, DataNormalizer, create_sliding_windows
from bi_fusion import BIFusionPipeline, CONTINUOUS_BI_VARS
from feature_selection import BIAwareFeatureSelector
from feature_selection_aficv import AFICvFeatureSelector

# Confirmation that every import above succeeded (the check mark was stored
# as mis-decoded UTF-8 bytes; restored to the intended character).
print("All modules imported ✓")

In [None]:
# Loader instance; used in the loop below to fetch each dataset through
# `load_cmapss_dataset(name)`.
loader = MultiDatasetLoader()
# Dataset identifiers processed by the loop, in reporting order.
datasets = ['FD001', 'FD002', 'FD003', 'FD004']
# Short machine label shown in the tables for each dataset identifier.
labels = dict(zip(datasets, ('M1', 'M2', 'M3', 'M4')))
# Bookkeeping columns passed to the selectors as excluded / kept columns.
meta_cols = ['unit', 'cycle', 'rul']
# Sliding-window length handed to create_sliding_windows().
W = 30

# One summary dict per dataset is appended here by the loop.
results = []

# Upper bound applied to the 'rul' column of both splits via clip().
RUL_CAP = 125

# Column-name prefixes counted as "sensor" features in the summary
# (operational settings are grouped with sensors, as in the table columns).
_SENSOR_PREFIXES = ('sensor_', 'setting_')


def _count_sensor_and_bi(features):
    """Return ``(n_sensor, n_bi)`` for a list of selected feature names.

    A feature is counted as "sensor" when its name starts with ``sensor_``
    or ``setting_``; every other selected feature is counted as BI.
    """
    n_sensor = sum(1 for name in features if name.startswith(_SENSOR_PREFIXES))
    return n_sensor, len(features) - n_sensor


# For each dataset: preprocess, fuse BI variables, run both feature
# selectors, build sliding windows, and record the resulting dimensions.
for ds_name in datasets:
    print(f"\n{'='*70}")
    print(f"  {labels[ds_name]} ({ds_name})")
    print(f"{'='*70}")

    ds = loader.load_cmapss_dataset(ds_name)
    # Work on copies so the loader's frames are not modified in place.
    train_raw = ds['train'].copy()
    test_raw = ds['test'].copy()

    # --- Common preprocessing (Steps 0-3b) ---
    train_raw['rul'] = train_raw['rul'].clip(upper=RUL_CAP)
    if 'rul' in test_raw.columns:
        test_raw['rul'] = test_raw['rul'].clip(upper=RUL_CAP)
    # NOTE(review): the guard above tolerates a test split without 'rul', but
    # the sliding-window calls below pass target_col='rul' for the test frames
    # as well — confirm the test split always carries a 'rul' column.

    sensor_cols = [c for c in train_raw.columns if c.startswith('sensor_')]
    setting_cols = [c for c in train_raw.columns if c.startswith('setting_')]

    # Min-max normalize sensor and setting columns: fit on train, then apply
    # the same fitted normalizer to test.
    norm = DataNormalizer(method='minmax')
    train_norm = norm.fit_transform(train_raw, sensor_cols + setting_cols)
    test_norm = norm.transform(test_raw)

    # Fuse BI
    fusion = BIFusionPipeline()
    train_fused = fusion.fuse(train_norm, ds_name, split='train', encode_categoricals=True)
    test_fused = fusion.fuse(test_norm, ds_name, split='test', encode_categoricals=True)
    bi_cols = fusion.get_bi_columns(train_fused)

    # Normalize continuous BI (only those actually present after fusion),
    # again fitting on train and reusing the fitted normalizer on test.
    bi_cont = [c for c in CONTINUOUS_BI_VARS if c in train_fused.columns]
    bi_norm = DataNormalizer(method='minmax')
    train_fused = bi_norm.fit_transform(train_fused, bi_cont)
    test_fused = bi_norm.transform(test_fused)

    # ============================================
    # Method 1: Correlation-based
    # ============================================
    print("\n--- Correlation-based ---")
    corr_sel = BIAwareFeatureSelector(variance_threshold=0.01, correlation_threshold=0.95)
    corr_features = corr_sel.select_features(
        data=train_fused, sensor_cols=sensor_cols,
        bi_cols=bi_cols, setting_cols=setting_cols,
        exclude_cols=meta_cols,
    )

    train_corr = corr_sel.transform(train_fused, keep_cols=meta_cols)
    test_corr = corr_sel.transform(test_fused, keep_cols=meta_cols)

    X_train_corr, y_train_corr = create_sliding_windows(train_corr, window_size=W, target_col='rul')
    X_test_corr, y_test_corr = create_sliding_windows(test_corr, window_size=W, target_col='rul')

    # ============================================
    # Method 2: AFICv Stratified 90%
    # ============================================
    print("\n--- AFICv Stratified (90%) ---")
    aficv_sel = AFICvFeatureSelector(
        base_learner='xgboost', n_folds=5, cumulative_threshold=0.90,
    )
    aficv_features = aficv_sel.select_features_stratified(
        data=train_fused, sensor_cols=sensor_cols,
        bi_cols=bi_cols, setting_cols=setting_cols,
        target_col='rul', group_col='unit',
    )

    train_aficv = aficv_sel.transform(train_fused, keep_cols=meta_cols)
    test_aficv = aficv_sel.transform(test_fused, keep_cols=meta_cols)

    X_train_aficv, y_train_aficv = create_sliding_windows(train_aficv, window_size=W, target_col='rul')
    X_test_aficv, y_test_aficv = create_sliding_windows(test_aficv, window_size=W, target_col='rul')

    # ============================================
    # Collect results
    # ============================================
    n_corr_sensor, n_corr_bi = _count_sensor_and_bi(corr_features)
    n_aficv_sensor, n_aficv_bi = _count_sensor_and_bi(aficv_features)

    results.append({
        'Machine': labels[ds_name],
        'Train units': train_raw['unit'].nunique(),
        'Test units': test_raw['unit'].nunique(),
        # Correlation
        'Corr: features': len(corr_features),
        'Corr: sensor': n_corr_sensor,
        'Corr: BI': n_corr_bi,
        'Corr: train samples': X_train_corr.shape[0],
        'Corr: test samples': X_test_corr.shape[0],
        'Corr: X_train shape': str(X_train_corr.shape),
        'Corr: X_test shape': str(X_test_corr.shape),
        # AFICv
        'AFICv: features': len(aficv_features),
        'AFICv: sensor': n_aficv_sensor,
        'AFICv: BI': n_aficv_bi,
        'AFICv: train samples': X_train_aficv.shape[0],
        'AFICv: test samples': X_test_aficv.shape[0],
        'AFICv: X_train shape': str(X_train_aficv.shape),
        'AFICv: X_test shape': str(X_test_aficv.shape),
    })

print(f"\n{'='*70}")
print("Done ✓")

---
## Summary Table

In [None]:
# One row per machine, built from the dicts collected in `results`.
df = pd.DataFrame(results)

# Clean display table: unit counts plus feature/sample counts per method
# (the per-type feature counts and full tensor shapes are left out here).
display_cols = [
    'Machine', 'Train units', 'Test units',
    'Corr: features', 'Corr: train samples', 'Corr: test samples',
    'AFICv: features', 'AFICv: train samples', 'AFICv: test samples',
]
# Report the configured window size rather than a hard-coded value so the
# banner stays correct if W is changed.
print(f"=== Data Dimensions Summary (W={W}) ===\n")
print(df[display_cols].to_string(index=False))

In [None]:
# Full shapes table: the stringified X_train / X_test tensor shapes recorded
# for each machine, correlation-based columns first, then AFICv.
corr_shape_cols = ['Corr: X_train shape', 'Corr: X_test shape']
aficv_shape_cols = ['AFICv: X_train shape', 'AFICv: X_test shape']
shape_cols = ['Machine', *corr_shape_cols, *aficv_shape_cols]

shapes_text = df[shape_cols].to_string(index=False)
print("=== Tensor Shapes (samples, W, features) ===\n")
print(shapes_text)

In [None]:
# LaTeX table for thesis
#
# Prints a ready-to-paste table built from `df`. The markup uses
# \toprule / \cmidrule / \midrule / \bottomrule (booktabs) and \resizebox
# (graphicx), so the target document must load those packages.

# Columns of `df` emitted per row, in the order of the table header.
latex_cols = [
    'Machine', 'Train units', 'Test units',
    'Corr: features', 'Corr: train samples', 'Corr: test samples',
    'AFICv: features', 'AFICv: train samples', 'AFICv: test samples',
]

print("=== LaTeX Table ===")
print()
print(r"\begin{table}[htbp]")
print(r"    \centering")
# Raw f-string: doubled braces are literal LaTeX braces, {W} is the window size.
print(rf"    \caption{{Data dimensions after preprocessing and sliding window ($W={W}$).}}")
print(r"    \label{tab:data_dimensions}")
print(r"    \resizebox{\columnwidth}{!}{%")
print(r"    \begin{tabular}{@{}l cc cc cc cc@{}}")
print(r"        \toprule")
print(r"        & & & \multicolumn{3}{c}{\textbf{Correlation-based}} & \multicolumn{3}{c}{\textbf{AFICv Stratified (90\%)}} \\")
print(r"        \cmidrule(lr){4-6} \cmidrule(lr){7-9}")
print(r"        \textbf{Machine} & \textbf{Train} & \textbf{Test} & Features & Train & Test & Features & Train & Test \\")
print(r"        & units & units & & samples & samples & & samples & samples \\")
print(r"        \midrule")
for _, r in df.iterrows():
    cells = " & ".join(str(r[col]) for col in latex_cols)
    # The row terminator must be two backslashes. In a non-raw string "\\"
    # collapses to a single backslash, so it is appended as a raw string.
    print(f"        {cells} " + r"\\")
print(r"        \bottomrule")
print(r"    \end{tabular}%")
print(r"    }")
print(r"\end{table}")