# Data Preprocessing

**Objective:** Prepare the NASA C-MAPSS dataset (FD001 subset) for LSTM model training.

**Steps:**
1. Load raw data (train/test sets)
2. Calculate RUL (Remaining Useful Life) labels
3. Remove constant sensors (zero variance)
4. Normalize features to [0, 1] range
5. Apply RUL clipping (max = 125 cycles)
6. Create 30-cycle sequences for LSTM (in progress)
7. Split into train/validation/test sets (in progress)
8. Save processed data for modeling (in progress)

In [None]:
# --------------------------------------------------------------------------
# Import required libraries for data preprocessing and visualization
# --------------------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot defaults for the notebook: seaborn grid style and a wide default figure.
DEFAULT_FIGSIZE = (12, 6)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = DEFAULT_FIGSIZE

In [None]:
# --------------------------------------------------------------------------
# Load raw training and test data from NASA C-MAPSS dataset
# --------------------------------------------------------------------------

# Column layout assigned to the headerless data files: two index columns,
# three setting columns, then 21 sensor columns.
index_names = ['engine_id', 'cycle']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = [f'sensor_{i}' for i in range(1, 22)]
col_names = index_names + setting_names + sensor_names

# Shared parser options. The separator is a regex, so it must be a raw string:
# in a normal literal '\s' is an invalid escape sequence, which Python reports
# as a DeprecationWarning (3.6+) / SyntaxWarning (3.12+).
_READ_OPTIONS = {'sep': r'\s+', 'header': None}

train = pd.read_csv('CMaps/train_FD001.txt', names=col_names, **_READ_OPTIONS)
test = pd.read_csv('CMaps/test_FD001.txt', names=col_names, **_READ_OPTIONS)
# The RUL file is read as a single column named 'RUL'.
rul_test = pd.read_csv('CMaps/RUL_FD001.txt', names=['RUL'], **_READ_OPTIONS)

print(f"Training samples: {len(train):,}")
print(f"Test samples: {len(test):,}")
print(f"Test engines: {len(rul_test)}")

In [None]:
# --------------------------------------------------------------------------
# Calculate RUL (Remaining Useful Life) for training and test data
# --------------------------------------------------------------------------

def add_rul(df):
    """Return a copy of ``df`` with a per-row ``RUL`` column appended.

    RUL is computed as the engine's highest recorded ``cycle`` minus the
    row's own ``cycle``, so the last recorded row of every engine gets 0.

    Args:
        df: Frame with at least ``engine_id`` and ``cycle`` columns.

    Returns:
        A new DataFrame (the input is not modified) holding the original
        columns plus ``RUL``. Because it is produced by a merge, it carries
        a fresh 0..n-1 index rather than the input's index.
    """
    # One row per engine: its final observed cycle.
    last_cycle = (
        df.groupby('engine_id')['cycle']
        .max()
        .rename('max_cycle')
        .reset_index()
    )

    # Broadcast the per-engine maximum back onto every row of that engine.
    labelled = df.merge(last_cycle, on='engine_id', how='left')
    labelled['RUL'] = labelled['max_cycle'] - labelled['cycle']

    # The helper column is only needed for the subtraction above.
    return labelled.drop(columns='max_cycle')

def add_rul_test(test_df, rul_df):
    """Return a copy of ``test_df`` with a per-row ``RUL`` column appended.

    For each row, RUL is the supplied RUL of its engine plus the number of
    cycles between that row and the engine's last recorded cycle.

    Args:
        test_df: Frame with at least ``engine_id`` and ``cycle`` columns.
        rul_df: Frame with an ``RUL`` column holding one value per engine.
            Values are paired with engines *by position*: row i goes to the
            i-th engine in ascending ``engine_id`` order (the order groupby
            yields), so there must be exactly one row per engine.

    Returns:
        A new DataFrame (inputs are not modified) holding the original
        columns plus ``RUL``, with a fresh 0..n-1 index from the merge.
    """
    # One row per engine, sorted by engine_id: its final observed cycle.
    per_engine = (
        test_df.groupby('engine_id')['cycle']
        .max()
        .rename('max_cycle')
        .reset_index()
    )
    # Positional assignment (no index alignment) of the supplied RUL values.
    per_engine['true_rul'] = rul_df['RUL'].values

    labelled = test_df.merge(per_engine, on='engine_id', how='left')
    cycles_to_last = labelled['max_cycle'] - labelled['cycle']
    labelled['RUL'] = labelled['true_rul'] + cycles_to_last

    # Both helper columns were only needed to derive RUL.
    return labelled.drop(columns=['max_cycle', 'true_rul'])

# Label both sets: training RUL is derived from each engine's last cycle,
# test RUL additionally uses the values loaded into rul_test.
train, test = add_rul(train), add_rul_test(test, rul_test)

# Report the label span of each set as a quick sanity check.
train_rul = train['RUL']
test_rul = test['RUL']
print(f"Training RUL range: {train_rul.min()} to {train_rul.max()}")
print(f"Test RUL range: {test_rul.min()} to {test_rul.max()}")

In [None]:
# --------------------------------------------------------------------------
# Remove constant sensors (a single distinct value in the training set)
# --------------------------------------------------------------------------

sensor_cols = [col for col in train.columns if col.startswith('sensor_')]

# Raw-unit variance, kept only for the report printed below.
sensor_variance = train[sensor_cols].var()

# Decide constancy by distinct-value count rather than by an absolute variance
# cutoff. The sensors are still in raw units here (scaling happens later), so a
# fixed cutoff such as `var < 0.01` is unit-dependent: any channel whose
# standard deviation is below 0.1 would be discarded even though it varies.
# NOTE(review): this keeps low-amplitude sensors that the old cutoff removed;
# re-check any downstream code that hard-codes the number of features.
distinct_counts = train[sensor_cols].nunique()
constant_sensors = distinct_counts[distinct_counts <= 1].index.tolist()

print(f"Removing {len(constant_sensors)} constant sensors:")
for sensor in constant_sensors:
    print(f"  {sensor}: variance = {sensor_variance[sensor]:.6f}")

# The selection is made on the training set only and then applied to both
# frames so they keep identical columns.
train.drop(constant_sensors, axis=1, inplace=True)
test.drop(constant_sensors, axis=1, inplace=True)

remaining_sensors = [col for col in train.columns if col.startswith('sensor_')]
print(f"\nRemaining sensors: {len(remaining_sensors)}")
print(f"  {remaining_sensors}")

In [None]:
# --------------------------------------------------------------------------
# Normalize features to [0, 1] range using MinMaxScaler
# --------------------------------------------------------------------------

# Model inputs: the setting columns followed by the sensors that were kept.
feature_cols = [*setting_names, *remaining_sensors]

print(f"Normalizing {len(feature_cols)} features:")
print(f"  Settings: {setting_names}")
print(f"  Sensors: {remaining_sensors}")

# Learn per-feature min/max from the training set only, then apply that same
# mapping to both sets so no test-set statistics enter the scaler.
scaler = MinMaxScaler()
scaler.fit(train[feature_cols])
train[feature_cols] = scaler.transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

print("\nFeatures normalized to [0, 1] range")
print("\nSample (first 5 rows):")
print(train[feature_cols].head())

In [None]:
# --------------------------------------------------------------------------
# Apply RUL clipping to focus on degradation phase
# --------------------------------------------------------------------------

# Upper bound applied to the RUL labels of both sets.
MAX_RUL = 125


def _show_rul_ranges():
    """Print the current min/max RUL of the train and test frames."""
    print(f"  Train: {train['RUL'].min()} to {train['RUL'].max()}")
    print(f"  Test: {test['RUL'].min()} to {test['RUL'].max()}")


print(f"Before clipping:")
_show_rul_ranges()

# Cap labels at MAX_RUL; values at or below the cap are left untouched.
for frame in (train, test):
    frame['RUL'] = frame['RUL'].clip(upper=MAX_RUL)

print(f"\nAfter clipping (max={MAX_RUL}):")
_show_rul_ranges()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (axis, RUL values, title, extra hist keyword arguments).
# The test panel is drawn in orange; the training panel uses the default color.
panels = [
    (axes[0], train['RUL'], 'Training RUL Distribution', {}),
    (axes[1], test['RUL'], 'Test RUL Distribution', {'color': 'orange'}),
]

for ax, rul_values, title, hist_kwargs in panels:
    ax.hist(rul_values, bins=50, edgecolor='black', alpha=0.7, **hist_kwargs)
    ax.set_xlabel('RUL (cycles)')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    # Dashed marker at the clipping ceiling.
    ax.axvline(MAX_RUL, color='red', linestyle='--', label=f'Max = {MAX_RUL}')
    ax.legend()

plt.tight_layout()
plt.show()