# Week 2 — Feature Engineering & Preparing Training Data

In Week 2, we transform the raw CMAPSS turbofan dataset into a machine learning-ready format. 
This includes creating RUL labels, normalizing sensor values, selecting useful sensors, and 
building time-series sequences for LSTM/GRU/CNN models.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE_DIR = Path(r"C:\Users\Kal\Predictive Maintenance Aircraft Engine")
DATA_DIR = BASE_DIR / "data" / "raw"

col_names = [
    "engine_id", "cycle",
    "setting_1", "setting_2", "setting_3",
] + [f"sensor_{i}" for i in range(1, 22)]

train_df = pd.read_csv(DATA_DIR / "train_FD001.txt", sep=r"\s+", header=None, names=col_names)
test_df  = pd.read_csv(DATA_DIR / "test_FD001.txt",  sep=r"\s+", header=None, names=col_names)
rul_df   = pd.read_csv(DATA_DIR / "RUL_FD001.txt",   sep=r"\s+", header=None, names=["RUL"])

train_df.head()

Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


## Creating Remaining Useful Life (RUL) Labels

Each engine degrades until failure. For training data, we compute the RUL by subtracting the 
current cycle from the engine's final cycle.

In [2]:
# Compute max cycle per engine
max_cycle = train_df.groupby("engine_id")["cycle"].max()

# Merge max_cycle back into training data
train_df = train_df.merge(max_cycle.rename("max_cycle"), on="engine_id")

# Compute RUL
train_df["RUL"] = train_df["max_cycle"] - train_df["cycle"]
train_df = train_df.drop(columns=["max_cycle"])

train_df.head()


Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


## Clipping RUL Values

We clip RUL at 125 cycles (standard practice in PHM research).
This prevents the model from learning unrealistic high values for early cycles.

In [4]:
train_df["RUL"] = train_df["RUL"].clip(upper=125)

## Select Useful Sensors

In [5]:
useful_sensors = [
    "sensor_2", "sensor_3", "sensor_4", 
    "sensor_7", "sensor_8",
    "sensor_11", "sensor_12", "sensor_13", "sensor_14"
]

## Selecting Informative Sensors

Week 1 EDA shows that many sensors are flat (uninformative).  
We keep only the sensors that show clear degradation trends.

In [6]:
selected_features = ["engine_id", "cycle", "setting_1", "setting_2", "setting_3"] + useful_sensors + ["RUL"]

train_df = train_df[selected_features]
train_df.head()

Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_11,sensor_12,sensor_13,sensor_14,RUL
0,1,1,-0.0007,-0.0004,100.0,641.82,1589.7,1400.6,554.36,2388.06,47.47,521.66,2388.02,8138.62,125
1,1,2,0.0019,-0.0003,100.0,642.15,1591.82,1403.14,553.75,2388.04,47.49,522.28,2388.07,8131.49,125
2,1,3,-0.0043,0.0003,100.0,642.35,1587.99,1404.2,554.26,2388.08,47.27,522.42,2388.03,8133.23,125
3,1,4,0.0007,0.0,100.0,642.35,1582.79,1401.87,554.45,2388.11,47.13,522.86,2388.08,8133.83,125
4,1,5,-0.0019,-0.0002,100.0,642.37,1582.85,1406.22,554.0,2388.06,47.28,522.19,2388.04,8133.8,125


## Normalizing Sensor and Setting Data

Normalization ensures the model treats all sensors fairly.

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

cols_to_scale = useful_sensors + ["setting_1", "setting_2", "setting_3"]

train_df[cols_to_scale] = scaler.fit_transform(train_df[cols_to_scale])

train_df.head()


Unnamed: 0,engine_id,cycle,setting_1,setting_2,setting_3,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_11,sensor_12,sensor_13,sensor_14,RUL
0,1,1,0.45977,0.166667,0.0,0.183735,0.406802,0.309757,0.726248,0.242424,0.369048,0.633262,0.205882,0.199608,125
1,1,2,0.609195,0.25,0.0,0.283133,0.453019,0.352633,0.628019,0.212121,0.380952,0.765458,0.279412,0.162813,125
2,1,3,0.252874,0.75,0.0,0.343373,0.369523,0.370527,0.710145,0.272727,0.25,0.795309,0.220588,0.171793,125
3,1,4,0.54023,0.5,0.0,0.343373,0.256159,0.331195,0.740741,0.318182,0.166667,0.889126,0.294118,0.174889,125
4,1,5,0.390805,0.333333,0.0,0.349398,0.257467,0.404625,0.668277,0.242424,0.255952,0.746269,0.235294,0.174734,125


## Creating Time-Series Windows

Deep learning models require fixed-length sequences.
We create sliding windows of 30 cycles per engine.

In [8]:
sequence_length = 30

def create_sequences(df, seq_len):
    sequences = []
    labels = []
    
    for engine in df["engine_id"].unique():
        engine_df = df[df["engine_id"] == engine].reset_index(drop=True)
        data = engine_df[useful_sensors + ["setting_1", "setting_2", "setting_3"]].values
        target = engine_df["RUL"].values
        
        for i in range(len(engine_df) - seq_len):
            sequences.append(data[i:i+seq_len])
            labels.append(target[i+seq_len])
            
    return np.array(sequences), np.array(labels)

X_train, y_train = create_sequences(train_df, sequence_length)

X_train.shape, y_train.shape

((17631, 30, 12), (17631,))

## Save Processed Data

In [9]:
np.save(BASE_DIR / "data" / "X_train_fd001.npy", X_train)
np.save(BASE_DIR / "data" / "y_train_fd001.npy", y_train)