# Day-4

In [1]:
# 1. Imports and Data Loading
import pandas as pd
import numpy as np

In [2]:
# Suppress warnings for clearer output
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Colab-only helper: this import is available inside a Google Colab runtime.
from google.colab import drive
# Attach Google Drive at /content/drive; the absolute paths used by the
# load and save cells below are rooted at this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the feature-engineered FD001 table written by the previous step.
# The path points into the mounted Drive; change it if the file lives elsewhere.
df = pd.read_csv(
    '/content/drive/MyDrive/Infosys_Internship/Data Preparation/cmapss_feature_engineered_FD001.csv'
)

# Report (rows, columns), then preview the first rows as the cell's output.
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (20531, 68)


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_17_rollmean5,sensor_17_rollstd5,sensor_18_rollmean5,sensor_18_rollstd5,sensor_19_rollmean5,sensor_19_rollstd5,sensor_20_rollmean5,sensor_20_rollstd5,sensor_21_rollmean5,sensor_21_rollstd5
0,1,2,0.0019,-0.0003,100.0,0.0,-1.06178,0.211528,-0.643726,-1.776357e-15,...,-0.926028,-2.638069,0.0,0.0,0.0,0.0,1.367661,-1.534785,1.404213,-2.630752
1,1,3,-0.0043,0.0003,100.0,0.0,-0.661813,-0.413166,-0.525953,-1.776357e-15,...,-1.453702,0.786588,0.0,0.0,0.0,0.0,1.192984,-1.166192,1.123794,-0.599129
2,1,4,0.0007,0.0,100.0,0.0,-0.661813,-1.261314,-0.784831,-1.776357e-15,...,-1.321784,0.327771,0.0,0.0,0.0,0.0,0.991014,-0.547098,1.0646,-0.923458
3,1,5,-0.0019,-0.0002,100.0,0.0,-0.621816,-1.251528,-0.301518,-1.776357e-15,...,-1.08433,0.610846,0.0,0.0,0.0,0.0,0.896034,-0.625804,1.095643,-1.143645
4,1,6,-0.0043,-0.0001,100.0,0.0,-1.161771,-0.987297,-1.173703,-1.776357e-15,...,-1.242633,0.743509,0.0,0.0,0.0,0.0,0.791228,-1.279538,0.981948,-1.24503


In [5]:
# Identifier columns are bookkeeping, not model inputs; every other column
# of the frame is treated as a feature (original column order is kept).
exclude_cols = ['engine_id', 'cycle']
feature_cols = list(filter(lambda name: name not in exclude_cols, df.columns))

# Order rows by engine and then by cycle so each engine's history is
# contiguous and chronological, and renumber the index from zero.
df = (
    df
    .sort_values(by=['engine_id', 'cycle'])
    .reset_index(drop=True)
)

In [6]:
def generate_rolling_windows(data, engine_col, features, window_size=30, cycle_col='cycle'):
    """Slice each engine's rows into overlapping fixed-length windows.

    Engines are processed in order of first appearance in ``data``. Within an
    engine the rows are used in the order they occur (they are NOT re-sorted
    here, so the caller must already have ordered them by cycle). One window
    is emitted for every row that has at least ``window_size - 1`` preceding
    rows of the same engine; the window ends on that row.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``engine_col``, ``cycle_col`` and all ``features``.
    engine_col : str
        Column whose values identify an engine.
    features : list of str
        Columns copied into each window, in this order.
    window_size : int, default 30
        Number of consecutive rows per window; must be >= 1.
    cycle_col : str, default 'cycle'
        Column whose value on the last row of a window is reported as that
        window's cycle id.

    Returns
    -------
    sequences : numpy.ndarray
        Array of shape ``(num_windows, window_size, len(features))``. The
        shape is kept 3-D even when no engine has enough rows
        (``num_windows == 0``).
    engine_ids : list
        Engine identifier for each window.
    cycle_ids : list
        Value of ``cycle_col`` on the last row of each window, with the
        column's own dtype preserved.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    features = list(features)
    sequences = []
    engine_ids = []
    cycle_ids = []

    # One grouping pass instead of a boolean mask per engine; sort=False keeps
    # the engines in order of first appearance, like ``unique()`` would.
    for engine, engine_data in data.groupby(engine_col, sort=False):
        n_windows = len(engine_data) - window_size + 1
        if n_windows <= 0:
            # Engine history is shorter than one window: nothing to emit.
            continue

        engine_features = engine_data[features].to_numpy()
        # Read the cycle column directly. Fetching it through a row Series
        # (``iloc[i][...]``) builds a Series per window and upcasts integer
        # cycles to float when the row mixes ints and floats.
        engine_cycles = engine_data[cycle_col].to_numpy()

        for start in range(n_windows):
            sequences.append(engine_features[start:start + window_size])
        engine_ids.extend([engine] * n_windows)
        # The window starting at ``start`` ends on row ``start + window_size - 1``.
        cycle_ids.extend(engine_cycles[window_size - 1:])

    if not sequences:
        # np.array([]) would collapse to shape (0,); keep the documented rank.
        return np.empty((0, window_size, len(features))), engine_ids, cycle_ids

    # Stack the per-window 2-D slices into one 3-D array for modeling.
    return np.array(sequences), engine_ids, cycle_ids

In [7]:
# Number of consecutive cycles in each window; tune to the model's needs.
window_size = 30
sequences, engine_ids, cycle_ids = generate_rolling_windows(
    data=df,
    engine_col='engine_id',
    features=feature_cols,
    window_size=window_size,
)

# Expected layout: (num_sequences, window_size, num_features)
print("Shape of rolling window sequences:", sequences.shape)
print("Example sequence shape:", sequences[0].shape)

Shape of rolling window sequences: (17631, 30, 66)
Example sequence shape: (30, 66)


In [8]:
# Print the first sequence info: the engine and cycle recorded for window 0,
# followed by that window's shape and its raw feature matrix.
print(f"Engine ID: {engine_ids[0]}, Cycle: {cycle_ids[0]}")
print("Sequence data for first time window (shape {}):".format(sequences[0].shape))
# NumPy abbreviates large arrays with "..." when printing, so only the
# leading and trailing rows/columns appear in the output.
print(sequences[0])

Engine ID: 1, Cycle: 31.0
Sequence data for first time window (shape (30, 66)):
[[ 1.90000000e-03 -3.00000000e-04  1.00000000e+02 ... -1.53478503e+00
   1.40421343e+00 -2.63075241e+00]
 [-4.30000000e-03  3.00000000e-04  1.00000000e+02 ... -1.16619245e+00
   1.12379400e+00 -5.99129117e-01]
 [ 7.00000000e-04  0.00000000e+00  1.00000000e+02 ... -5.47097938e-01
   1.06460040e+00 -9.23457744e-01]
 ...
 [ 1.20000000e-03 -1.00000000e-04  1.00000000e+02 ... -1.06697113e+00
   1.02930247e+00  4.59904157e-01]
 [-2.20000000e-03  0.00000000e+00  1.00000000e+02 ... -6.06104727e-01
   9.90458379e-01  3.42825323e-01]
 [ 1.40000000e-03  5.00000000e-04  1.00000000e+02 ... -1.31993488e+00
   8.16751090e-01  3.47734290e-01]]


In [9]:
# Every window must span exactly `window_size` time steps.
assert sequences.shape[1] == window_size, "Sequence window length mismatch"

# Walk over consecutive windows: the cycle id has to increase strictly
# unless the pair straddles a change of engine.
assert all(
    cur_cycle > prev_cycle or cur_engine != prev_engine
    for prev_cycle, cur_cycle, prev_engine, cur_engine in zip(
        cycle_ids, cycle_ids[1:], engine_ids, engine_ids[1:]
    )
), "Cycle order violation"

print("Basic validation checks passed.")

Basic validation checks passed.


In [11]:
# Persist the 3-D window array in NumPy's binary format for the modeling step.
np.save(
    '/content/drive/MyDrive/Infosys_Internship/Data Preparation/rolling_window_sequences.npy',
    sequences,
)

# Write the per-window engine and cycle ids alongside it, one CSV row per
# window and without the DataFrame index.
(
    pd.DataFrame({'engine_id': engine_ids, 'cycle': cycle_ids})
    .to_csv(
        '/content/drive/MyDrive/Infosys_Internship/Data Preparation/sequence_metadata.csv',
        index=False,
    )
)