In [None]:
#Feature Engineering - Load processed datasets
import pandas as pd
import numpy as np

# Load all processed datasets
telemetry = pd.read_csv("/content/processed/telemetry_processed.csv")
errors = pd.read_csv("/content/processed/errors_processed.csv")
failures = pd.read_csv("/content/processed/failures_processed.csv")
maintenance = pd.read_csv("/content/processed/maintenance_processed.csv")
machines = pd.read_csv("/content/processed/machines_processed.csv")

# Convert datetime columns
telemetry['datetime'] = pd.to_datetime(telemetry['datetime'])
errors['datetime'] = pd.to_datetime(errors['datetime'])
failures['datetime'] = pd.to_datetime(failures['datetime'])
maintenance['datetime'] = pd.to_datetime(maintenance['datetime'])

print("Loaded processed datasets:")
print(f"Telemetry: {telemetry.shape}")
print(f"Errors: {errors.shape}")
print(f"Failures: {failures.shape}")
print(f"Maintenance: {maintenance.shape}")
print(f"Machines: {machines.shape}")

Loaded processed datasets:
Telemetry: (876100, 6)
Errors: (3919, 3)
Failures: (761, 3)
Maintenance: (3286, 3)
Machines: (100, 3)


In [None]:
# Merge machines info
telemetry = telemetry.merge(machines, on="machineID", how="left")

# Sort for rolling features
telemetry = telemetry.sort_values(["machineID", "datetime"])

print("After merging machines data:")
print(f"Telemetry shape: {telemetry.shape}")
print(telemetry.head())

After merging machines data:
Telemetry shape: (876100, 8)
             datetime  machineID        volt      rotate    pressure  \
0 2015-01-01 06:00:00          1  176.217853  418.504078  113.077935   
1 2015-01-01 07:00:00          1  162.879223  402.747490   95.460525   
2 2015-01-01 08:00:00          1  170.989902  527.349825   75.237905   
3 2015-01-01 09:00:00          1  162.462833  346.149335  109.248561   
4 2015-01-01 10:00:00          1  157.610021  435.376873  111.886648   

   vibration   model  age  
0  45.087686  model3   18  
1  43.413973  model3   18  
2  34.178847  model3   18  
3  41.122144  model3   18  
4  25.990511  model3   18  


In [None]:
# Rolling Window Features

roll_windows = [3, 6, 12]  # hours

for w in roll_windows:
    telemetry[f"volt_mean_{w}h"] = telemetry.groupby("machineID")["volt"].rolling(w).mean().reset_index(0,drop=True)
    telemetry[f"rotate_mean_{w}h"] = telemetry.groupby("machineID")["rotate"].rolling(w).mean().reset_index(0,drop=True)
    telemetry[f"pressure_mean_{w}h"] = telemetry.groupby("machineID")["pressure"].rolling(w).mean().reset_index(0,drop=True)
    telemetry[f"vibration_mean_{w}h"] = telemetry.groupby("machineID")["vibration"].rolling(w).mean().reset_index(0,drop=True)

    telemetry[f"volt_std_{w}h"] = telemetry.groupby("machineID")["volt"].rolling(w).std().reset_index(0,drop=True)
    telemetry[f"rotate_std_{w}h"] = telemetry.groupby("machineID")["rotate"].rolling(w).std().reset_index(0,drop=True)
    telemetry[f"pressure_std_{w}h"] = telemetry.groupby("machineID")["pressure"].rolling(w).std().reset_index(0,drop=True)
    telemetry[f"vibration_std_{w}h"] = telemetry.groupby("machineID")["vibration"].rolling(w).std().reset_index(0,drop=True)

print("Added rolling window features")
print(f"Dataset shape: {telemetry.shape}")

Added rolling window features
Dataset shape: (876100, 32)


In [None]:
# Change Rates (diff)

for col in ["volt", "rotate", "pressure", "vibration"]:
    telemetry[f"{col}_diff"] = telemetry.groupby("machineID")[col].diff()

# Percentage Changes
for col in ["volt", "rotate", "pressure", "vibration"]:
    telemetry[f"{col}_pct"] = telemetry.groupby("machineID")[col].pct_change()

print("Added change rates and percentage changes")
print(f"Dataset shape: {telemetry.shape}")

Added change rates and percentage changes
Dataset shape: (876100, 40)


In [None]:
# Encode Errors as Binary Flags
errors["error_flag"] = 1
errors_pivot = errors.pivot_table(
    index=["datetime", "machineID"],
    columns="errorID",
    values="error_flag",
    fill_value=0
).reset_index()

telemetry = telemetry.merge(errors_pivot, on=["datetime","machineID"], how="left")
telemetry.fillna({col:0 for col in errors["errorID"].unique()}, inplace=True)

print("Added error flags")
print(f"Dataset shape: {telemetry.shape}")

Added error flags
Dataset shape: (876100, 45)


In [None]:
# Add Maintenance Flags
maintenance["maint_flag"] = 1
maint_pivot = maintenance.pivot_table(
    index=["datetime","machineID"],
    columns="comp",
    values="maint_flag",
    fill_value=0
).reset_index()

telemetry = telemetry.merge(maint_pivot, on=["datetime","machineID"], how="left")
telemetry.fillna({col:0 for col in maintenance["comp"].unique()}, inplace=True)

print("Added maintenance flags")
print(f"Dataset shape: {telemetry.shape}")

Added maintenance flags
Dataset shape: (876100, 49)


In [None]:
# Create Failure Label (next 24 hours)

failures["fail_flag"] = 1
telemetry = telemetry.merge(failures, on=["datetime","machineID"], how="left")
telemetry["fail_flag"] = telemetry["fail_flag"].fillna(0)

# Label future failure within next 24 hours
telemetry = telemetry.sort_values(["machineID","datetime"])
telemetry["failure_next_24h"] = (
    telemetry.groupby("machineID")["fail_flag"]
    .rolling(24, min_periods=1)
    .max()
    .reset_index(0,drop=True)
)

# Drop current failure (keep future label)
telemetry = telemetry.drop(columns=["fail_flag"])

print("Added failure labels for next 24 hours")
print(f"Dataset shape: {telemetry.shape}")

Added failure labels for next 24 hours
Dataset shape: (876142, 51)


In [None]:
# Final Cleanup
telemetry = telemetry.ffill().bfill()
telemetry = telemetry.dropna()

print("Final dataset shape:", telemetry.shape)
print("Failure distribution:")
print(telemetry["failure_next_24h"].value_counts())

# Save the feature-engineered dataset
telemetry.to_csv("/content/processed/telemetry_feature_engineered.csv", index=False)
print("Saved feature-engineered dataset to: /content/processed/telemetry_feature_engineered.csv")

telemetry.head()

Final dataset shape: (876142, 51)
Failure distribution:
failure_next_24h
0.0    858865
1.0     17277
Name: count, dtype: int64
Saved feature-engineered dataset to: /content/processed/telemetry_feature_engineered.csv


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,model,age,volt_mean_3h,rotate_mean_3h,...,error2,error3,error4,error5,comp1,comp2,comp3,comp4,failure,failure_next_24h
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,model3,18,170.028993,449.533798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,comp4,0.0
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,model3,18,170.028993,449.533798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,comp4,0.0
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,model3,18,170.028993,449.533798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,comp4,0.0
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,model3,18,165.443986,425.41555,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,comp4,0.0
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,model3,18,163.687586,436.292011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,comp4,0.0


In [31]:
#feature summary
print("Feature Egineering Summary ")
print(f"Final dataset shape: {telemetry.shape}")
print(f"Number of features: {len(telemetry.columns)}")
print(f"Number of machines: {telemetry['machineID'].nunique()}")
print(f"Date range: {telemetry['datetime'].min()} to {telemetry['datetime'].max()}")
print(f"Failure rate: {telemetry['failure_next_24h'].mean():.4f}")

print("\nFeature categories:")
sensor_features = [col for col in telemetry.columns if any(sensor in col for sensor in ['volt', 'rotate', 'pressure', 'vibration'])]
print(f"Sensor features: {len(sensor_features)}")

error_features = [col for col in telemetry.columns if 'error' in col.lower()]
print(f"Error features: {len(error_features)}")

maint_features = [col for col in telemetry.columns if col in maintenance['comp'].unique()]
print(f"Maintenance features: {len(maint_features)}")

print(f"\nFirst 20 features: {list(telemetry.columns[:20])}")

Feature Egineering Summary 
Final dataset shape: (876142, 51)
Number of features: 51
Number of machines: 100
Date range: 2015-01-01 06:00:00 to 2016-01-01 06:00:00
Failure rate: 0.0197

Feature categories:
Sensor features: 36
Error features: 5
Maintenance features: 4

First 20 features: ['datetime', 'machineID', 'volt', 'rotate', 'pressure', 'vibration', 'model', 'age', 'volt_mean_3h', 'rotate_mean_3h', 'pressure_mean_3h', 'vibration_mean_3h', 'volt_std_3h', 'rotate_std_3h', 'pressure_std_3h', 'vibration_std_3h', 'volt_mean_6h', 'rotate_mean_6h', 'pressure_mean_6h', 'vibration_mean_6h']


In [30]:
# I will Mount on my Google Drive and save the file because it is about 600 MB
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Copy the feature engineered file to Google Drive
!cp "/content/processed/telemetry_feature_engineered.csv" "/content/drive/MyDrive/"

print("File mounted to Google Drive!")
print("File location: /content/drive/MyDrive/telemetry_feature_engineered.csv")

Mounted at /content/drive
File mounted to Google Drive!
File location: /content/drive/MyDrive/telemetry_feature_engineered.csv
