In [101]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import config

Load data

In [102]:
df = pd.read_csv('../Data/preprocessed_data.csv')
df.head()

Unnamed: 0,datetime,machineID,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Target,RUL_hours,Failure_Type,month,hour,dayofweek
0,2015-03-19 20:05:05,228,H,295.0,305.0,1497,40.2,118,0,20744,No Failure,3,20,3
1,2015-06-16 09:27:48,201,H,295.0,305.0,1494,41.0,246,0,18040,No Failure,6,9,1
2,2017-02-08 16:01:14,29,L,310.0,320.0,1428,64.7,300,0,656,No Failure,2,16,2
3,2016-02-17 00:18:19,83,M,302.1,313.4,1472,45.2,300,0,8192,No Failure,2,0,2
4,2015-04-09 23:11:44,205,M,300.2,310.3,1497,40.4,148,0,19304,No Failure,4,23,3


In [103]:
# Datetime
# Convert to datetime
df['datetime'] = pd.to_datetime(df['datetime'])

# Extract components
df['month'] = df['datetime'].dt.month
df['hour'] = df['datetime'].dt.hour
df['dayofweek'] = df['datetime'].dt.dayofweek

In [104]:
# Check existing machineID
if 'machineID' in df.columns:
    print(f" machineID column exists: {df['machineID'].nunique()} unique machines")
    
    # Verify distribution
    machine_counts = df.groupby('machineID').size()
    print(f"   Min records/machine: {machine_counts.min()}")
    print(f"   Max records/machine: {machine_counts.max()}")
    print(f"   Mean records/machine: {machine_counts.mean():.0f}")
    
    # Check if distribution is suspicious (all equal = bug!)
    if machine_counts.min() == machine_counts.max():
        print("\ WARNING: All machines have EQUAL records!")
        print(" This indicates forced assignment bug!")
        print(" Attempting to fix...")
        
        # Drop corrupted machineID and extract fresh from Product_ID
        df = df.drop('machineID', axis=1)
        df['machineID'] = df['Product_ID'].str.extract('(\d+)').astype(int)
        print(f" Re-extracted machineID from Product_ID")
        print(f" New unique machines: {df['machineID'].nunique()}")
else:
    # Extract from Product_ID if not exists
    df['machineID'] = df['Product_ID'].str.extract('(\d+)').astype(int)
    print(f" Extracted machineID from Product_ID")
    print(f" Unique machines: {df['machineID'].nunique()}")

 machineID column exists: 400 unique machines
   Min records/machine: 63
   Max records/machine: 375
   Mean records/machine: 250


In [105]:
# sort by machine and time
df = df.sort_values(['machineID', 'datetime']).reset_index(drop=True)

Feature engineering

In [106]:
# calculate machine age
# Machine age in hours (cumulative operation time)
df['machine_age_hours'] = df.groupby('machineID').cumcount() * 8

# Hours since last reading
df['hours_since_last'] = df.groupby('machineID')['datetime'].diff().dt.total_seconds() / 3600
df['hours_since_last'] = df['hours_since_last'].fillna(8)

print("Machine age features created")
print(f"   • machine_age_hours: 0 - {df['machine_age_hours'].max():.0f} hours")
print(f"   • hours_since_last: mean={df['hours_since_last'].mean():.1f}h")

Machine age features created
   • machine_age_hours: 0 - 2992 hours
   • hours_since_last: mean=84.6h


In [107]:
# Temperature features
df['Temp_Difference'] = df['Process_temperature'] - df['Air_temperature']
df['Temp_Rate_of_Change'] = df.groupby('machineID')['Process_temperature'].diff().fillna(0)

# Power & Torque features
df['Power'] = df['Torque'] * df['Rotational_speed'] / 9.5488
df['Torque_Speed_Ratio'] = df['Torque'] / (df['Rotational_speed'] + 1)

# RPM Variance (rolling window per machine)
df['RPM_Variance'] = df.groupby('machineID')['Rotational_speed'].transform(
    lambda x: x.rolling(window=min(50, len(x)), min_periods=1).std()
).fillna(0)

In [108]:
# One hot encode
type_dummies = pd.get_dummies(df['Type'], prefix='Type')
df = pd.concat([df, type_dummies], axis=1)

print(f" Type encoded: {', '.join(type_dummies.columns.tolist())}")

 Type encoded: Type_H, Type_L, Type_M


Calculate RUL

In [109]:
# Initialize
df['RUL_hours'] = -1

# Calculate per machine
for machine_id in df['machineID'].unique():
    machine_data = df[df['machineID'] == machine_id]
    
    # Check if machine fails
    failure_indices = machine_data[machine_data['Target'] == 1].index
    
    if len(failure_indices) > 0:
        failure_idx = failure_indices[0]
        failure_time = df.loc[failure_idx, 'datetime']
        
        # Calculate RUL for all readings
        for idx in machine_data.index:
            current_time = df.loc[idx, 'datetime']
            rul = (failure_time - current_time).total_seconds() / 3600
            df.loc[idx, 'RUL_hours'] = max(0, rul)

In [110]:
# Statistics
rul_data = df[df['RUL_hours'] >= 0]
print(f"\n✓ RUL calculated for {len(rul_data):,} records")
print(f"   Min: {rul_data['RUL_hours'].min():.0f} hours")
print(f"   Max: {rul_data['RUL_hours'].max():,.0f} hours")
print(f"   Mean: {rul_data['RUL_hours'].mean():,.0f} hours")
print(f"   Median: {rul_data['RUL_hours'].median():,.0f} hours")
print(f"   Failed machines: {len(df[df['Target']==1]['machineID'].unique())}")


✓ RUL calculated for 78,843 records
   Min: 0 hours
   Max: 23,886 hours
   Mean: 3,038 hours
   Median: 1,604 hours
   Failed machines: 250


save engineered data

In [111]:
output_path = '../Data/engineered_data.csv'
df.to_csv(output_path, index=False)
print(f"\nData saved to: {output_path}")


Data saved to: ../Data/engineered_data.csv
