In [22]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import config

Load data

In [23]:
# Read the raw predictive-maintenance CSV into `df` and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../Data/predictive_maintenance.csv')
df.head(n=5)

Unnamed: 0,datetime,machineID,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Target,RUL_hours,Failure_Type,month,hour,dayofweek
0,2015-03-19 20:05:05,228,H,295.0,305.0,1497,40.2,118,0,20744,No Failure,3,20,3
1,2015-06-16 09:27:48,201,H,295.0,305.0,1494,41.0,246,0,18040,No Failure,6,9,1
2,2017-02-08 16:01:14,29,L,310.0,320.0,1428,64.7,300,0,656,No Failure,2,16,2
3,2016-02-17 00:18:19,83,M,302.1,313.4,1472,45.2,300,0,8192,No Failure,2,0,2
4,2015-04-09 23:11:44,205,M,300.2,310.3,1497,40.4,148,0,19304,No Failure,4,23,3


Basic data info

In [24]:
# Basic data info: the dtype pandas inferred for every column
print("\nColumn types", df.dtypes, sep="\n")



Column types
datetime                object
machineID                int64
Type                    object
Air_temperature        float64
Process_temperature    float64
Rotational_speed         int64
Torque                 float64
Tool_wear                int64
Target                   int64
RUL_hours                int64
Failure_Type            object
month                    int64
hour                     int64
dayofweek                int64
dtype: object


In [25]:
# Summary statistics produced by DataFrame.describe()
print("\nBasic Statistics", df.describe(), sep="\n")



Basic Statistics
           machineID  Air_temperature  Process_temperature  Rotational_speed  \
count  100000.000000    100000.000000        100000.000000     100000.000000   
mean      167.546840       303.772080           314.891460       1436.255490   
std       105.147196         5.553468             5.900532         47.305605   
min         1.000000       295.000000           305.000000       1272.000000   
25%        80.000000       300.300000           310.500000       1429.000000   
50%       160.000000       304.400000           317.300000       1438.000000   
75%       239.000000       310.000000           320.000000       1463.000000   
max       400.000000       310.000000           320.000000       1503.000000   

              Torque      Tool_wear         Target      RUL_hours  \
count  100000.000000  100000.000000  100000.000000  100000.000000   
mean       50.060780     288.413630       0.002500    6905.226400   
std         8.009187      46.877092       0.049938    

In [26]:
# Number of missing entries in each column
print("\nMissing values", df.isna().sum(), sep="\n")


Missing values
datetime               0
machineID              0
Type                   0
Air_temperature        0
Process_temperature    0
Rotational_speed       0
Torque                 0
Tool_wear              0
Target                 0
RUL_hours              0
Failure_Type           0
month                  0
hour                   0
dayofweek              0
dtype: int64


Data type conversion

In [27]:
# Convert column types based on column content.
# Every conversion is guarded by a membership check, so columns that are absent
# from the loaded file are skipped instead of raising a KeyError.
if 'UDI' in df.columns:
    df['UDI'] = df['UDI'].astype(int)

if 'Product ID' in df.columns:
    df['Product ID'] = df['Product ID'].astype(str)

if 'Type' in df.columns:
    df['Type'] = df['Type'].astype('category')

# The timestamp column is read from the CSV as plain strings (dtype `object`);
# parse it so it becomes a real datetime column.
if 'datetime' in df.columns:
    missing_before = df['datetime'].isna().sum()
    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
    newly_missing = df['datetime'].isna().sum() - missing_before
    if newly_missing > 0:
        # errors='coerce' turns unparseable values into NaT; report it rather
        # than letting missing values appear silently.
        print(f"Warning: {newly_missing} unparseable values in 'datetime' were set to NaT")

# Numeric columns.
# Both spellings are listed: the bracketed raw headers and the cleaned names.
# The loaded file already uses the cleaned names, so listing only the bracketed
# ones would make the loop below match nothing.
numeric_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Air_temperature',
    'Process_temperature',
    'Rotational_speed',
    'Torque',
    'Tool_wear'
]

for col in numeric_cols:
    if col in df.columns:
        missing_before = df[col].isna().sum()
        df[col] = pd.to_numeric(df[col], errors='coerce')
        newly_missing = df[col].isna().sum() - missing_before
        if newly_missing > 0:
            # Same reasoning as for the timestamp: make coercion losses visible.
            print(f"Warning: {newly_missing} unparseable values in '{col}' were set to NaN")

print("Data types converted")

Data types converted


Rename Columns

In [28]:
# Replace the raw, unit-suffixed headers with clean snake_case column names.
# Mapping keys that are not present in df are left alone by rename().
column_mapping = {
    'Air temperature [K]': 'Air_temperature',
    'Process temperature [K]': 'Process_temperature',
    'Rotational speed [rpm]': 'Rotational_speed',
    'Torque [Nm]': 'Torque',
    'Tool wear [min]': 'Tool_wear',
    'Failure Type': 'Failure_Type',
}

df = df.rename(mapper=column_mapping, axis='columns')
print("Columns renamed:", list(df.columns), sep="\n")

Columns renamed:
['datetime', 'machineID', 'Type', 'Air_temperature', 'Process_temperature', 'Rotational_speed', 'Torque', 'Tool_wear', 'Target', 'RUL_hours', 'Failure_Type', 'month', 'hour', 'dayofweek']


Remove duplicates

In [29]:
# Drop rows that are exact duplicates of an earlier row and report how many went away
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]

if final_count >= initial_count:
    print("No duplicates found")
else:
    print(f"Removed {initial_count - final_count} duplicates")

No duplicates found


Data quality check

In [30]:
def _print_range(column, unit, label="", spec=""):
    """Print the min-max span of df[column]; do nothing if the column is absent.

    label is written in front of the values; spec is the format spec applied to
    both bounds (an empty spec keeps their default string form).
    """
    if column not in df.columns:
        return
    series = df[column]
    print(f"   {label}{series.min():{spec}} - {series.max():{spec}} {unit}")


# Check value ranges (section headers are always printed, values only when the column exists)
print("\n1. Temperature ranges:")
_print_range('Air_temperature', 'K', label="Air: ", spec=".1f")
_print_range('Process_temperature', 'K', label="Process: ", spec=".1f")

print("\n2. Rotational speed range:")
_print_range('Rotational_speed', 'RPM')

print("\n3. Torque range:")
_print_range('Torque', 'Nm', spec=".1f")

print("\n4. Tool wear range:")
_print_range('Tool_wear', 'min')

# Check targets: class counts plus the mean of Target expressed as a percentage
print("\n5. Target distribution:")
if 'Target' in df.columns:
    print(df['Target'].value_counts())
    print(f"   Failure rate: {df['Target'].mean()*100:.2f}%")

# Row counts per machine type
print("\n6. Machine types:")
if 'Type' in df.columns:
    print(df['Type'].value_counts())



1. Temperature ranges:
   Air: 295.0 - 310.0 K
   Process: 305.0 - 320.0 K

2. Rotational speed range:
   1272 - 1503 RPM

3. Torque range:
   39.5 - 72.2 Nm

4. Tool wear range:
   0 - 300 min

5. Target distribution:
Target
0    99750
1      250
Name: count, dtype: int64
   Failure rate: 0.25%

6. Machine types:
Type
M    47294
L    30834
H    21872
Name: count, dtype: int64


Save preprocessed data

In [None]:
# Write the cleaned frame to CSV (index column omitted) and report its size
output_path = '../Data/preprocessed_data.csv'
df.to_csv(output_path, index=False)

print(f"\nPreprocessed data saved to: {output_path}")
print(f"Total records: {df.shape[0]:,}")
print(f"Total columns: {df.shape[1]}")


Preprocessed data saved to: ../Data/preprocessed_data.csv
Total records: 100,000
Total columns: 14
