In [1]:
!pip -q install gdown

import gdown
import zipfile
import os
import pandas as pd
import numpy as np
from pathlib import Path
import logging


In [2]:
# Download and extract the dataset
file_id = "1MNPMnS1eQw8fGvE1F9E6emUhFhHod_mS"
zip_path = "/content/pdm_data.zip"
extract_dir = "/content/pdm_data"

# Download zip from Google Drive
gdown.download(f"https://drive.google.com/uc?id={file_id}", zip_path, quiet=False)

# Extract zip
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_dir)

print("Extracted files:", os.listdir(extract_dir))


Downloading...
From (original): https://drive.google.com/uc?id=1MNPMnS1eQw8fGvE1F9E6emUhFhHod_mS
From (redirected): https://drive.google.com/uc?id=1MNPMnS1eQw8fGvE1F9E6emUhFhHod_mS&confirm=t&uuid=3060d187-78d6-45c8-b63e-73e58c392fc3
To: /content/pdm_data.zip
100%|██████████| 32.5M/32.5M [00:00<00:00, 52.1MB/s]


Extracted files: ['PdM_errors.csv', 'PdM_machines.csv', 'PdM_failures.csv', 'PdM_telemetry.csv', 'PdM_maint.csv']


In [3]:
#Load all datasets
base_path = "/content/pdm_data"

datasets = {
    "telemetry": pd.read_csv(os.path.join(base_path, "PdM_telemetry.csv")),
    "errors": pd.read_csv(os.path.join(base_path, "PdM_errors.csv")),
    "failures": pd.read_csv(os.path.join(base_path, "PdM_failures.csv")),
    "machines": pd.read_csv(os.path.join(base_path, "PdM_machines.csv")),
    "maintenance": pd.read_csv(os.path.join(base_path, "PdM_maint.csv")),
}

# Preview dataset keys
print("datasets:", list(datasets.keys()))

datasets: ['telemetry', 'errors', 'failures', 'machines', 'maintenance']


In [4]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Copy the feature engineered file to Google Drive
!cp "/content/processed/telemetry_feature_engineered.csv" "/content/drive/MyDrive/"

print("File mounted to Google Drive!")
print("File location: /content/drive/MyDrive/telemetry_feature_engineered.csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cp: cannot stat '/content/processed/telemetry_feature_engineered.csv': No such file or directory
File mounted to Google Drive!
File location: /content/drive/MyDrive/telemetry_feature_engineered.csv


In [13]:
import pandas as pd

# Update the path to your Google Drive location
file_path = "/content/drive/MyDrive/telemetry_feature_engineered.csv"
# Load CSV
telemetry = pd.read_csv(file_path)

# Convert datetime column
telemetry['datetime'] = pd.to_datetime(telemetry['datetime'])

# Final cleanup (if any missing values)
telemetry = telemetry.ffill().bfill()
telemetry = telemetry.dropna()

print("Dataset loaded. Shape:", telemetry.shape)
print("Failure distribution:")
print(telemetry['failure_next_24h'].value_counts())
# Features and target
X = telemetry.drop(columns=['datetime', 'machineID', 'failure_next_24h']).copy()
y = telemetry['failure_next_24h'].values.astype(np.float32)



Dataset loaded. Shape: (876142, 51)
Failure distribution:
failure_next_24h
0.0    858865
1.0     17277
Name: count, dtype: int64


In [14]:
X = telemetry.drop(columns=['datetime', 'machineID', 'failure_next_24h']).copy()

# List all non-numeric columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical columns to encode:", categorical_cols)


Categorical columns to encode: ['model', 'failure']


In [15]:
from sklearn.preprocessing import LabelEncoder

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])


In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

print("Scaled features shape:", X_scaled.shape)


Scaled features shape: (876142, 48)


In [20]:
from tensorflow.keras.utils import Sequence
import numpy as np

class TelemetrySequence(Sequence):
    def __init__(self, X, y, sequence_length=12, batch_size=128, indices=None):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        self.sequence_length = sequence_length
        self.batch_size = batch_size

        if indices is None:
            self.indices = np.arange(sequence_length, len(X))
        else:
            self.indices = indices

    def __len__(self):
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, idx):
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
        X_batch = np.array([self.X[i-self.sequence_length:i] for i in batch_indices], dtype=np.float32)
        y_batch = np.array([self.y[i] for i in batch_indices], dtype=np.float32)
        return X_batch, y_batch


In [21]:
sequence_length = 12
batch_size = 128

# All sequence starting indices
all_indices = np.arange(sequence_length, len(X_scaled))

# 80% train, 20% validation
train_size = int(0.8 * len(all_indices))
train_indices = all_indices[:train_size]
val_indices = all_indices[train_size:]

# Generators
train_gen = TelemetrySequence(X_scaled, y, sequence_length=sequence_length, batch_size=batch_size, indices=train_indices)
val_gen = TelemetrySequence(X_scaled, y, sequence_length=sequence_length, batch_size=batch_size, indices=val_indices)


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

sequence_length = 12  # number of timesteps in each sequence
num_features = X_scaled.shape[1]

model = Sequential()
model.add(LSTM(128, input_shape=(sequence_length, num_features), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [24]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=20,
    callbacks=[early_stop]
)




Epoch 1/20


  self._warn_if_super_not_called()


[1m5476/5476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 9ms/step - accuracy: 0.9795 - loss: 0.1020 - val_accuracy: 0.9769 - val_loss: 0.0850
Epoch 2/20
[1m5476/5476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 10ms/step - accuracy: 0.9812 - loss: 0.0732 - val_accuracy: 0.9787 - val_loss: 0.0803
Epoch 3/20
[1m5476/5476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 9ms/step - accuracy: 0.9856 - loss: 0.0651 - val_accuracy: 0.9839 - val_loss: 0.0714
Epoch 4/20
[1m5476/5476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 9ms/step - accuracy: 0.9865 - loss: 0.0617 - val_accuracy: 0.9840 - val_loss: 0.0694
Epoch 5/20
[1m5476/5476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 9ms/step - accuracy: 0.9872 - loss: 0.0591 - val_accuracy: 0.9851 - val_loss: 0.0676
Epoch 6/20
[1m5476/5476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 9ms/step - accuracy: 0.9872 - loss: 0.0583 - val_accuracy: 0.9857 - val_loss: 0.0608
Epoch 7/20
[1m5476/

In [25]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

y_pred = []
y_true = []

for X_batch, y_batch in val_gen:
    # Predict probabilities and convert to 0/1
    y_pred_batch = (model.predict(X_batch) > 0.5).astype(int)
    y_pred.extend(y_pred_batch.flatten())
    y_true.extend(y_batch.flatten())

y_pred = np.array(y_pred)
y_true = np.array(y_true)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8

KeyboardInterrupt: 

In [26]:
print("Classification Report:")
print(classification_report(y_true, y_pred, digits=4))


Classification Report:
              precision    recall  f1-score   support

         0.0     0.9916    0.9973    0.9945    127697
         1.0     0.8379    0.6249    0.7159      2863

    accuracy                         0.9891    130560
   macro avg     0.9148    0.8111    0.8552    130560
weighted avg     0.9883    0.9891    0.9883    130560

