## Device maintenance Task
Given a data set collected from sensors that monitor a certain device,  
a stakeholder perceived the need to improve the maintenance  
of this device.  


He approached you and described the business case as follows:  
"We have this data, and I understand that we need to improve the way we  
schedule the maintenance for this device. Can you assess this data and create  
an AI solution for it?"  


The data consists of 53 features and 1 class label.  
One of the features is a timestamp; the others are sensor observations.  
Can you help me to improve the maintenance somehow?  
You should prepare the code and a short presentation (10 minutes max)  
explaining your approach and why you decided to use the chosen approach.  

### Deliverables:
Code in an exported notebook  
Presentation

In [1]:
# Optional one-time environment setup, kept commented out.
# Uncomment and run only if the packages are missing from the kernel's environment.
# !pip install -q -U umap-learn
# !pip install imbalanced-learn
# ! pip install torch

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, balanced_accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import random
import json
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report
import utils
pd.set_option('display.max_rows', 60)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the raw sensor export; the first CSV column is used as the row index.
DATA_PATH = 'data/DATA.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
print(df.shape)
df.head()

(220320, 54)


Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
1,2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
2,2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL
3,2018-04-01 00:03:00,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,...,40.88541,39.0625,64.81481,51.21528,38.19444,155.9606,66.84028,240.4514,203.125,NORMAL
4,2018-04-01 00:04:00,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,...,41.40625,38.77315,65.10416,51.79398,38.77315,158.2755,66.55093,242.1875,201.3889,NORMAL


In [5]:
# Number of missing values per column, largest first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

sensor_15         220320
sensor_50          77017
sensor_51          15383
sensor_00          10208
sensor_07           5451
sensor_08           5107
sensor_06           4798
sensor_09           4595
sensor_01            369
sensor_30            261
sensor_29             72
sensor_32             68
sensor_18             46
sensor_17             46
sensor_22             41
sensor_25             36
sensor_16             31
sensor_49             27
sensor_48             27
sensor_47             27
sensor_46             27
sensor_45             27
sensor_44             27
sensor_43             27
sensor_42             27
sensor_41             27
sensor_40             27
sensor_39             27
sensor_38             27
sensor_14             21
sensor_26             20
sensor_03             19
sensor_10             19
sensor_13             19
sensor_12             19
sensor_11             19
sensor_05             19
sensor_04             19
sensor_02             19
sensor_36             16


In [6]:
# Drop the empty column: sensor_15 is null in every one of the 220320 rows.
# Rebinding `df` (instead of inplace=True) follows the pandas recommendation and
# keeps the step chainable; `inplace` brings no performance benefit.
df = df.drop(columns=["sensor_15"])

In [7]:
# Convert the textual timestamps to datetime values using an explicit format.
timestamp_format = '%Y-%m-%d %H:%M:%S'
df['timestamp'] = pd.to_datetime(df['timestamp'], format=timestamp_format)

In [8]:
# Every column except the last one (the last column is the `machine_status` label).
cols = df.columns[:-1]
n_cols = len(cols)
print(n_cols, cols)

52 Index(['timestamp', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03',
       'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08',
       'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13',
       'sensor_14', 'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19',
       'sensor_20', 'sensor_21', 'sensor_22', 'sensor_23', 'sensor_24',
       'sensor_25', 'sensor_26', 'sensor_27', 'sensor_28', 'sensor_29',
       'sensor_30', 'sensor_31', 'sensor_32', 'sensor_33', 'sensor_34',
       'sensor_35', 'sensor_36', 'sensor_37', 'sensor_38', 'sensor_39',
       'sensor_40', 'sensor_41', 'sensor_42', 'sensor_43', 'sensor_44',
       'sensor_45', 'sensor_46', 'sensor_47', 'sensor_48', 'sensor_49',
       'sensor_50', 'sensor_51'],
      dtype='object')


# Preprocessing

In [9]:
# Delegate cleaning to the project helper. It returns the processed frame together
# with a column list that replaces the `cols` computed in the previous cell.
# NOTE(review): utils.preprocess is not shown in this notebook; later cells read
# `idx` and `y` columns, so they are presumably created here — confirm in utils.py.
df, cols = utils.preprocess(df)

# Filtering
Remove the rows recorded during maintenance, then drop the trailing normal period that follows the last breakdown.


In [10]:
# Keep only the rows whose status is not MAINTENANCE.
outside_maintenance = df["machine_status"].ne("MAINTENANCE")
df = df[outside_maintenance]
print(df.shape)

(205843, 55)


In [11]:
# Keep everything up to and including the last BROKEN row; the normal period after
# it has no following breakdown to measure against.
last_broken_idx = df.loc[df["machine_status"] == "BROKEN", "idx"].values[-1]
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[df['idx'] <= last_broken_idx].copy()
print(df.shape)
# Renumber `idx` as a contiguous 0..n-1 row position after the filtering above.
df['idx'] = np.arange(df.shape[0])

(152039, 55)


# Create survival columns

In [12]:
# Build two per-row arrays from the positions in `broken_idx` (rows where y == 0):
#   survival    - rows remaining until the next such position (0 on that row itself)
#   incident_nb - ordinal of the segment that ends at that position
# `idx` values are used as positional offsets, so `idx` must be a contiguous 0..n-1 range.
n_rows = df.shape[0]
survival = np.zeros(n_rows)
incident = np.zeros(n_rows)
broken_idx = df.loc[df['y'] == 0, 'idx'].values
segment_start = 0
for incident_no, segment_end in enumerate(broken_idx):
    segment_len = segment_end - segment_start + 1
    segment = slice(segment_start, segment_end + 1)
    # Count down so the value reaches 0 exactly at `segment_end`.
    survival[segment] = np.arange(segment_len)[::-1]
    incident[segment] = np.ones(segment_len) * incident_no
    segment_start = segment_end + 1
df['survival'] = survival.astype(int)
df['incident_nb'] = incident.astype(int)

# Dataset creation for training

In [13]:
# Replace the row index with 0..n-1 so labels match row positions after filtering.
df.index = np.arange(len(df))

In [20]:
# Windowing settings passed to the project helper.
# NOTE(review): units (rows vs. minutes) are defined inside utils.prepare_dataset — confirm there.
window_size = 20
prediction_horizon = 1000
X, y_reg, y_classif, incident_ref, idx_ref = utils.prepare_dataset(
    df,
    cols,
    window_size,
    prediction_horizon,
    autocorr_window=1,
)
print(X.shape, y_reg.shape, Counter(y_classif))

(7599, 51, 20) (7599,) Counter({0: 7249, 1: 350})


In [26]:
# Stratified 70/30 split over sample positions, so the same indices can be used
# to slice X and the label arrays.
# NOTE(review): a random split over windows cut from one time series can place
# near-identical neighbouring windows in both train and test; consider splitting
# by incident (`incident_ref`) or by time — confirm with utils.prepare_dataset.
sample_positions = np.arange(len(y_classif))
idx_train, idx_test, Y_train, Y_test = train_test_split(
    sample_positions,
    y_classif,
    test_size=0.3,
    random_state=1,
    stratify=y_classif,
)
print(Counter(Y_train), Counter(Y_test))

Counter({0: 5074, 1: 245}) Counter({0: 2175, 1: 105})


In [27]:
# Swap the last two axes of X so each sample becomes (window step, feature),
# the layout expected by an LSTM built with batch_first=True.
train_features = torch.tensor(X[idx_train].transpose(0, 2, 1), dtype=torch.float32)
train_labels = torch.tensor(y_classif[idx_train], dtype=torch.long)
test_features = torch.tensor(X[idx_test].transpose(0, 2, 1), dtype=torch.float32)
test_labels = torch.tensor(y_classif[idx_test], dtype=torch.long)

train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=50, shuffle=False)

In [28]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Stacked LSTM encoder followed by a two-layer MLP head that returns raw logits.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state and of the MLP hidden layer.
        output_dim: number of values produced per input sequence.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.classifier = nn.Sequential(*head_layers)
        # Registered but not applied in forward(), which returns un-squashed logits.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the classifier output for a batch of sequences `x`."""
        # The LSTM returns (per-step outputs, (h_n, c_n)); only h_n is used.
        # h_n holds the final hidden state of each stacked layer.
        _, (final_hidden, _) = self.lstm(x)
        # Feed the final hidden state of the last (top) layer to the MLP head.
        top_layer_state = final_hidden[-1]
        return self.classifier(top_layer_state)

# Parameters for the LSTM model
input_dim = 51  # number of features per timestep
hidden_dim = 100  # hidden layer size
output_dim = 1  # single logit for the binary target (trained with BCEWithLogitsLoss)
num_layers = 2  # number of LSTM layers

# Seed torch's global RNG so the weight initialisation, and any later draws from
# the same RNG, are repeatable after Restart Kernel -> Run All.
torch.manual_seed(1)

# Initialize the LSTM model
model = LSTMClassifier(input_dim, hidden_dim, output_dim, num_layers)
print(model)

LSTMClassifier(
  (lstm): LSTM(51, 100, num_layers=2, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
  (sigmoid): Sigmoid()
)


In [29]:
# Binary cross-entropy on raw logits; pos_weight rescales the loss of positive samples.
# NOTE(review): 0.6 is below 1, so it down-weights the positive class, which is the
# minority class in the split (245 vs 5074 training samples) — confirm this is intended.
class_weights = torch.tensor([0.6])
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Train the model
num_epochs = 100
n_batches = len(train_loader)
model.train()  # explicit training mode, in case the model was switched to eval earlier
for epoch in range(num_epochs):
    total_loss = 0  # sum of the batch losses within this epoch
    for inputs, target in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Labels arrive as integer (batch,); the loss needs float (batch, 1) to match the logits.
        loss = criterion(outputs, target.float().view(-1, 1))
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # n_batches replaces the leaked loop index, which is undefined when the loader is empty.
    print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{n_batches}/{len(train_loader)}], train loss {total_loss}')

Epoch [1/100], Step [107/107], train loss 41.14000126346946
Epoch [2/100], Step [107/107], train loss 13.437049891799688
Epoch [3/100], Step [107/107], train loss 13.391793897375464
Epoch [4/100], Step [107/107], train loss 13.371289612725377
Epoch [5/100], Step [107/107], train loss 13.25111173838377
Epoch [6/100], Step [107/107], train loss 12.946743007749319
Epoch [7/100], Step [107/107], train loss 12.867713704705238
Epoch [8/100], Step [107/107], train loss 12.61963115632534
Epoch [9/100], Step [107/107], train loss 12.640232594683766
Epoch [10/100], Step [107/107], train loss 12.759952707216144
Epoch [11/100], Step [107/107], train loss 12.60412055812776
Epoch [12/100], Step [107/107], train loss 12.610445758327842
Epoch [13/100], Step [107/107], train loss 12.448856864124537
Epoch [14/100], Step [107/107], train loss 12.271467560902238
Epoch [15/100], Step [107/107], train loss 12.231596982106566
Epoch [16/100], Step [107/107], train loss 12.349553745239973
Epoch [17/100], Step 

# Test model

In [30]:
# Score the held-out windows: threshold the sigmoid of the logits at 0.5 and
# compare with the true labels.
predictions = []
truth = []
model.eval()  # inference mode; no optimizer calls are needed while evaluating
with torch.no_grad():  # no autograd graph is built, which saves memory and time
    for inputs, target in test_loader:
        outputs = model(inputs)
        probabilities = torch.sigmoid(outputs)
        predictions.append((probabilities > 0.5).float().numpy())
        truth.append(target.numpy())

# Flatten the per-batch (batch, 1) predictions into one vector aligned with the labels.
predictions = np.concatenate(predictions).flatten()
target = np.concatenate(truth)
print(Counter(predictions), Counter(target))
print(f1_score(target, predictions))

Counter({0.0: 2195, 1.0: 85}) Counter({0: 2175, 1: 105})
0.8
