In [1]:
# Refer to path of package
# NOTE(review): this appends a machine-specific virtualenv site-packages directory to
# sys.path, so the notebook runs unchanged only where that exact path exists.
# Presumably it makes the third-party imports below resolvable — confirm, and prefer
# starting the kernel inside that environment instead.
import sys
sys.path.append('/Users/watcharapongwongrattanasirikul/Documents/Git/Jupyter/my_env/lib/python3.8/site-packages')

In [2]:
# Import Lib
import os
import re
from random import choices, randint, randrange, uniform

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report

In [3]:
from ml_helper import MlHelper
from eda_helper import EdaHelper
from sampling_helper import SamplingHelper
from impute_helper import ImputeHelper
from impute_helper import imputation_strategy
from ml_helper import model_type
from ml_helper import resampler_type
from ml_helper import scaler_type
import cleansing_helper as CleansingHelper

In [4]:
# Path of the time-series CSV evaluated by this notebook. The default is the original
# local copy; set the PREDICTIVE_MAINTENANCE_CSV environment variable to point the
# notebook at the file on another machine without editing this cell.
file_path = os.environ.get(
    'PREDICTIVE_MAINTENANCE_CSV',
    '/Users/watcharapongwongrattanasirikul/Documents/Git/predictive-maintenance/test_timeseries.csv',
)

df = pd.read_csv(file_path)
df.head(2)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,8001,M22860,M,300.8,312.0,1443,53.3,151,0,0,0,0,0,0
1,8002,L55181,L,300.8,312.0,1374,50.2,154,0,0,0,0,0,0


In [5]:
# Remove the RNF column. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after RNF is already
# gone no longer raises KeyError.
df = df.drop(columns=['RNF'], errors='ignore')
df.head(2)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF
0,8001,M22860,M,300.8,312.0,1443,53.3,151,0,0,0,0,0
1,8002,L55181,L,300.8,312.0,1374,50.2,154,0,0,0,0,0


In [6]:
# Give the measurement columns short, identifier-style names for the cells below
df = df.rename(
    mapper={
        'Air temperature [K]': 'AirTemp',
        'Process temperature [K]': 'ProcessTemp',
        'Rotational speed [rpm]': 'RotationalSpeed',
        'Torque [Nm]': 'Torque',
        'Tool wear [min]': 'ToolWear',
        'Machine failure': 'MF',
    },
    axis='columns',
)
df.head(2)

Unnamed: 0,UDI,Product ID,Type,AirTemp,ProcessTemp,RotationalSpeed,Torque,ToolWear,MF,TWF,HDF,PWF,OSF
0,8001,M22860,M,300.8,312.0,1443,53.3,151,0,0,0,0,0
1,8002,L55181,L,300.8,312.0,1374,50.2,154,0,0,0,0,0


In [7]:
# Power is the important feature for power failure, so derive a Power column.
# Torque [Nm] x rotational speed [rpm] is only proportional to mechanical power
# (P[W] = Torque x rpm x 2*pi/60, so it is not in kW); the unscaled product is
# what gets stored as the feature.

df['Power'] = df['Torque'].mul(df['RotationalSpeed'])
df.head(2)

Unnamed: 0,UDI,Product ID,Type,AirTemp,ProcessTemp,RotationalSpeed,Torque,ToolWear,MF,TWF,HDF,PWF,OSF,Power
0,8001,M22860,M,300.8,312.0,1443,53.3,151,0,0,0,0,0,76911.9
1,8002,L55181,L,300.8,312.0,1374,50.2,154,0,0,0,0,0,68974.8


In [8]:
# Temperature is the important feature for heat dissipation failure, so add the
# gap between process temperature and air temperature as its own column.
df['AirDiff'] = df['ProcessTemp'].sub(df['AirTemp'])
df.head(2)

Unnamed: 0,UDI,Product ID,Type,AirTemp,ProcessTemp,RotationalSpeed,Torque,ToolWear,MF,TWF,HDF,PWF,OSF,Power,AirDiff
0,8001,M22860,M,300.8,312.0,1443,53.3,151,0,0,0,0,0,76911.9,11.2
1,8002,L55181,L,300.8,312.0,1374,50.2,154,0,0,0,0,0,68974.8,11.2


In [9]:
# Interaction feature: torque multiplied by tool wear
df['TorqueToolWear'] = df['Torque'].mul(df['ToolWear'])
df.head(2)

Unnamed: 0,UDI,Product ID,Type,AirTemp,ProcessTemp,RotationalSpeed,Torque,ToolWear,MF,TWF,HDF,PWF,OSF,Power,AirDiff,TorqueToolWear
0,8001,M22860,M,300.8,312.0,1443,53.3,151,0,0,0,0,0,76911.9,11.2,8048.3
1,8002,L55181,L,300.8,312.0,1374,50.2,154,0,0,0,0,0,68974.8,11.2,7730.8


### Utility Function

In [10]:
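# This function encodes each data point as a class label according to the
# standard-deviation band it falls in (mean and std are taken over the whole input):
# a = data less than or equal to mean - 3*std
# b = data less than or equal to mean - 2*std
# c = data less than or equal to mean - 1*std
# d = data less than or equal to mean
# e = data less than or equal to mean + 1*std
# f = data less than or equal to mean + 2*std
# g = data less than or equal to mean + 3*std
# h = data greater than mean + 3*std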

def encode_feature(df, is_print_info=False):
    """Label every value of a numeric sequence by its standard-deviation band.

    The mean and (population) standard deviation are computed over the whole
    input. Each value then receives the label of the first band whose upper
    bound it does not exceed:

        'a' <= mean-3*std, 'b' <= mean-2*std, 'c' <= mean-1*std, 'd' <= mean,
        'e' <= mean+1*std, 'f' <= mean+2*std, 'g' <= mean+3*std, otherwise 'h'.

    Args:
        df: iterable of numbers (the notebook passes a DataFrame column).
        is_print_info: when True, print the mean, the std, every band bound and
            every value as it is processed.

    Returns:
        list of str: one label per input value, in input order.
    """
    mean = np.mean(df)
    std = np.std(df)

    # Upper bound of each band, ordered from the lowest band to the highest.
    band_limits = (
        ('a', mean - (3*std)),
        ('b', mean - (2*std)),
        ('c', mean - (1*std)),
        ('d', mean),
        ('e', mean + (1*std)),
        ('f', mean + (2*std)),
        ('g', mean + (3*std)),
    )

    if is_print_info:
        print(f'mean: {mean}')
        print(f'std: {std}')
        for label, limit in band_limits:
            print(f'{label}: {limit}')

    encoder_list = []

    for data_point in df:
        if is_print_info:
            print(data_point)

        # First band that contains the value; anything above the last bound is 'h'.
        matched_label = next(
            (label for label, limit in band_limits if data_point <= limit),
            'h',
        )
        encoder_list.append(matched_label)

    return encoder_list

In [11]:
def get_expression(data, num_pattern=1):
    """Build one regular expression string per token pattern.

    Each pattern in ``data`` is a sequence of tokens:

    * a band label 'a'..'h' or the wildcard '.*' is copied into the expression;
    * '|' joins the token before it and the token after it into one
      alternative, e.g. ['d', '|', 'c'] becomes '[d|c]';
    * any other token is ignored.

    Every expression is wrapped in a leading and a trailing '.*'.

    Args:
        data: sequence of token patterns; only the first ``num_pattern`` are used.
        num_pattern: number of patterns to convert (default 1).

    Returns:
        list of str: one expression per converted pattern, in order.

    Raises:
        ValueError: if '|' is the first or last token of a pattern, or if a '|'
            is never followed by a band label / wildcard that completes it.
        IndexError: if ``num_pattern`` is larger than ``len(data)``.
    """
    symbols = ('a', 'b', 'c', 'd','e','f','g','h','.*')
    expression_group = []

    for j in range(num_pattern):
        tokens = data[j]

        # Alternation state is reset for every pattern, so an unfinished '|' in one
        # pattern can no longer corrupt the expression built for the next one.
        or_flag = False
        buf = ""
        reg_result = ""

        for idx, token in enumerate(tokens):

            if token == '|':
                if reg_result == "":
                    raise ValueError(f"The symbol '|' can't be the first parameter: {data}")

                if (idx + 1) == len(tokens):
                    raise ValueError(f"The symbol '|' can't be the last parameter: {data}")

                if or_flag:
                    # NOTE(review): a second '|' inside an open alternative adds a
                    # literal 'd'; kept from the original code — confirm the intent.
                    reg_result = reg_result + 'd'
                else:
                    # Re-open the most recent plain token as the left-hand side.
                    # NOTE(review): inside '[...]' the '|' (and a '.*' alternative) is
                    # matched literally; harmless for strings made only of 'a'..'h'.
                    reg_result = reg_result[0:(len(reg_result) - len(buf))]
                    reg_result = reg_result + f"[{buf}|"
                    or_flag = True
                continue

            if token in symbols:
                if or_flag:
                    # Right-hand side of an alternative: close the bracket.
                    reg_result = reg_result + f"{token}]"
                    or_flag = False
                else:
                    reg_result = reg_result + token
                    buf = token

        if or_flag:
            # Without this check the result would contain an unterminated '[' and
            # only fail later, inside the regex engine, with a less helpful error.
            raise ValueError(f"The symbol '|' must be followed by a valid parameter: {data}")

        expression_group.append('.*'+reg_result+".*")

    return expression_group

### Model

In [12]:
# Custom Ensemble Classification Model

from sklearn.base import BaseEstimator

class MachineFailureClassification(BaseEstimator):
    """Rule-based detector that flags failure types from encoded sensor windows.

    Every failure type owns a list of token patterns (converted to regular
    expressions by ``get_expression``). A failure type is reported when at least
    one of its patterns is found in the joined label string of the feature it is
    checked against:

    * PWF -> encoded power
    * HDF -> encoded torque, and only when the window's mean ``AirDiff`` is below
      ``AIRDIFF_THRESHOLD``
    * OSF -> encoded tool wear
    * TWF -> encoded tool wear

    NOTE(review): OSF and TWF are matched against the same feature and share their
    first pattern, so they are reported together whenever it matches — confirm
    this is intended.
    """

    def __init__(self):
        self.TWF = [['e', 'e', 'e', 'e', 'e', 'e'], ['b', 'e', 'g', 'g', 'b', 'e']]
        self.HDF = [['d', '|', 'c', 'd', 'd', 'e'], ['g', 'f', 'd', 'f', 'c', 'c']]
        self.PWF = [['e', 'c', '|', 'd', 'd', 'd'], ['g', 'c', 'f', 'g', 'f', 'b']]
        self.OSF = [['e', 'e', 'e', 'e', 'e', 'e'], ['d', 'f', 'd', 'f', 'f', 'c']]

        self.AIRDIFF_THRESHOLD = 9.27  # This is mean - (0.5 x sd)

    def fit(self, X=None, y=None):
        """No-op: the patterns are fixed in ``__init__``.

        ``X`` and ``y`` are accepted and ignored so the estimator can also be
        called as ``fit(X, y)``; calling ``fit()`` without arguments still works.
        """
        return self

    @staticmethod
    def _matches_any(patterns, encoded):
        """Return True if any of ``patterns`` occurs in the joined ``encoded`` labels."""
        joined = ''.join(encoded)
        expressions = get_expression(patterns, num_pattern=len(patterns))
        return any(re.search(expression, joined) is not None for expression in expressions)

    def predict(self, df, encoded_power, encoded_torque, encoded_toolwear):
        """Check one window against the patterns of every failure type.

        Args:
            df: data of the window; only its ``AirDiff`` column is read, and only
                when an HDF pattern matched.
            encoded_power: label sequence of the power feature for the window.
            encoded_torque: label sequence of the torque feature for the window.
            encoded_toolwear: label sequence of the tool-wear feature for the window.

        Returns:
            tuple: ``(predict_fail, fails_type)`` where ``fails_type`` lists the
            detected failure types in the order PWF, HDF, OSF, TWF and
            ``predict_fail`` is True when that list is not empty.
        """
        fails_type = []

        # ------------------------- Predict PWF -------------------------
        if self._matches_any(self.PWF, encoded_power):
            fails_type.append('PWF')

        # ------------------------- Predict HDF -------------------------
        # The pattern alone is not enough: the mean AirDiff of the window must
        # also be below the threshold. The mean is evaluated only after a match.
        if (self._matches_any(self.HDF, encoded_torque)
                and df['AirDiff'].mean() < self.AIRDIFF_THRESHOLD):
            fails_type.append('HDF')

        # ------------------------- Predict OSF -------------------------
        if self._matches_any(self.OSF, encoded_toolwear):
            fails_type.append('OSF')

        # ------------------------- Predict TWF -------------------------
        if self._matches_any(self.TWF, encoded_toolwear):
            fails_type.append('TWF')

        # ------------------------- Summary -------------------------
        predict_fail = bool(fails_type)

        return predict_fail, fails_type
    

### Prediction

In [13]:
# Prepare data: encode every feature column into its band labels.
# The four names are module-level on purpose; later cells read them.

encoded_torque, encoded_power, encoded_toolwear, encoder_torquetoolwear = (
    encode_feature(df[column])
    for column in ('Torque', 'Power', 'ToolWear', 'TorqueToolWear')
)

In [14]:
# Initial input
# Instantiate the pattern-based failure classifier. prediction_period() below
# uses this module-level `ml_classification` object.

ml_classification = MachineFailureClassification()

In [15]:
def prediction_period(df, window_size:int, is_print_logs=False, classifier=None, encoded=None):
    """Score window-by-window failure prediction against the following window.

    The rows are split into consecutive windows of ``window_size`` rows. For each
    window that has a complete following window, the classifier is asked whether
    a failure is coming, and the answer is compared with whether any ``MF`` value
    in the following window is greater than zero in total.

    Args:
        df: DataFrame with an ``MF`` column; each window slice is also passed to
            the classifier.
        window_size: number of rows per window.
        is_print_logs: when True, print the prediction of every round.
        classifier: object with ``predict(window_df, power, torque, toolwear)``
            returning ``(predict_fail, fails_type)``. Defaults to the module-level
            ``ml_classification``.
        encoded: mapping with the keys ``'power'``, ``'torque'`` and
            ``'toolwear'`` holding the label sequences aligned with ``df`` rows.
            Defaults to the module-level ``encoded_power``, ``encoded_torque``
            and ``encoded_toolwear``.

    Returns:
        tuple: ``(f1_score, accuracy, precision, recall)``; a metric whose
        denominator is zero is reported as 0.
    """
    # Fall back to the notebook globals only when nothing was passed explicitly,
    # so existing calls keep working while the dependencies can be injected.
    if classifier is None:
        classifier = ml_classification
    if encoded is None:
        encoded = {
            'power': encoded_power,
            'torque': encoded_torque,
            'toolwear': encoded_toolwear,
        }

    power_codes = encoded['power']
    torque_codes = encoded['torque']
    toolwear_codes = encoded['toolwear']

    # Calculate number of rolling rounds: every full window except the last one,
    # because each round needs a complete following window to compare against.
    number_rolling = int(len(df) // window_size) - 1

    # Loop-invariant: the actual failure flags.
    actual_failures = df['MF']

    TP = 0
    FP = 0
    TN = 0
    FN = 0

    # Start rolling
    for idx in range(number_rolling):
        start = idx * window_size
        end = start + window_size

        # Predict fail on the new window
        predict_fail, fails_type = classifier.predict(
            df.iloc[start:end],
            power_codes[start:end],
            torque_codes[start:end],
            toolwear_codes[start:end],
        )

        if is_print_logs:
            print(f'round {idx}: Will next window is failed?: {predict_fail} with type: {fails_type}')

        # Actual next window fail
        is_fail_next_period = actual_failures.iloc[end:end + window_size].sum() > 0

        # Update the confusion-matrix counters
        if predict_fail and is_fail_next_period:
            TP += 1
        elif predict_fail:
            FP += 1
        elif is_fail_next_period:
            FN += 1
        else:
            TN += 1

    # Calculate accuracy
    total = TN + FP + TP + FN
    accuracy = (TN + TP)/total if total else 0

    # Calculate recall
    recall = TP/(TP + FN) if (TP + FN) else 0

    # Calculate precision
    precision = TP/(TP + FP) if (TP + FP) else 0

    # Calculate f1 score
    if (precision + recall) == 0:
        f1_score = 0
    else:
        f1_score = 2 * ((precision * recall)/(precision + recall))

    return (f1_score, accuracy, precision, recall)
    
    

In [16]:
# Evaluate on the whole frame with 25-row windows. is_print_logs=True prints one
# line per round, which produces the long log shown in the output.
f1_score, accuracy, precision, recall = prediction_period(
    df=df,
    window_size=25,
    is_print_logs=True
)

round 0: Will next window is failed?: True with type: ['PWF', 'OSF', 'TWF']
round 1: Will next window is failed?: False with type: []
round 2: Will next window is failed?: False with type: []
round 3: Will next window is failed?: True with type: ['OSF', 'TWF']
round 4: Will next window is failed?: False with type: []
round 5: Will next window is failed?: True with type: ['PWF']
round 6: Will next window is failed?: True with type: ['OSF', 'TWF']
round 7: Will next window is failed?: False with type: []
round 8: Will next window is failed?: True with type: ['PWF']
round 9: Will next window is failed?: True with type: ['OSF', 'TWF']
round 10: Will next window is failed?: True with type: ['PWF', 'OSF', 'TWF']
round 11: Will next window is failed?: True with type: ['PWF']
round 12: Will next window is failed?: True with type: ['PWF']
round 13: Will next window is failed?: True with type: ['PWF', 'OSF', 'TWF']
round 14: Will next window is failed?: False with type: []
round 15: Will next wi

In [17]:
# F1 score returned by prediction_period() above
f1_score

0.631578947368421

In [18]:
# Accuracy returned by prediction_period() above
accuracy

0.6455696202531646

In [19]:
# Recall returned by prediction_period() above
recall

0.8888888888888888

In [20]:
# Precision returned by prediction_period() above
precision

0.4897959183673469