In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [17]:
from h1st.model.predictive_model import PredictiveModel
class RuleModel(PredictiveModel):
    # 98 percentile
    daily_thresholds = {
        "volt": 188.83,  # >
        "rotate": 373.05,  # <
        "vibration": 49.38,  # >
    }

    def predict_pointwise(self, input_data):
        daily_average = np.average(
            input_data.reshape((-1, 4)), axis=0
        )
        volt_val = daily_average[0]
        rotate_val = daily_average[1]
        vibration_val = daily_average[3]

        pred = {"comp1": 0, "comp2": 0, "comp4": 0}
        pred['comp1'] = 1 if volt_val > self.daily_thresholds["volt"] else 0
        pred['comp2'] = 1 if rotate_val < self.daily_thresholds["rotate"] else 0
        pred['comp4'] = 1 if vibration_val > self.daily_thresholds["vibration"] else 0
        return pred

    def predict(self, input_data):
        df = input_data['X']
        return {'predictions': pd.DataFrame(
            map(self.predict_pointwise, df.values), 
        )}  

In [28]:
import numpy as np
from h1st.model.fuzzy import (
    FuzzyVariables,
    FuzzyMembership as fm,
    FuzzyRules,
    FuzzyModel,
    FuzzyModeler
)

def get_metadata(data):
    res = {}
    for k, v in dict(data.describe().loc['max']).items():
        res[k] = {'max': v}
    for k, v in dict(data.describe().loc['min']).items():
        res[k].update({'min': v})    
    return res

def create_fuzzy_model(data):
    metadata = get_metadata(data) # df_model3_daily
    fuzzy_vars = FuzzyVariables()
    fuzzy_vars.add(
        var_name='volt',
        var_type='antecedent',
        var_range=np.arange(
            metadata['volt']['min'], 
            metadata['volt']['max'], 
            0.1
        ),
        membership_funcs=[('high', fm.SIGMOID, [188.83, 0.25]), # 180
                        ('low', fm.SIGMOID, [188.83, -0.25])]
    )
    fuzzy_vars.add(
        var_name='rotate',
        var_type='antecedent',
        var_range=np.arange(
            metadata['rotate']['min'], 
            metadata['rotate']['max'], 
            0.1
        ),
        membership_funcs=[('high', fm.SIGMOID, [373.05, 0.06]), # 400
                        ('low', fm.SIGMOID, [373.05, -0.15])]
    )
    fuzzy_vars.add(
        var_name='vibration',
        var_type='antecedent',
        var_range=np.arange(
            metadata['vibration']['min'], 
            metadata['vibration']['max'], 
            0.1
        ),
        membership_funcs=[('high', fm.SIGMOID, [49.38, 0.5]), # 44 ?
                        ('low', fm.SIGMOID, [49.38, -0.5])]
    )
    fuzzy_vars.add(
        var_name='comp1',
        var_type='consequent',
        var_range=np.arange(0, 1+1e-5, 0.1),
        membership_funcs=[('false', fm.GAUSSIAN, [0, 0.4]),
                        ('true', fm.GAUSSIAN, [1, 0.4])]
    )
    fuzzy_vars.add(
        var_name='comp2',
        var_type='consequent',
        var_range=np.arange(0, 1+1e-5, 0.1),
        membership_funcs=[('false', fm.GAUSSIAN, [0, 0.4]),
                        ('true', fm.GAUSSIAN, [1, 0.4])]
    )
    fuzzy_vars.add(
        var_name='comp4',
        var_type='consequent',
        var_range=np.arange(0, 1+1e-5, 0.1),
        membership_funcs=[('false', fm.GAUSSIAN, [0, 0.4]),
                        ('true', fm.GAUSSIAN, [1, 0.4])]
    )

    fuzzy_rule = FuzzyRules()
    fuzzy_rule.add(
        'rule1',
        if_term=fuzzy_vars.get('volt')['high']&fuzzy_vars.get('rotate')['high']&fuzzy_vars.get('vibration')['low'],
        then_term=fuzzy_vars.get('comp1')['true']
    )
    fuzzy_rule.add(
        'rule2',
        if_term=fuzzy_vars.get('rotate')['low']&fuzzy_vars.get('volt')['low']&fuzzy_vars.get('vibration')['low'],
        then_term=fuzzy_vars.get('comp2')['true']
    )
    fuzzy_rule.add(
        'rule3',
        if_term=fuzzy_vars.get('vibration')['high']&fuzzy_vars.get('volt')['low']&fuzzy_vars.get('rotate')['high'],
        then_term=fuzzy_vars.get('comp4')['true']
    )
    fuzzy_rule.add(
        'rule4',
        if_term=fuzzy_vars.get('volt')['low'],
        then_term=fuzzy_vars.get('comp1')['false']
    )
    fuzzy_rule.add(
        'rule5',
        if_term=fuzzy_vars.get('rotate')['high'],
        then_term=fuzzy_vars.get('comp2')['false']
    )
    fuzzy_rule.add(
        'rule6',
        if_term=fuzzy_vars.get('vibration')['low'],
        then_term=fuzzy_vars.get('comp4')['false']
    )

    class CustomFuzzyModel(FuzzyModel):
        def process_rule(self, input_data: dict) -> dict:
            if self.rule_engine is None:
                raise ValueError(
                    (
                        "Property rule_engine is None. Please load your rule_engine "
                        "to run this method."
                    )
                )

            input_data = np.array(list(input_data.values()))
            daily_average = np.average(
                input_data.reshape((-1, 4)), axis=0
            )
            input_data = {
                "volt": daily_average[0],
                "rotate": daily_average[1],
                "vibration": daily_average[3]
            } 
            for key, value in input_data.items():
                self.rule_engine.input[key] = value
            self.rule_engine.compute()

            outputs = {}
            for cls in self.rule_engine.ctrl.consequents:
                outputs[cls.label] = round(self.rule_engine.output[cls.label], 5)
            return outputs

    modeler = FuzzyModeler(model_class=CustomFuzzyModel)
    teacher = modeler.build_model(fuzzy_vars, fuzzy_rule)
    return teacher

In [19]:
# you can download the data from kaggle using the following link.
# https://www.kaggle.com/datasets/arnabbiswas1/microsoft-azure-predictive-maintenance

import os
from datetime import datetime, timedelta

dir_path = os.getcwd() + '/azure_iot'
path = f'{dir_path}/'
telemetry_url = 'PdM_telemetry.csv'
df = pd.read_csv(path + 'PdM_telemetry.csv')
df.loc[:, 'datetime'] = pd.to_datetime(df['datetime'])
df.loc[:, 'datetime'] = df['datetime'] - pd.Timedelta(hours=6)
df_machines = pd.read_csv(path + 'PdM_machines.csv')
df = df.join(df_machines.set_index('machineID'), on='machineID')
df.loc[:, 'date'] = df['datetime'].dt.date
df_failures = pd.read_csv(path + 'PdM_failures.csv')
df_failures.shape

for model_type in ['model1', 'model2', 'model3', 'model4']:
    id_list = df[df.model == model_type].machineID.unique()
    print(f'{model_type} failure records:', df_failures[df_failures.machineID.isin(id_list)].shape)

df_model3 = df[df.model=='model3']
df_model3 = df_model3[df_model3.date != datetime(2016, 1, 1).date()]
df_model3.shape

df_model3_failures = df_failures[df_failures.machineID.isin(df_model3.machineID.unique())]
df_model3_failures.shape

df_model3_failures['datetime'] = pd.to_datetime(df_model3_failures['datetime'])
df_model3_failures['date'] = df_model3_failures['datetime'].apply(lambda x: x.date())
df_model3_failures['date_1'] = df_model3_failures['date'] - timedelta(days=1)
df_model3_failures.failure.value_counts()

df_model3_daily = df_model3.groupby(['date', 'machineID']).agg('mean')

percentile = 0.96
print('volt:', df_model3_daily['volt'].quantile(percentile))
print('rotate:', df_model3_daily['rotate'].quantile(1-percentile))
print('vibration:', df_model3_daily['vibration'].quantile(percentile))

df_model3.machineID.nunique()

model1 failure records: (189, 3)
model2 failure records: (168, 3)
model3 failure records: (221, 3)
model4 failure records: (183, 3)
volt: 178.994274651537
rotate: 405.58135851278706
vibration: 43.93598497211916



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


35

In [20]:
keys = ['machineID', 'date']
features = ['volt', 'rotate', 'pressure', 'vibration']
column_values = []

for idx in range(24):
    column_values.extend([f'{feature}_{idx}' for feature in features])
print(column_values)

['volt_0', 'rotate_0', 'pressure_0', 'vibration_0', 'volt_1', 'rotate_1', 'pressure_1', 'vibration_1', 'volt_2', 'rotate_2', 'pressure_2', 'vibration_2', 'volt_3', 'rotate_3', 'pressure_3', 'vibration_3', 'volt_4', 'rotate_4', 'pressure_4', 'vibration_4', 'volt_5', 'rotate_5', 'pressure_5', 'vibration_5', 'volt_6', 'rotate_6', 'pressure_6', 'vibration_6', 'volt_7', 'rotate_7', 'pressure_7', 'vibration_7', 'volt_8', 'rotate_8', 'pressure_8', 'vibration_8', 'volt_9', 'rotate_9', 'pressure_9', 'vibration_9', 'volt_10', 'rotate_10', 'pressure_10', 'vibration_10', 'volt_11', 'rotate_11', 'pressure_11', 'vibration_11', 'volt_12', 'rotate_12', 'pressure_12', 'vibration_12', 'volt_13', 'rotate_13', 'pressure_13', 'vibration_13', 'volt_14', 'rotate_14', 'pressure_14', 'vibration_14', 'volt_15', 'rotate_15', 'pressure_15', 'vibration_15', 'volt_16', 'rotate_16', 'pressure_16', 'vibration_16', 'volt_17', 'rotate_17', 'pressure_17', 'vibration_17', 'volt_18', 'rotate_18', 'pressure_18', 'vibration

In [21]:
def preprocess_data(list_of_daily_data, features):
    x_train_list = []
    y_train_list = []
    for idx, df_daily_one in list_of_daily_data:
        mid = idx[0]
        date = idx[1]

        if df_daily_one.shape[0] != 24:
            continue

        df_filtered_f = df_model3_failures[
            (df_model3_failures.date_1==date)&(df_model3_failures.machineID==mid)]
        y_label = {"comp1": 0, "comp2": 0, "comp4": 0}
        if df_filtered_f.shape[0] >= 1:
            for i in range(df_filtered_f.shape[0]):
                y_label.update({df_filtered_f['failure'].iloc[i]: 1})
        x_train_list.append(np.concatenate(df_daily_one[features].values).tolist())
        y_train_list.append(y_label)
    return x_train_list, y_train_list

In [22]:
from sklearn.utils import resample

def handle_data_imbalance(df_x, df_y):
    df_train_y_normal = df_y[df_y.sum(axis=1) == 0]
    df_train_y_normal_resampled = resample(df_train_y_normal,
                replace=True,
                n_samples=100,
                random_state=42)
    df_train_x_normal_resampled = df_x.filter(items = df_train_y_normal_resampled.index, axis=0)
    assert all(df_train_y_normal_resampled.head().index == df_train_x_normal_resampled.head().index)

    df_train_y_abnormal = df_y[df_y.sum(axis=1) != 0]
    df_train_y_abnormal_resampled = resample(df_train_y_abnormal,
                replace=True,
                n_samples=600,
                random_state=42)
    df_train_y_abnormal_resampled.shape
    df_train_x_abnormal_resampled = df_x.filter(items = df_train_y_abnormal_resampled.index, axis=0)
    df_train_x_abnormal_resampled.shape
    assert all(df_train_y_abnormal_resampled.head().index == df_train_x_abnormal_resampled.head().index)

    df_train_y_final = pd.concat([df_train_y_normal_resampled, df_train_y_abnormal_resampled], axis=0)
    df_train_x_final = pd.concat([df_train_x_normal_resampled, df_train_x_abnormal_resampled], axis=0)

    return df_train_x_final, df_train_y_final

In [30]:
from collections import defaultdict
from sklearn.model_selection import KFold
from h1st.model.oracle import OracleModeler
from h1st.model.oracle.ensembler_modelers import MLPEnsembleModeler

# prepare cross validation
kfold = KFold(n_splits=5, shuffle=True, random_state=3)

all_metrics = defaultdict(lambda: defaultdict(list))
all_index = defaultdict(lambda: defaultdict(list))

# 1. build rule-based model
bool_teacher = RuleModel()
fuzzy_teacher = create_fuzzy_model(df_model3_daily)
fuzzy_thresholds = {'comp1': 0.46, 'comp2': 0.48, 'comp4': 0.46}

for no, m_ids in enumerate(kfold.split(df_model3.machineID.unique())):
    train_id, test_id = m_ids
    df_train = df_model3[df_model3.machineID.isin(train_id)]
    df_test = df_model3[df_model3.machineID.isin(test_id)]

    temp_gb = df_train.groupby(keys)
    list_of_train_daily = [item for item in temp_gb]

    temp_gb = df_test.groupby(keys)
    list_of_test_daily = [item for item in temp_gb]

    # print(len(train_id), len(test_id),df_train.shape, df_test.shape)
    # print(f'number of data points in train dataset: {len(list_of_train_daily)}')
    # print(f'number of data points in test dataset: {len(list_of_test_daily)}')

    
    x_train_list, y_train_list = preprocess_data(list_of_train_daily, features)
    x_test_list, y_test_list = preprocess_data(list_of_test_daily, features)

    df_train_x = pd.DataFrame(x_train_list, columns=column_values)
    df_train_y = pd.DataFrame(y_train_list)
    df_test_x = pd.DataFrame(x_test_list, columns=column_values)
    df_test_y = pd.DataFrame(y_test_list)

    df_train_x_final, df_train_y_final = handle_data_imbalance(df_train_x, df_train_y)
    # print(df_train_x.shape, df_train_y.shape, df_test_x.shape, df_test_y.shape)
    # print(no, df_train_x_final.shape, df_train_y_final.shape)   
    input_data = {
        "unlabeled_data": df_train_x_final,
        "labeled_data": {
            "X_train": df_train_x_final.reset_index(drop=True),
            "y_train": df_train_y_final.reset_index(drop=True),
            "X_test": df_test_x,
            "y_test": df_test_y,
        },
    }

    # 3. build oracle
    # 3.1 bool
    modeler = OracleModeler()
    oracle_with_bool = modeler.build_model(
        data=input_data,
        teacher_model=bool_teacher)

    # # 3.2 fuzzy
    oracle_with_fuzzy = modeler.build_model(
        data=input_data,
        teacher_model=fuzzy_teacher,
        fuzzy_thresholds=fuzzy_thresholds)

    # 3.3 bool + ml_ensemble
    oracle_with_bool_ml = modeler.build_model(
        data=input_data, 
        teacher_model=bool_teacher,
        ensembler_modeler=MLPEnsembleModeler)

    # 3.4 fuzzy + ml_ensemble
    oracle_with_fuzzy_ml = modeler.build_model(
        data=input_data, 
        teacher_model=fuzzy_teacher,
        fuzzy_thresholds=fuzzy_thresholds,
        ensembler_modeler=MLPEnsembleModeler) 

    # 3.5 bool + ml_ensemble + x
    oracle_with_bool_ml_x = modeler.build_model(
        data=input_data, 
        teacher_model=bool_teacher,
        ensembler_modeler=MLPEnsembleModeler,
        inject_x_in_ensembler=True)

    # 3.6 fuzzy + ml_ensemble + x
    oracle_with_fuzzy_ml_x = modeler.build_model(
        data=input_data, 
        teacher_model=fuzzy_teacher,
        fuzzy_thresholds=fuzzy_thresholds,
        ensembler_modeler=MLPEnsembleModeler,
        inject_x_in_ensembler=True) 

    model_map = {
        "oracle_with_bool": oracle_with_bool,
        "oracle_with_fuzzy": oracle_with_fuzzy,
        "oracle_with_bool_ml": oracle_with_bool_ml,
        "oracle_with_fuzzy_ml": oracle_with_fuzzy_ml,
        "oracle_with_bool_ml_x": oracle_with_bool_ml_x,
        "oracle_with_fuzzy_ml_x": oracle_with_fuzzy_ml_x,        
    }

    # 4. collect evaluation results
    for name, oracle in model_map.items():
        for metrics in ["f1_score", 'precision', 'recall']:
            for label in ['comp1', 'comp2', 'comp4']:
                temp = oracle.metrics[metrics][label]
                s1, s2 = temp.pop('students')
                temp.update({'student1': s1, 'student2': s2})
                all_metrics[metrics][name].append({label: temp})
                # all_index[metrics][name].append(f'{label}_{name}_{no}')

2022-10-10 10:02:57.929 | INFO     | h1st.model.oracle.oracle_modeler:build_model:142 - Evaluated all sub models successfully.
2022-10-10 10:03:04.970 | INFO     | h1st.model.oracle.oracle_modeler:build_model:142 - Evaluated all sub models successfully.
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
X has feature names, but StandardScaler was fitted without feature names
2022-10-10 10:03:08.157 | INFO     | h1st.model.oracle.oracle_modeler:build_model:142 - E

In [33]:
all_avg_metrics = []
oracle_names = []
for metrics_type in ['f1_score', 'precision', 'recall']:
    for oracle_name, metrics in all_metrics[metrics_type].items():

        temp = defaultdict(list)
        for m in metrics:
            for label, met in m.items():
                temp[label].append(met)

        for label in temp:
            df_metrics = pd.DataFrame(temp[label])
            # print(metrics_type, oracle_name, df_metrics.mean().values)
            all_avg_metrics.append(
                [metrics_type, label] + list(df_metrics.mean().values)
            )
            oracle_names.append(f'{label}_{oracle_name}')

final_metrics = pd.DataFrame(
    all_avg_metrics,
    columns=['metrics_type', 'label', 'teacher', 'ensemblers', 'student1', 'student2'],
    index=oracle_names
)

In [34]:
final_metrics[final_metrics.label=='comp1']

Unnamed: 0,metrics_type,label,teacher,ensemblers,student1,student2
comp1_oracle_with_bool,f1_score,comp1,0.196812,0.166924,0.157436,0.178076
comp1_oracle_with_fuzzy,f1_score,comp1,0.230684,0.195534,0.191666,0.191432
comp1_oracle_with_bool_ml,f1_score,comp1,0.196812,0.135528,0.157436,0.178076
comp1_oracle_with_fuzzy_ml,f1_score,comp1,0.230684,0.214818,0.191666,0.191432
comp1_oracle_with_bool_ml_x,f1_score,comp1,0.196812,0.18163,0.157436,0.178076
comp1_oracle_with_fuzzy_ml_x,f1_score,comp1,0.230684,0.178948,0.191666,0.191432
comp1_oracle_with_bool,precision,comp1,0.123628,0.108272,0.133334,0.108418
comp1_oracle_with_fuzzy,precision,comp1,0.139714,0.128682,0.125338,0.112046
comp1_oracle_with_bool_ml,precision,comp1,0.123628,0.085832,0.133334,0.108418
comp1_oracle_with_fuzzy_ml,precision,comp1,0.139714,0.154648,0.125338,0.112046


In [35]:
final_metrics[final_metrics.label=='comp2']

Unnamed: 0,metrics_type,label,teacher,ensemblers,student1,student2
comp2_oracle_with_bool,f1_score,comp2,0.24649,0.187436,0.050794,0.258216
comp2_oracle_with_fuzzy,f1_score,comp2,0.243028,0.1188,0.112102,0.26408
comp2_oracle_with_bool_ml,f1_score,comp2,0.24649,0.290644,0.050794,0.258216
comp2_oracle_with_fuzzy_ml,f1_score,comp2,0.243028,0.317284,0.112102,0.26408
comp2_oracle_with_bool_ml_x,f1_score,comp2,0.24649,0.277346,0.050794,0.258216
comp2_oracle_with_fuzzy_ml_x,f1_score,comp2,0.243028,0.28135,0.112102,0.26408
comp2_oracle_with_bool,precision,comp2,0.172528,0.154808,0.083332,0.194088
comp2_oracle_with_fuzzy,precision,comp2,0.16492,0.195556,0.150794,0.186646
comp2_oracle_with_bool_ml,precision,comp2,0.172528,0.197184,0.083332,0.194088
comp2_oracle_with_fuzzy_ml,precision,comp2,0.16492,0.199784,0.150794,0.186646


In [36]:
final_metrics[final_metrics.label=='comp4']

Unnamed: 0,metrics_type,label,teacher,ensemblers,student1,student2
comp4_oracle_with_bool,f1_score,comp4,0.296398,0.265862,0.253068,0.239716
comp4_oracle_with_fuzzy,f1_score,comp4,0.28269,0.253772,0.270234,0.221276
comp4_oracle_with_bool_ml,f1_score,comp4,0.296398,0.25613,0.253068,0.239716
comp4_oracle_with_fuzzy_ml,f1_score,comp4,0.28269,0.25279,0.270234,0.221276
comp4_oracle_with_bool_ml_x,f1_score,comp4,0.296398,0.245874,0.253068,0.239716
comp4_oracle_with_fuzzy_ml_x,f1_score,comp4,0.28269,0.243598,0.270234,0.221276
comp4_oracle_with_bool,precision,comp4,0.18855,0.164394,0.158658,0.144296
comp4_oracle_with_fuzzy,precision,comp4,0.170894,0.151054,0.160876,0.128074
comp4_oracle_with_bool_ml,precision,comp4,0.18855,0.152436,0.158658,0.144296
comp4_oracle_with_fuzzy_ml,precision,comp4,0.170894,0.164916,0.160876,0.128074


In [32]:
final_metrics[final_metrics.label=='comp1'].to_csv('azure_comp1.csv')
final_metrics[final_metrics.label=='comp2'].to_csv('azure_comp2.csv')
final_metrics[final_metrics.label=='comp4'].to_csv('azure_comp4.csv')