In [1]:
# Import libraries

import numpy as np
from numpy import quantile, where, random

import pandas as pd
import random

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import multivariate_normal

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import max_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

from itertools import combinations

import warnings

import sys

sys.path.insert(0, 'C:/Users/gioca/GitHub/projects/maintenance_industry_4_2024/supporting_scripts/WP_4_20240528/')
from utility import read_all_test_data_from_path, extract_selected_feature, prepare_sliding_window, FaultDetectReg, read_all_csvs_one_test, run_cv_one_motor, show_reg_result, show_clf_result

In [2]:
# Data preprocessing definition (training)

from scipy.signal import butter, filtfilt

def compensate_seq_bias(df: pd.DataFrame):
    ''' # Description
    Adjust for the sequence-to-sequence bias.
    '''
    # Tranform the features relative to the first data point.
    df['temperature'] = df['temperature'] - df['temperature'].iloc[0]
    df['voltage'] = df['voltage'] - df['voltage'].iloc[0]
    df['position'] = df['position'] - df['position'].iloc[0]

# Function to design a Butterworth low-pass filter
def butter_lowpass(cutoff, fs, order=5):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

# Function to apply the Butterworth low-pass filter
def lowpass_filter(data, cutoff_freq, sampling_freq, order=5):
    b, a = butter_lowpass(cutoff_freq, sampling_freq, order=order)
    filtered_data = filtfilt(b, a, data)
    return filtered_data

# Set parameters for the low-pass filter
cutoff_frequency = .5  # Adjust as needed
sampling_frequency = 10  # Assuming your data is evenly spaced in time

def customized_outlier_removal(df: pd.DataFrame):
    ''' # Description
    Remove outliers from the dataframe based on defined valid ranges. 
    Define a valid range of temperature and voltage. 
    Use ffil function to replace the invalid measurement with the previous value.
    '''
    df['position'] = df['position'].where(df['position'] <= 1000, np.nan)
    df['position'] = df['position'].where(df['position'] >= 0, np.nan)
    df['position'] = df['position'].ffill()
    df['position'] = lowpass_filter(df['position'], cutoff_frequency, sampling_frequency)
    df['position'] = df['position'].rolling(window=10, min_periods=1).mean()
    df['position'] = df['position'].round()

    df['temperature'] = df['temperature'].where(df['temperature'] <= 100, np.nan)
    df['temperature'] = df['temperature'].where(df['temperature'] >= 0, np.nan)
    df['temperature'] = df['temperature'].rolling(window=10, min_periods=1).mean()

    # Make sure that the difference between the current and previous temperature cannot be too large.
    # Define your threshold
    threshold = 10
    # Shift the 'temperature' column by one row to get the previous temperature
    prev_tmp = df['temperature'].shift(1)
    # Calculate the absolute difference between current and previous temperature
    temp_diff = np.abs(df['temperature'] - prev_tmp)
    # Set the temperature to NaN where the difference is larger than the threshold
    df.loc[temp_diff > threshold, 'temperature'] = np.nan
    df['temperature'] = df['temperature'].ffill()

    df['voltage'] = df['voltage'].where(df['voltage'] >= 6000, np.nan)
    df['voltage'] = df['voltage'].where(df['voltage'] <= 8000, np.nan)
    df['voltage'] = df['voltage'].ffill()
    df['voltage'] = lowpass_filter(df['voltage'], cutoff_frequency, sampling_frequency)
    df['voltage'] = df['voltage'].rolling(window=5, min_periods=1).mean()

def pre_processing(df: pd.DataFrame):
    ''' ### Description
    Preprocess the data:
    - remove outliers
    - Adjust for the sequence-to-sequence bias.
    - add new features about the difference between the current and previous n data point.
    '''     
    # Start processing.
    customized_outlier_removal(df)
    compensate_seq_bias(df)

from utility import read_all_csvs_one_test
import matplotlib.pyplot as plt
import os

base_dictionary = 'C:/Users/gioca/OneDrive - Politecnico di Milano/POLISSS/CentraleSupelec/Maintenance&I4.0/training_data/'

# Get all the folders in the base_dictionary
path_list = os.listdir(base_dictionary)
# Only keep the folders, not the excel file.
path_list = path_list[:-1]

# Read the data.
df_data_smooth = pd.DataFrame()
for tmp_path in path_list:
    path = base_dictionary + tmp_path
    # Read the data with the customized outlier removal function.
    tmp_df = read_all_csvs_one_test(path, tmp_path, pre_processing)
    df_data_smooth = pd.concat([df_data_smooth, tmp_df])
    df_data_smooth = df_data_smooth.reset_index(drop=True)

# df_data_smooth_tr = df_data_smooth[df_data_smooth['test_condition'].isin(normal_test_id)]

In [3]:
# Data reading and pre-processing (training)

# Read the data.
df_tr = pd.DataFrame()
for tmp_path in path_list:
    path = base_dictionary + tmp_path
    # Read the data with the customized outlier removal function.
    tmp_df = read_all_csvs_one_test(path, tmp_path, customized_outlier_removal)
    df_tr = pd.concat([df_tr, tmp_df])
    df_tr = df_tr.reset_index(drop=True)

conditions_path = r'C:/Users/gioca/OneDrive - Politecnico di Milano/POLISSS/CentraleSupelec/Maintenance&I4.0/training_data/Test conditions.xlsx'
conditions = pd.read_excel(conditions_path)
df_merged = df_tr.merge(conditions, how='left', left_on='test_condition', right_on='Test id')
df_tr['operation'] = df_merged['Description']
list_operations = pd.unique(conditions.Description).tolist()

In [4]:
# Data preprocessing definition (testing)

from scipy.signal import butter, filtfilt

def compensate_seq_bias(df: pd.DataFrame):
    ''' # Description
    Adjust for the sequence-to-sequence bias.
    '''
    # Tranform the features relative to the first data point.
    df['temperature'] = df['temperature'] - df['temperature'].iloc[0]
    df['voltage'] = df['voltage'] - df['voltage'].iloc[0]
    df['position'] = df['position'] - df['position'].iloc[0]

# Function to design a Butterworth low-pass filter
def butter_lowpass(cutoff, fs, order=5):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

# Function to apply the Butterworth low-pass filter
def lowpass_filter(data, cutoff_freq, sampling_freq, order=5):
    b, a = butter_lowpass(cutoff_freq, sampling_freq, order=order)
    filtered_data = filtfilt(b, a, data)
    return filtered_data

# Set parameters for the low-pass filter
cutoff_frequency = .5  # Adjust as needed
sampling_frequency = 10  # Assuming your data is evenly spaced in time

def customized_outlier_removal(df: pd.DataFrame):
    ''' # Description
    Remove outliers from the dataframe based on defined valid ranges. 
    Define a valid range of temperature and voltage. 
    Use ffil function to replace the invalid measurement with the previous value.
    '''
    df['position'] = df['position'].where(df['position'] <= 1000, np.nan)
    df['position'] = df['position'].where(df['position'] >= 0, np.nan)
    df['position'] = df['position'].ffill()
    df['position'] = lowpass_filter(df['position'], cutoff_frequency, sampling_frequency)
    df['position'] = df['position'].rolling(window=10, min_periods=1).mean()
    df['position'] = df['position'].round()

    df['temperature'] = df['temperature'].where(df['temperature'] <= 100, np.nan)
    df['temperature'] = df['temperature'].where(df['temperature'] >= 0, np.nan)
    df['temperature'] = df['temperature'].rolling(window=10, min_periods=1).mean()

    # Make sure that the difference between the current and previous temperature cannot be too large.
    # Define your threshold
    threshold = 10
    # Shift the 'temperature' column by one row to get the previous temperature
    prev_tmp = df['temperature'].shift(1)
    # Calculate the absolute difference between current and previous temperature
    temp_diff = np.abs(df['temperature'] - prev_tmp)
    # Set the temperature to NaN where the difference is larger than the threshold
    df.loc[temp_diff > threshold, 'temperature'] = np.nan
    df['temperature'] = df['temperature'].ffill()

    df['voltage'] = df['voltage'].where(df['voltage'] >= 6000, np.nan)
    df['voltage'] = df['voltage'].where(df['voltage'] <= 8000, np.nan)
    df['voltage'] = df['voltage'].ffill()
    df['voltage'] = lowpass_filter(df['voltage'], cutoff_frequency, sampling_frequency)
    df['voltage'] = df['voltage'].rolling(window=5, min_periods=1).mean()

def pre_processing(df: pd.DataFrame):
    ''' ### Description
    Preprocess the data:
    - remove outliers
    - Adjust for the sequence-to-sequence bias.
    - add new features about the difference between the current and previous n data point.
    '''     
    # Start processing.
    customized_outlier_removal(df)
    compensate_seq_bias(df)

from utility import read_all_csvs_one_test
import matplotlib.pyplot as plt
import os

base_dictionary = 'C:/Users/gioca/OneDrive - Politecnico di Milano/POLISSS/CentraleSupelec/Maintenance&I4.0/testing_data/'

# Get all the folders in the base_dictionary
path_list = os.listdir(base_dictionary)
# Only keep the folders, not the excel file.
path_list = path_list[:-1]

# Read the data.
df_data_smooth = pd.DataFrame()
for tmp_path in path_list:
    path = base_dictionary + tmp_path
    # Read the data with the customized outlier removal function.
    tmp_df = read_all_csvs_one_test(path, tmp_path, pre_processing)
    df_data_smooth = pd.concat([df_data_smooth, tmp_df])
    df_data_smooth = df_data_smooth.reset_index(drop=True)

# df_data_smooth_tr = df_data_smooth[df_data_smooth['test_condition'].isin(normal_test_id)]

In [5]:
# Data reading and pre-processing (test)

# Read the data.

df_te = pd.DataFrame()
for tmp_path in path_list:
    path = base_dictionary + tmp_path
    # Read the data with the customized outlier removal function.
    tmp_df_test = read_all_csvs_one_test(path, tmp_path, customized_outlier_removal)
    df_te = pd.concat([df_te, tmp_df_test])
    df_te = df_te.reset_index(drop=True)

conditions_path = r'C:/Users/gioca/OneDrive - Politecnico di Milano/POLISSS/CentraleSupelec/Maintenance&I4.0/testing_data/Test conditions.xlsx'
conditions = pd.read_excel(conditions_path)
df_merged = df_te.merge(conditions, how='left', left_on='test_condition', right_on='Test id')
df_te['operation'] = df_merged['Description']
list_operations = pd.unique(conditions.Description).tolist()

In [6]:
def find_test_conditions_for_motor_x(dataset, motor_number):
    """
    Trova le condizioni di test per le quali, riducendo il dataset, compare almeno un data_motor_x_label = 1.
    
    Args:
    - dataset (pd.DataFrame): Il dataset da analizzare.
    - motor_number (int): Il numero del motore da considerare.
    
    Returns:
    - List: Lista delle test_condition che soddisfano il criterio.
    """
    # Nome della colonna da cercare
    target_column = f'data_motor_{motor_number}_label'
    
    # Trova tutte le condizioni di test uniche
    test_conditions = dataset['test_condition'].unique()
    
    # Lista per memorizzare le condizioni di test valide
    valid_conditions = []
    
    # Verifica ogni condizione di test
    for condition in test_conditions:
        # Filtra il dataset per la condizione di test corrente
        filtered_dataset = dataset[dataset['test_condition'] == condition]
        
        # Verifica se esiste almeno una riga con data_motor_x_label = 1
        if (filtered_dataset[target_column] == 1).any():
            valid_conditions.append(condition)
    
    return valid_conditions

In [15]:
from imblearn.over_sampling import SMOTE

motor_number = 3
smote = SMOTE(random_state=42)

# Trova le condizioni di test valide per il motore corrente
valid_conditions = find_test_conditions_for_motor_x(df_tr, motor_number)
    
# Filtra il DataFrame originale usando le condizioni di test valide
df_tr_x = df_tr[df_tr['test_condition'].isin(valid_conditions)]

# Split the dataset based on the value of y
df_tr_x_class_0 = df_tr_x[df_tr_x['data_motor_3_label'] == 0]
df_tr_x_class_1 = df_tr_x[df_tr_x['data_motor_3_label'] == 1]

# Randomly reduce the number of instances in df_class_0 by half
df_tr_x_class_0 = df_tr_x_class_0.sample(frac=0.045, random_state=42)

# Merge the two groups to obtain a new balanced dataset
df_balanced = pd.concat([df_tr_x_class_0, df_tr_x_class_1])

# Initialize rf model
rf_model = RandomForestClassifier()
results = []

# Dividi il dataset in test e training
df_train = df_balanced
df_test = df_te

# Converti 'operation' in variabili dummy
feature_columns = [f'data_motor_{i}_position' for i in range(1, 7)] + \
                  [f'data_motor_{i}_temperature' for i in range(1, 7)] + \
                  [f'data_motor_{i}_voltage' for i in range(1, 7)] + \
                  ['operation']

# Converti 'operation' in variabili dummy e concatena con le altre feature
X_train = pd.get_dummies(df_train[feature_columns])
X_test = pd.get_dummies(df_test[feature_columns])

# Assicurati che le colonne di X_train e X_test siano le stesse
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

y_train = df_train[f'data_motor_{motor_number}_label']
# y_test = df_test[f'data_motor_{motor_number}_label']

# Addestra il modello sui dati di training
rf_model.fit(X_train, y_train)

# Fai previsioni sui dati di test
y_pred = rf_model.predict(X_test)

# Calcola le metriche
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, zero_division=0)
# recall = recall_score(y_test, y_pred, zero_division=0)
# f1 = f1_score(y_test, y_pred, zero_division=0)

# Memorizza i risultati
# results.append({
        # 'condition': condition,
        #  'accuracy': accuracy,
        #  'precision': precision,
        #  'recall': recall,
        #  'f1_score': f1
#  })

df_pred = pd.DataFrame({
    'time': df_te['time'],
    'test_condition': df_te['test_condition']
})

for x in range(1, 7):
    df_pred[f'data_motor_{x}_label'] = -1

# Inserimento dei valori di y_pred nella colonna data_motor_3_label
df_pred['data_motor_3_label'] = y_pred

In [13]:
# Random Forest on the failure dataset, with specific motor features and not including the operation as feature

models = []

for motor_number in range(1, 6+1):

    print("Motor ", motor_number)

    # Trova le condizioni di test valide per il motore corrente
    valid_conditions = find_test_conditions_for_motor_x(df_tr, motor_number)
    
    # Filtra il DataFrame originale usando le condizioni di test valide
    df_tr_x = df_tr[df_tr['test_condition'].isin(valid_conditions)]

    # Initialize rf model

    rf_model = RandomForestClassifier()
    results = []

    for condition in valid_conditions:
        # Dividi il dataset in test e training
        df_train = df_tr_x[df_tr_x['test_condition'] != condition]
        df_test = df_tr_x[df_tr_x['test_condition'] == condition]
    
        X_train = df_train[[f'data_motor_{motor_number}_position', f'data_motor_{motor_number}_temperature', f'data_motor_{motor_number}_voltage']]
        y_train = df_train[[f'data_motor_{motor_number}_label']]
        y_train = y_train[f'data_motor_{motor_number}_label']
    
        X_test = df_test[[f'data_motor_{motor_number}_position', f'data_motor_{motor_number}_temperature', f'data_motor_{motor_number}_voltage']]
        y_test = df_test[[f'data_motor_{motor_number}_label']]
        y_test = y_test[f'data_motor_{motor_number}_label']
    
        # Addestra il modello sui dati di training
        rf_model.fit(X_train, y_train)
    
        # Fai previsioni sui dati di test
        y_pred = rf_model.predict(X_test)
    
        # Calcola le metriche
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
    
        # Memorizza i risultati
        results.append({
            'condition': condition,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })
    
    # Calcolare le medie delle metriche
    avg_accuracy = sum(result['accuracy'] for result in results) / len(results)
    avg_precision = sum(result['precision'] for result in results) / len(results)
    avg_recall = sum(result['recall'] for result in results) / len(results)
    avg_f1 = sum(result['f1_score'] for result in results) / len(results)

    # Stampare i risultati medi per ogni motore
    print(f"Average results for motor {motor_number}:")
    print(f"Average Accuracy: {avg_accuracy:.2f}")
    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    print("")

Motor  1
Average results for motor 1:
Average Accuracy: 0.66
Average Precision: 0.22
Average Recall: 0.05
Average F1 Score: 0.04

Motor  2
Average results for motor 2:
Average Accuracy: 0.34
Average Precision: 0.35
Average Recall: 0.34
Average F1 Score: 0.06

Motor  3
Average results for motor 3:
Average Accuracy: 0.94
Average Precision: 0.00
Average Recall: 0.00
Average F1 Score: 0.00

Motor  4
Average results for motor 4:
Average Accuracy: 0.12
Average Precision: 0.42
Average Recall: 0.71
Average F1 Score: 0.22

Motor  5
Average results for motor 5:
Average Accuracy: 0.87
Average Precision: 0.00
Average Recall: 0.00
Average F1 Score: 0.00

Motor  6
Average results for motor 6:
Average Accuracy: 0.54
Average Precision: 0.43
Average Recall: 0.46
Average F1 Score: 0.32



In [8]:
# Random Forest on all the datasets, including the operation as feature

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assicurati che 'operation' sia una colonna categoriale
df_tr['operation'] = df_tr['operation'].astype('category')

all_conditions = conditions['Test id'].tolist()

models = []

for motor_number in range(1, 7):
    print("Motor ", motor_number)

    # Trova le condizioni di test valide per il motore corrente
    valid_conditions = find_test_conditions_for_motor_x(df_tr, motor_number)
    
    # Filtra il DataFrame originale usando le condizioni di test valide
    # df_tr_x = df_tr[df_tr['test_condition'].isin(valid_conditions)]

    # Initialize rf model
    rf_model = RandomForestClassifier()
    results = []

    for condition in all_conditions:
        # Dividi il dataset in test e training
        df_train = df_tr[df_tr['test_condition'] != condition]
        df_test = df_tr[df_tr['test_condition'] == condition]

        # Converti 'operation' in variabili dummy
        X_train = pd.get_dummies(df_train[[f'data_motor_{motor_number}_position', f'data_motor_{motor_number}_temperature', f'data_motor_{motor_number}_voltage', 'operation']])
        X_test = pd.get_dummies(df_test[[f'data_motor_{motor_number}_position', f'data_motor_{motor_number}_temperature', f'data_motor_{motor_number}_voltage', 'operation']])

        # Assicurati che le colonne di X_train e X_test siano le stesse
        X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

        y_train = df_train[f'data_motor_{motor_number}_label']
        y_test = df_test[f'data_motor_{motor_number}_label']

        # Addestra il modello sui dati di training
        rf_model.fit(X_train, y_train)

        # Fai previsioni sui dati di test
        y_pred = rf_model.predict(X_test)

        # Calcola le metriche
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Memorizza i risultati
        results.append({
            'condition': condition,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })

        print("Test done: ", condition)

    # Calcolare le medie delle metriche
    avg_accuracy = sum(result['accuracy'] for result in results) / len(results)
    avg_precision = sum(result['precision'] for result in results) / len(results)
    avg_recall = sum(result['recall'] for result in results) / len(results)
    avg_f1 = sum(result['f1_score'] for result in results) / len(results)

    # Stampare i risultati medi per ogni motore
    print(f"Average results for motor {motor_number}:")
    print(f"Average Accuracy: {avg_accuracy:.2f}")
    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    print("")


Motor  1
Test done:  20240105_164214
Test done:  20240105_165300
Test done:  20240105_165972
Test done:  20240320_152031
Test done:  20240320_153841
Test done:  20240320_155664
Test done:  20240321_122650
Test done:  20240325_135213
Test done:  20240325_152902
Test done:  20240325_155003
Test done:  20240425_093699
Test done:  20240425_094425
Test done:  20240426_140055
Test done:  20240426_141190
Test done:  20240426_141532
Test done:  20240426_141602
Test done:  20240426_141726
Test done:  20240426_141938
Test done:  20240426_141980
Test done:  20240503_163963
Test done:  20240503_164435
Test done:  20240503_164675
Test done:  20240503_165189
Test done:  20240524_094877
Test done:  20240524_100052
Test done:  20240524_101062
Test done:  20240524_101487
Test done:  20240524_102066
Test done:  20240524_102736
Test done:  20240524_102301
Test done:  20240524_103973
Test done:  20240524_104453
Test done:  20240524_104923
Test done:  20240524_105370
Test done:  20240524_105836
Test done: 

In [11]:
# Random Forest on failure datasets, with the specific motor features and including the operation as feature

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assicurati che 'operation' sia una colonna categoriale
df_tr['operation'] = df_tr['operation'].astype('category')

all_conditions = conditions['Test id'].tolist()

models = []

for motor_number in range(1, 7):
    print("Motor ", motor_number)

    # Trova le condizioni di test valide per il motore corrente
    valid_conditions = find_test_conditions_for_motor_x(df_tr, motor_number)
    
    # Filtra il DataFrame originale usando le condizioni di test valide
    df_tr_x = df_tr[df_tr['test_condition'].isin(valid_conditions)]

    # Initialize rf model
    rf_model = RandomForestClassifier()
    results = []

    for condition in valid_conditions:
        # Dividi il dataset in test e training
        df_train = df_tr_x[df_tr_x['test_condition'] != condition]
        df_test = df_tr_x[df_tr_x['test_condition'] == condition]

        # Converti 'operation' in variabili dummy
        X_train = pd.get_dummies(df_train[[f'data_motor_{motor_number}_position', f'data_motor_{motor_number}_temperature', f'data_motor_{motor_number}_voltage', 'operation']])
        X_test = pd.get_dummies(df_test[[f'data_motor_{motor_number}_position', f'data_motor_{motor_number}_temperature', f'data_motor_{motor_number}_voltage', 'operation']])

        # Assicurati che le colonne di X_train e X_test siano le stesse
        X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

        y_train = df_train[f'data_motor_{motor_number}_label']
        y_test = df_test[f'data_motor_{motor_number}_label']

        # Addestra il modello sui dati di training
        rf_model.fit(X_train, y_train)

        # Fai previsioni sui dati di test
        y_pred = rf_model.predict(X_test)

        # Calcola le metriche
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Memorizza i risultati
        results.append({
            'condition': condition,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })


    # Calcolare le medie delle metriche
    avg_accuracy = sum(result['accuracy'] for result in results) / len(results)
    avg_precision = sum(result['precision'] for result in results) / len(results)
    avg_recall = sum(result['recall'] for result in results) / len(results)
    avg_f1 = sum(result['f1_score'] for result in results) / len(results)

    # Stampare i risultati medi per ogni motore
    print(f"Average results for motor {motor_number}:")
    print(f"Average Accuracy: {avg_accuracy:.2f}")
    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    print("")


Motor  1
Average results for motor 1:
Average Accuracy: 0.52
Average Precision: 0.26
Average Recall: 0.40
Average F1 Score: 0.11

Motor  2
Average results for motor 2:
Average Accuracy: 0.37
Average Precision: 0.35
Average Recall: 0.37
Average F1 Score: 0.11

Motor  3
Average results for motor 3:
Average Accuracy: 0.94
Average Precision: 0.00
Average Recall: 0.00
Average F1 Score: 0.00

Motor  4
Average results for motor 4:
Average Accuracy: 0.12
Average Precision: 0.42
Average Recall: 0.71
Average F1 Score: 0.22

Motor  5
Average results for motor 5:
Average Accuracy: 0.87
Average Precision: 0.00
Average Recall: 0.00
Average F1 Score: 0.00

Motor  6
Average results for motor 6:
Average Accuracy: 0.53
Average Precision: 0.40
Average Recall: 0.43
Average F1 Score: 0.29



In [12]:
# Random Forest on failure datasets, with all the features and including the operation as feature

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assicurati che 'operation' sia una colonna categoriale
df_tr['operation'] = df_tr['operation'].astype('category')

all_conditions = conditions['Test id'].tolist()

models = []

for motor_number in range(1, 7):
    print("Motor ", motor_number)

    # Trova le condizioni di test valide per il motore corrente
    valid_conditions = find_test_conditions_for_motor_x(df_tr, motor_number)
    
    # Filtra il DataFrame originale usando le condizioni di test valide
    df_tr_x = df_tr[df_tr['test_condition'].isin(valid_conditions)]

    # Initialize rf model
    rf_model = RandomForestClassifier()
    results = []

    for condition in valid_conditions:
        # Dividi il dataset in test e training
        df_train = df_tr_x[df_tr_x['test_condition'] != condition]
        df_test = df_tr_x[df_tr_x['test_condition'] == condition]

        # Converti 'operation' in variabili dummy
        feature_columns = [f'data_motor_{i}_position' for i in range(1, 7)] + \
                          [f'data_motor_{i}_temperature' for i in range(1, 7)] + \
                          [f'data_motor_{i}_voltage' for i in range(1, 7)] + \
                          ['operation']

        # Converti 'operation' in variabili dummy e concatena con le altre feature
        X_train = pd.get_dummies(df_train[feature_columns])
        X_test = pd.get_dummies(df_test[feature_columns])

        # Assicurati che le colonne di X_train e X_test siano le stesse
        X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

        y_train = df_train[f'data_motor_{motor_number}_label']
        y_test = df_test[f'data_motor_{motor_number}_label']

        # Addestra il modello sui dati di training
        rf_model.fit(X_train, y_train)

        # Fai previsioni sui dati di test
        y_pred = rf_model.predict(X_test)

        # Calcola le metriche
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Memorizza i risultati
        results.append({
            'condition': condition,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })


    # Calcolare le medie delle metriche
    avg_accuracy = sum(result['accuracy'] for result in results) / len(results)
    avg_precision = sum(result['precision'] for result in results) / len(results)
    avg_recall = sum(result['recall'] for result in results) / len(results)
    avg_f1 = sum(result['f1_score'] for result in results) / len(results)

    # Stampare i risultati medi per ogni motore
    print(f"Average results for motor {motor_number}:")
    print(f"Average Accuracy: {avg_accuracy:.2f}")
    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    print("")


Motor  1
Average results for motor 1:
Average Accuracy: 0.55
Average Precision: 0.05
Average Recall: 0.40
Average F1 Score: 0.09

Motor  2
Average results for motor 2:
Average Accuracy: 0.05
Average Precision: 0.05
Average Recall: 0.67
Average F1 Score: 0.09

Motor  3
Average results for motor 3:
Average Accuracy: 0.94
Average Precision: 0.00
Average Recall: 0.00
Average F1 Score: 0.00

Motor  4
Average results for motor 4:
Average Accuracy: 0.38
Average Precision: 0.39
Average Recall: 0.37
Average F1 Score: 0.15

Motor  5
Average results for motor 5:
Average Accuracy: 0.85
Average Precision: 0.00
Average Recall: 0.00
Average F1 Score: 0.00

Motor  6
Average results for motor 6:
Average Accuracy: 0.70
Average Precision: 0.49
Average Recall: 0.40
Average F1 Score: 0.34



In [5]:
model = models[5]

x_t_6 = df_test[['data_motor_6_position', 'data_motor_6_temperature', 'data_motor_6_voltage']]
y_t_6 = df_test[['data_motor_6_label']]
y_t_6 = y_t_6['data_motor_6_label']

x_tr = df_tr[['data_motor_6_position', 'data_motor_6_temperature', 'data_motor_6_voltage']]
y_tr = df_tr[['data_motor_6_label']]
y_tr = y_tr['data_motor_6_label']

y_p_6 = model.predict(x_t_6)

print('accuracy: ', accuracy_score(y_t_6, y_p_6))
print('precision: ', precision_score(y_t_6, y_p_6))
print('recall: ', recall_score(y_t_6, y_p_6))
print('f1: ', f1_score(y_t_6, y_p_6))

AttributeError: predict is not available when novelty=False, use fit_predict if you want to predict on training data. Use novelty=True if you want to use LOF for novelty detection and predict on new unseen data.