# Libs

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, LSTM, Dense, Dropout, GlobalMaxPooling1D, LayerNormalization, MultiHeadAttention, Add, Flatten, Concatenate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
from utils.futurai_ppd import drop_transitorio_desligado
import matplotlib.pyplot as plt
from sktime.datatypes._panel._convert import from_2d_array_to_nested

import warnings
import os
warnings.filterwarnings('ignore')

# Import dataset

In [None]:
# Asset name (also the CSV file stem) and the name of the time column.
base_name = 'Depurador 762-28-006'
timestamp = "Timestamp"

# Read the semicolon-separated export and parse the time column with an
# explicit format.
dataset_path = 'data/' + base_name + '.csv'
df_dataset = pd.read_csv(
    dataset_path,
    sep=";",
    decimal=".",
    encoding="utf-8-sig",
)
df_dataset[timestamp] = pd.to_datetime(
    df_dataset[timestamp],
    format="%Y-%m-%d %H:%M:%S",
)

# Remove the excluded tags first, then every row that still holds a missing value.
excluded_tags = ["762H0336.PV", "762H0342.PV", "762N0015.SP", "762P0013.SP"]
df_dataset = df_dataset.drop(columns=excluded_tags).dropna()
print(f"Dataset shape: {df_dataset.shape}")
df_dataset.head()

## Remove off periods

In [None]:
# Settings forwarded to drop_transitorio_desligado.
# NOTE(review): presumably the reference tag, the threshold below which the
# asset counts as off, the minimum off interval, and the margins trimmed
# before/after each transient — confirm against utils.futurai_ppd.
pp_var_ref_desligado = "762-28-006.CR"
pp_valor_ref_desligado = 5
pp_tempo_ref_desligado = 0
pp_pre_corte_transitorio = 0
pp_pos_corte_transitorio = 0

# One dictionary per filtering step; only a single step is configured here.
pre_process = [
    {
        "after_cut": pp_pos_corte_transitorio,
        "interval_off": pp_tempo_ref_desligado,
        "limit_off": pp_valor_ref_desligado,
        "pre_cut": pp_pre_corte_transitorio,
        "variable_off": pp_var_ref_desligado,
    },
]

# Apply every step in order; only the first element of the returned triple is kept.
for step in pre_process:
    df_dataset, _, _ = drop_transitorio_desligado(
        df_dataset,
        step["variable_off"],
        step["limit_off"],
        step["interval_off"],
        timestamp,
        pre_corte=step["pre_cut"],
        pos_corte=step["after_cut"],
    )
print(f"Dataset shape: {df_dataset.shape}")
df_dataset.head()

## Create label for anomaly

In [None]:
# Known failure windows as (start, end) pairs; both bounds are treated as
# inclusive when the label column is built.
periodos_de_Falhas = [
    (pd.Timestamp('2024-05-03 11:00:00'), pd.Timestamp('2024-05-03 11:35:00')),
    (pd.Timestamp('2024-06-25 17:20:00'), pd.Timestamp('2024-08-02 14:00:00')),
    # The 2024-10-19 window was listed twice; one copy is enough because
    # labelling the same interval again changes nothing.
    (pd.Timestamp('2024-10-19 10:40:00'), pd.Timestamp('2024-10-19 10:50:00')),
    (pd.Timestamp('2024-10-21 12:00:00'), pd.Timestamp('2024-10-22 00:35:00')),
    (pd.Timestamp('2024-10-24 03:10:00'), pd.Timestamp('2024-10-27 00:00:00')),
    (pd.Timestamp('2024-11-14 06:40:00'), pd.Timestamp('2024-11-14 19:45:00')),
    (pd.Timestamp('2024-11-25 21:45:00'), pd.Timestamp('2024-11-25 22:03:00')),
    (pd.Timestamp('2024-11-27 15:00:00'), pd.Timestamp('2024-11-27 15:07:00')),
    # NOTE(review): the end of this window was dated 2024-11-26, i.e. before
    # its start, so it could never match a row. Corrected to the same day as
    # the start — confirm against the failure log.
    (pd.Timestamp('2024-11-27 16:04:00'), pd.Timestamp('2024-11-27 16:11:00')),
    (pd.Timestamp('2024-11-30 13:30:00'), pd.Timestamp('2024-11-30 15:52:00')),
    (pd.Timestamp('2024-12-09 20:30:00'), pd.Timestamp('2024-12-11 07:00:00')),
    (pd.Timestamp('2025-03-05 15:17:00'), pd.Timestamp('2025-03-05 15:24:00')),
    (pd.Timestamp('2025-03-15 18:30:00'), pd.Timestamp('2025-03-15 19:15:00')),
    (pd.Timestamp('2025-03-18 11:40:00'), pd.Timestamp('2025-03-18 20:00:00')),
    (pd.Timestamp('2025-06-16 15:20:00'), pd.Timestamp('2025-06-17 07:38:00')),
]

# Initialise the label column with 0 for every row.
df_dataset['Falhas'] = 0

# Accumulate one boolean mask covering every failure period (bounds
# inclusive), then set the label to 1 for the matching rows in a single write.
sample_times = df_dataset[timestamp]
in_failure = np.zeros(len(df_dataset), dtype=bool)
for inicio, fim in periodos_de_Falhas:
    in_failure |= ((sample_times >= inicio) & (sample_times <= fim)).to_numpy()
df_dataset.loc[in_failure, 'Falhas'] = 1

df_dataset

## Import TAGs and descriptions

In [None]:
# Load the companion "<base_name>_subsistema.csv" table, read with the same
# CSV dialect as the main dataset.
subsistema_path = 'data/' + base_name + '_subsistema.csv'
df_subsistema = pd.read_csv(
    subsistema_path,
    sep=";",
    decimal=".",
    encoding="utf-8-sig",
)
df_subsistema

## Plot variables

In [None]:
# Overlay one process variable (black) and the failure label (red) on a
# shared time axis.
fig = go.Figure()
for column, colour in (("762P0034.PV", 'black'), ("Falhas", 'red')):
    trace = go.Scatter(
        x=df_dataset['Timestamp'],
        y=df_dataset[column],
        mode='lines',
        name=column,
        line=dict(color=colour),
    )
    fig.add_trace(trace)

fig.update_layout(template='plotly_white', hovermode='x unified')
fig.show()

# Random Forest Approach v1

## Data Train

In [None]:
# Training windows as (start, end) strings; both bounds are inclusive.
# The order matters: rows are concatenated in exactly this sequence.
train_windows = [
    ("2024-05-03 09:00:00", "2024-05-03 14:00:00"),
    ("2024-07-27 06:00:00", "2024-07-27 13:00:00"),
    ("2025-06-16 12:00:00", "2025-06-16 21:00:00"),
    ("2024-09-09 08:00:00", "2024-09-09 14:00:00"),
    ("2024-06-30 12:00:00", "2024-07-01 12:00:00"),
    ("2024-03-04 00:00:00", "2024-03-05 00:00:00"),
]

train_slices = []
df_train = None
for window_start, window_end in train_windows:
    start_date = pd.to_datetime(window_start)
    end_date = pd.to_datetime(window_end)
    mask = (df_dataset[timestamp] >= start_date) & (df_dataset[timestamp] <= end_date)
    window_rows = df_dataset.loc[mask]
    train_slices.append(window_rows)
    # The first window seeds the training frame; later windows are appended.
    df_train = window_rows if df_train is None else pd.concat([df_train, window_rows])
    # Report the cumulative shape after each window is added.
    print(df_train.shape)

# Keep the per-window frames available under their original names.
df_train2, df_train3, df_train4, df_train5, df_train6 = train_slices[1:]

In [None]:
# Class balance of the training set: rows labelled as failure (1) versus normal (0).
train_labels = df_train["Falhas"]
print("Falhas Train:", int((train_labels == 1).sum()))
print("Normal Train:", int((train_labels == 0).sum()))

## Data Test

In [None]:
# Hold-out window used for evaluation; both bounds are inclusive.
start_date = pd.to_datetime("2024-10-20 00:00:00")
end_date = pd.to_datetime("2024-10-27 00:00:00")

sample_times = df_dataset[timestamp]
mask = (sample_times >= start_date) & (sample_times <= end_date)
df_test = df_dataset.loc[mask]
df_test.shape

In [None]:
# Class balance of the test set: rows labelled as failure (1) versus normal (0).
test_labels = df_test["Falhas"]
print("Falhas Test:", int((test_labels == 1).sum()))
print("Normal Test:", int((test_labels == 0).sum()))

## Base Scaling

In [None]:
## Train Scaling
# Features are every column except the time column and the label.
X_train = df_train.drop([timestamp, "Falhas"], axis=1)
y_train = df_train["Falhas"]

# Standardisation statistics are taken from the training rows only and then
# reused for the test rows, so no test information enters the scaler.
train_mean = X_train.mean()
# A column that is constant over the training windows has std == 0; dividing
# by it would give NaN in train and inf/NaN in test. Using 1 instead leaves
# such a column centred at 0 and unscaled.
train_std = X_train.std().replace(0, 1)

X_train = (X_train - train_mean) / train_std


## Test Scaling
X_test = df_test.drop([timestamp, "Falhas"], axis=1)
y_test = df_test["Falhas"]
X_test = (X_test - train_mean) / train_std

## Fit Model

In [None]:
# Random forest configuration, spelled out in full so the run is reproducible
# (random_state=1).
rf_params = dict(
    n_estimators=400,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=1,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
    monotonic_cst=None,
)
rf_clf = RandomForestClassifier(**rf_params)

# 5-fold cross-validation on the training set, used only to report scores.
k_folds = KFold(n_splits=5)
scores = cross_val_score(rf_clf, X_train, y_train, cv=k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

# Fit the final model on the complete training set.
rf_clf.fit(X_train, y_train)

## Predict

In [None]:
# Predict on the hold-out window and score against the true labels.
y_pred = rf_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# F1, recall and precision all use the 'weighted' average across classes.
f1, recall, precision = (
    metric(y_test, y_pred, average='weighted')
    for metric in (f1_score, recall_score, precision_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

In [None]:
# Draw predicted labels first, then the true labels, on the same axes
# (x axis is the row position within the test set).
plt.plot(y_pred)
plt.plot(y_test.to_numpy())

# sktime

In [None]:
# Convert the scaled feature matrices to sktime's nested-DataFrame format.
# NOTE(review): from_2d_array_to_nested presumably turns each row into one
# univariate series, so the feature columns would act as the time axis —
# confirm this is the intended representation.
X_train_sktime, X_test_sktime = (
    from_2d_array_to_nested(features.values) for features in (X_train, X_test)
)

# The labels are reused unchanged.
y_train_sktime, y_test_sktime = y_train, y_test

## WEASEL + MUSE

In [None]:
from sktime.classification.dictionary_based import MUSE

# MUSE classifier with a fixed seed; settings are listed one per line.
muse_params = dict(
    anova=True,
    variance=False,
    bigrams=True,
    window_inc=2,
    alphabet_size=4,
    use_first_order_differences=True,
    feature_selection='chi2',
    p_threshold=0.05,
    support_probabilities=False,
    n_jobs=1,
    random_state=1,
)
muse_clf = MUSE(**muse_params)
muse_clf.fit(X_train_sktime, y_train_sktime)

In [None]:
# Score the MUSE predictions on the hold-out window.
y_pred_muse = muse_clf.predict(X_test_sktime)

accuracy = accuracy_score(y_test_sktime, y_pred_muse)
# Class-weighted averages for the remaining metrics.
weighted_avg = {"average": 'weighted'}
f1 = f1_score(y_test_sktime, y_pred_muse, **weighted_avg)
recall = recall_score(y_test_sktime, y_pred_muse, **weighted_avg)
precision = precision_score(y_test_sktime, y_pred_muse, **weighted_avg)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

## ROCKET

In [None]:
from sktime.classification.kernel_based import RocketClassifier

# ROCKET classifier with 10,000 kernels and a fixed seed.
rocket_params = dict(
    num_kernels=10000,
    rocket_transform='rocket',
    max_dilations_per_kernel=32,
    n_features_per_kernel=4,
    use_multivariate='yes',
    n_jobs=1,
    random_state=1,
)
rocket_clf = RocketClassifier(**rocket_params)
rocket_clf.fit(X_train_sktime, y_train_sktime)

In [None]:
# Score the ROCKET predictions on the hold-out window.
y_pred_rocket = rocket_clf.predict(X_test_sktime)

accuracy = accuracy_score(y_test_sktime, y_pred_rocket)
# Class-weighted averages for the remaining metrics.
f1, recall, precision = (
    metric(y_test_sktime, y_pred_rocket, average='weighted')
    for metric in (f1_score, recall_score, precision_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

## TimeMIL