# Libs

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objects as go
from utils.futurai_ppd import drop_transitorio_desligado
import matplotlib.pyplot as plt
from sktime.datatypes._panel._convert import from_2d_array_to_nested
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

import warnings
import os
warnings.filterwarnings('ignore')

# Import dataset

In [None]:
# Load the labelled history for one asset and do the basic clean-up:
# parse the timestamp column, drop unused tags, drop incomplete rows.
base_name = 'Depurador 762-28-006'
timestamp = "Timestamp"

# Tags that are all-NaN, constant or irrelevant to the analysis.
# errors="ignore" below keeps the cell working if a tag is absent from the CSV.
unused_columns = ["762H0336.PV", "762H0342.PV", "762N0015.SP", "762P0013.SP", "762-34-073.CR"]

df_dataset = pd.read_csv(
    f"data/{base_name}.csv",
    sep=";",
    decimal=".",
    encoding="utf-8-sig",
)
df_dataset[timestamp] = pd.to_datetime(df_dataset[timestamp], format="%Y-%m-%d %H:%M:%S")

# Remove the unused tags first, then every row that still contains a NaN.
df_dataset = df_dataset.drop(columns=unused_columns, errors="ignore").dropna()
print(f"Dataset shape: {df_dataset.shape}")

# Column list captured here (before any label column is added); it is reused
# later to select the same columns from other files.
list_variables = list(df_dataset.columns)
df_dataset.head()

## Remove off periods

In [None]:
# Parameters handed to the project helper `drop_transitorio_desligado`.
# NOTE(review): the helper is defined outside this file; presumably it removes
# rows where the reference tag is below the limit (machine off) plus optional
# transient margins — confirm against utils.futurai_ppd.
pp_var_ref_desligado = "762-28-006.CR"
pp_valor_ref_desligado = 5
pp_tempo_ref_desligado = 0
pp_pre_corte_transitorio = 0
pp_pos_corte_transitorio = 0

# One dict per pre-processing step; only a single step is configured here.
pre_process = [
    {
        "after_cut": pp_pos_corte_transitorio,
        "interval_off": pp_tempo_ref_desligado,
        "limit_off": pp_valor_ref_desligado,
        "pre_cut": pp_pre_corte_transitorio,
        "variable_off": pp_var_ref_desligado,
    },
]

for step in pre_process:
    # The helper returns three values; only the filtered frame is kept.
    df_dataset, _, _ = drop_transitorio_desligado(
        df_dataset,
        step["variable_off"],
        step["limit_off"],
        step["interval_off"],
        timestamp,
        pre_corte=step["pre_cut"],
        pos_corte=step["after_cut"],
    )
print(f"Dataset shape: {df_dataset.shape}")
df_dataset.head()

## Create label for anomaly

In [None]:
# Known failure intervals as (start, end) pairs. Both bounds are inclusive
# when the rows are labelled below.
periodos_de_Falhas = [
    (pd.Timestamp('2024-05-03 11:00:00'), pd.Timestamp('2024-05-03 11:35:00')),
    (pd.Timestamp('2024-06-25 17:20:00'), pd.Timestamp('2024-08-02 14:00:00')),
    (pd.Timestamp('2024-10-19 10:40:00'), pd.Timestamp('2024-10-19 10:50:00')),
    (pd.Timestamp('2024-10-21 12:00:00'), pd.Timestamp('2024-10-22 00:35:00')),
    (pd.Timestamp('2024-10-24 03:10:00'), pd.Timestamp('2024-10-27 00:00:00')),
    (pd.Timestamp('2024-11-14 06:40:00'), pd.Timestamp('2024-11-14 19:45:00')),
    (pd.Timestamp('2024-11-25 21:45:00'), pd.Timestamp('2024-11-25 22:03:00')),
    (pd.Timestamp('2024-11-27 15:00:00'), pd.Timestamp('2024-11-27 15:07:00')),
    # NOTE(review): the end of this interval used to be dated 2024-11-26, i.e.
    # before its own start, so it never matched any row. Assumed to be a typo
    # for the same day — confirm against the failure log.
    (pd.Timestamp('2024-11-27 16:04:00'), pd.Timestamp('2024-11-27 16:11:00')),
    (pd.Timestamp('2024-11-30 13:30:00'), pd.Timestamp('2024-11-30 15:52:00')),
    (pd.Timestamp('2024-12-09 20:30:00'), pd.Timestamp('2024-12-11 07:00:00')),
    (pd.Timestamp('2025-03-05 15:17:00'), pd.Timestamp('2025-03-05 15:24:00')),
    (pd.Timestamp('2025-03-15 18:30:00'), pd.Timestamp('2025-03-15 19:15:00')),
    (pd.Timestamp('2025-03-18 11:40:00'), pd.Timestamp('2025-03-18 20:00:00')),
    (pd.Timestamp('2025-06-16 15:20:00'), pd.Timestamp('2025-06-17 07:38:00')),
]

# An interval whose end precedes its start silently labels nothing, so fail
# loudly before touching the data.
for inicio, fim in periodos_de_Falhas:
    if fim < inicio:
        raise ValueError(f"Failure period ends before it starts: {inicio} -> {fim}")

# Binary label: 1 inside any failure interval, 0 elsewhere.
df_dataset['Falhas'] = 0

for inicio, fim in periodos_de_Falhas:
    df_dataset.loc[(df_dataset[timestamp] >= inicio) & (df_dataset[timestamp] <= fim), 'Falhas'] = 1

df_dataset

## Import TAGs and descriptions

In [None]:
# Companion CSV for the same asset (file name suffix "_subsistema"), read with
# the same separator/decimal/encoding settings as the main dataset.
subsistema_path = f"data/{base_name}_subsistema.csv"
df_subsistema = pd.read_csv(subsistema_path, sep=";", decimal=".", encoding="utf-8-sig")
df_subsistema

## Plot variables

In [None]:
# Overlay one process tag (black) and the failure label (red) on a shared
# time axis so labelled intervals can be checked visually.
fig = go.Figure()

for column, color in (("762P0034.PV", "black"), ("Falhas", "red")):
    fig.add_trace(
        go.Scatter(
            x=df_dataset['Timestamp'],
            y=df_dataset[column],
            mode='lines',
            name=column,
            line=dict(color=color),
        )
    )

fig.update_layout(template='plotly_white', hovermode='x unified')
fig.show()

## Feature extraction

In [None]:
# Build one feature row per fixed time window from the raw signals.
df = df_dataset.set_index(timestamp)

# Separate the signals from the label column.
variables = df.drop(columns=["Falhas"])
labels = df["Falhas"]

window = "60min"

# Per-window statistics computed for every signal. Each statistic appears
# once: a repeated entry (the list used to contain "median" twice) produces
# duplicate "<tag>_median" columns, which add no information and give the
# feature frame non-unique column names.
# Any data later passed through a scaler/model fitted on these features must
# be built with the same statistics.
agg_funcs = ["mean", "std", "min", "max", "median", "skew", "var"]
X_features = variables.resample(window).agg(agg_funcs)
# Flatten the (tag, statistic) column MultiIndex into "<tag>_<statistic>".
X_features.columns = ['_'.join(col).strip() for col in X_features.columns.values]

def most_frequent(x):
    """Return the most frequent value in ``x``, or NaN if there is none.

    Used as the per-window aggregator for the label series.

    Parameters
    ----------
    x : pandas.Series
        Values falling inside one window; may be empty.

    Returns
    -------
    scalar
        The first entry of ``x.mode()`` (on ties this is the first mode pandas
        reports), or ``np.nan`` when the mode is empty, e.g. for an empty
        window.
    """
    # Compute the mode once instead of once for the test and once for the value.
    modes = x.mode()
    return np.nan if modes.empty else modes.iloc[0]

# Keep only windows whose features are all defined, label each remaining
# window with its most frequent raw label, then drop windows without a label
# so X and y end up sharing exactly the same index.
X_features = X_features.dropna()
y_features = (
    labels
    .resample(window)
    .agg(most_frequent)
    .loc[X_features.index]
    .dropna()
)
X_features = X_features.loc[y_features.index]

### Train Test Split

In [None]:
# Random 70/30 split of the window features.
# NOTE(review): the split is shuffled but not stratified; with rare failure
# windows the class ratio of the two parts can differ — check the printout.
X_features_train, X_features_test, y_features_train, y_features_test = train_test_split(
    X_features,
    y_features,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

## Scaling: statistics are learned on the training part only and then
## applied unchanged to the test part.
scaler = StandardScaler()
X_features_train = pd.DataFrame(
    scaler.fit_transform(X_features_train),
    columns=X_features_train.columns,
    index=X_features_train.index,
)
X_features_test = pd.DataFrame(
    scaler.transform(X_features_test),
    columns=X_features_test.columns,
    index=X_features_test.index,
)


## Verify class distribution in train and test sets
for title, target in (
    ("Class distribution in training set (%):", y_features_train),
    ("\nClass distribution in test set (%):", y_features_test),
):
    print(title)
    print(target.value_counts(normalize=True) * 100)

### SMOTE training set

In [None]:
# Oversample the training part only with SMOTE; the test part is left as is.
sm = SMOTE(random_state=42)
X_features_train_balanced, y_features_train_balanced = sm.fit_resample(
    X_features_train,
    y_features_train,
)

## Verify class distribution in the resampled training set
balanced_share = y_features_train_balanced.value_counts(normalize=True) * 100
print("Class distribution in training set (%):")
print(balanced_share)

# Random Forest

In [None]:
# Random forest on the SMOTE-balanced window features.
# `monotonic_cst=None` is no longer passed explicitly: it is the default, and
# the keyword only exists from scikit-learn 1.4 on, so naming it made the
# cell fail on older installations without changing the model on newer ones.
rf_clf = RandomForestClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

## Cross validation
# The folds must be shuffled: SMOTE returns the original rows followed by the
# synthetic minority rows, so contiguous (unshuffled) folds would put almost
# only synthetic minority samples in the last folds and skew every score.
# NOTE(review): the folds are still drawn from already-oversampled data, so
# these scores are optimistic; the held-out test metrics below are the
# reference numbers.
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf_clf, X_features_train_balanced, y_features_train_balanced, cv=k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

## Fit model
rf_clf.fit(X_features_train_balanced, y_features_train_balanced)

## Predict and evaluate on the untouched (not oversampled) test part
y_pred = rf_clf.predict(X_features_test)

accuracy = accuracy_score(y_features_test, y_pred)
f1 = f1_score(y_features_test, y_pred, average='binary')
recall = recall_score(y_features_test, y_pred, average='binary')
precision = precision_score(y_features_test, y_pred, average='binary')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

# XGBoost

In [None]:
# Gradient-boosted trees on the SMOTE-balanced window features.
xgb_clf = XGBClassifier(
    n_estimators=400,      # number of trees
    learning_rate=0.1,     # learning rate (smaller is more stable but needs more trees)
    max_depth=6,           # maximum tree depth
    subsample=0.8,         # fraction of samples used for each tree
    colsample_bytree=0.8,  # fraction of features used for each tree
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"  # or "mlogloss" for multiclass
)

# Cross validation and the final fit both use plain arrays, so they see
# identical input and XGBoost's validation of DataFrame column names (e.g.
# non-unique names) cannot make one succeed while the other fails.
X_balanced_array = X_features_train_balanced.to_numpy()
y_balanced_array = y_features_train_balanced.to_numpy()

## Cross validation
# The folds must be shuffled: SMOTE returns the original rows followed by the
# synthetic minority rows, so contiguous (unshuffled) folds would put almost
# only synthetic minority samples in the last folds and skew every score.
# NOTE(review): the folds are still drawn from already-oversampled data, so
# these scores are optimistic; the held-out test metrics below are the
# reference numbers.
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(xgb_clf, X_balanced_array, y_balanced_array, cv=k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

## Fit model
xgb_clf.fit(X_balanced_array, y_balanced_array)

## Predict and evaluate on the untouched (not oversampled) test part
y_pred = xgb_clf.predict(X_features_test.to_numpy())

accuracy = accuracy_score(y_features_test, y_pred)
f1 = f1_score(y_features_test, y_pred, average='binary')
recall = recall_score(y_features_test, y_pred, average='binary')
precision = precision_score(y_features_test, y_pred, average='binary')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

# New Data for Classification

## Import Data

In [None]:
# Load the new (to-be-classified) data for the same asset from the "_teste"
# file. Note that this rebinds `df_dataset`.
base_name = 'Depurador 762-28-006'
timestamp = "Timestamp"

df_dataset = pd.read_csv(
    f"data/{base_name}_teste.csv",
    sep=";",
    decimal=".",
    encoding="utf-8-sig",
)
df_dataset[timestamp] = pd.to_datetime(df_dataset[timestamp], format="%Y-%m-%d %H:%M:%S")

# Clean-up chain, in order: keep the first row per timestamp, sort
# chronologically, drop tags that are all-NaN/constant/irrelevant (ignored if
# absent), then drop every row that still contains a NaN.
df_dataset = (
    df_dataset
    .drop_duplicates(subset=[timestamp], keep='first')
    .sort_values(by=timestamp)
    .drop(
        columns=["762H0336.PV", "762H0342.PV", "762N0015.SP", "762P0013.SP", "762-34-073.CR"],
        errors='ignore',
    )
    .dropna()
)
print(f"Dataset shape: {df_dataset.shape}")

# Keep exactly the columns, in the same order, recorded in `list_variables`.
df_dataset = df_dataset[list_variables]

# Same off-period filter configuration as used for the training data, applied
# to the new data through the project helper `drop_transitorio_desligado`.
# NOTE(review): the helper is defined outside this file; presumably it removes
# rows where the reference tag is below the limit — confirm against
# utils.futurai_ppd.
pp_var_ref_desligado = "762-28-006.CR"
pp_valor_ref_desligado = 5
pp_tempo_ref_desligado = 0
pp_pre_corte_transitorio = 0
pp_pos_corte_transitorio = 0

# One dict per pre-processing step; only a single step is configured here.
pre_process = [
    {
        "after_cut": pp_pos_corte_transitorio,
        "interval_off": pp_tempo_ref_desligado,
        "limit_off": pp_valor_ref_desligado,
        "pre_cut": pp_pre_corte_transitorio,
        "variable_off": pp_var_ref_desligado,
    },
]

for step in pre_process:
    # The helper returns three values; only the filtered frame is kept.
    df_dataset, _, _ = drop_transitorio_desligado(
        df_dataset,
        step["variable_off"],
        step["limit_off"],
        step["interval_off"],
        timestamp,
        pre_corte=step["pre_cut"],
        pos_corte=step["after_cut"],
    )
print(f"Dataset shape after remove offs: {df_dataset.shape}")
df_dataset.head()

In [None]:
# Quick visual check of one process tag in the new data.
signal_trace = go.Scatter(
    x=df_dataset['Timestamp'],
    y=df_dataset["762P0034.PV"],
    mode='lines',
    name='762P0034.PV',
    line=dict(color='black'),
)

fig = go.Figure()
fig.add_trace(signal_trace)
fig.update_layout(template='plotly_white', hovermode='x unified')
fig.show()

In [None]:
# Build window features for the new data and scale them with the scaler that
# was fitted on the training features.
df_predict = df_dataset.set_index(timestamp)

# The new data was restricted to `list_variables`, which is captured before
# the "Falhas" label column is created, so the label is normally absent here.
# Dropping/reading it unconditionally raised KeyError; labels are now optional.
has_labels = "Falhas" in df_predict.columns
variables = df_predict.drop(columns=["Falhas"], errors="ignore")
labels = df_predict["Falhas"] if has_labels else None

window = "60min"

# Per-window statistics; each one listed once (a repeated "median" only
# produces duplicate, non-unique columns).
agg_funcs = ["mean", "std", "min", "max", "median", "skew", "var"]
X_features = variables.resample(window).agg(agg_funcs)
# Flatten the (tag, statistic) column MultiIndex into "<tag>_<statistic>".
X_features.columns = ['_'.join(col).strip() for col in X_features.columns.values]


def most_frequent(x):
    """Return the most frequent value in ``x``, or NaN when the mode is empty."""
    modes = x.mode()
    return np.nan if modes.empty else modes.iloc[0]


## Remove windows with NaN in X (and in y, when labels are available)
X_features = X_features.dropna()
if has_labels:
    y_features = labels.resample(window).agg(most_frequent)
    y_features = y_features.loc[X_features.index].dropna()
    X_features = X_features.loc[y_features.index]
else:
    # No ground truth for the new data; avoid leaving stale training labels
    # paired with these features.
    y_features = None

# Present exactly the columns (names and order) the scaler was fitted on, so
# the transform does not depend on how the feature list is spelled here.
fitted_columns = getattr(scaler, "feature_names_in_", None)
if fitted_columns is not None:
    X_features = X_features.loc[:, list(fitted_columns)]

X_features = pd.DataFrame(scaler.transform(X_features), columns=X_features.columns, index=X_features.index)
X_features

# sktime

## Train test split timeseries

In [None]:
# Raw (un-windowed) samples and labels for the sktime classifiers.
# At this point `df_dataset` may hold the unlabelled new data (it is rebound
# when the "_teste" file is loaded), in which case selecting "Falhas" raised
# KeyError. Fall back to `df`, the labelled frame indexed by timestamp that
# was built for feature extraction.
# NOTE(review): confirm that the labelled history is the intended input here.
labelled = df_dataset if "Falhas" in df_dataset.columns else df

# errors="ignore": in `df` the timestamp is the index, not a column.
X = labelled.drop(columns=["Falhas", timestamp], errors="ignore")
y = labelled["Falhas"]

X_raw_train, X_raw_test, y_raw_train, y_raw_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

## Verify class distribution in train and test sets
print("Class distribution in training set (%):")
print(y_raw_train.value_counts(normalize=True) * 100)
print("\nClass distribution in test set (%):")
print(y_raw_test.value_counts(normalize=True) * 100)

## Convert to sktime format

In [None]:
# Convert both feature matrices to sktime's nested format.
# NOTE(review): the 2-D array is passed row by row, so presumably each sample
# row becomes one series whose points are the columns (tags), not consecutive
# time steps — confirm this is the intended representation.
X_train_sktime, X_test_sktime = (
    from_2d_array_to_nested(part.values) for part in (X_raw_train, X_raw_test)
)

# Labels are used unchanged by the sktime classifiers.
y_train_sktime, y_test_sktime = y_raw_train, y_raw_test

## WEASEL + MUSE

In [None]:
from sktime.classification.dictionary_based import MUSE

# Train the MUSE classifier on the nested training data.
muse_clf = MUSE(
    anova=True,
    variance=False,
    bigrams=True,
    window_inc=2,
    alphabet_size=4,
    use_first_order_differences=True,
    feature_selection='chi2',
    p_threshold=0.05,
    support_probabilities=False,
    n_jobs=1,
    random_state=42,
)
muse_clf.fit(X_train_sktime, y_train_sktime)

# Predict on the held-out part and score the positive class.
y_pred_muse = muse_clf.predict(X_test_sktime)

accuracy = accuracy_score(y_test_sktime, y_pred_muse)
f1 = f1_score(y_test_sktime, y_pred_muse, average='binary')
recall = recall_score(y_test_sktime, y_pred_muse, average='binary')
precision = precision_score(y_test_sktime, y_pred_muse, average='binary')

for label, value in (
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("Recall", recall),
    ("Precision", precision),
):
    print(f"{label}: {value:.4f}")

## ROCKET

In [None]:
from sktime.classification.kernel_based import RocketClassifier

# Train the ROCKET classifier on the nested training data.
rocket_clf = RocketClassifier(
    num_kernels=10000,
    rocket_transform='rocket',
    max_dilations_per_kernel=32,
    n_features_per_kernel=4,
    use_multivariate='yes',
    n_jobs=1,
    random_state=42,
)
rocket_clf.fit(X_train_sktime, y_train_sktime)

# Predict on the held-out part and score the positive class.
y_pred_rocket = rocket_clf.predict(X_test_sktime)

accuracy = accuracy_score(y_test_sktime, y_pred_rocket)
f1 = f1_score(y_test_sktime, y_pred_rocket, average='binary')
recall = recall_score(y_test_sktime, y_pred_rocket, average='binary')
precision = precision_score(y_test_sktime, y_pred_rocket, average='binary')

for label, value in (
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("Recall", recall),
    ("Precision", precision),
):
    print(f"{label}: {value:.4f}")

## KNeighborsTimeSeriesClassifier

In [None]:
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

# Fit a 3-nearest-neighbour classifier using DTW distance.
KNeighbors_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, weights='uniform', algorithm='brute_incr', distance='dtw', distance_params=None, distance_mtype=None, pass_train_distances=False, leaf_size=30, n_jobs=None)
KNeighbors_clf.fit(X_train_sktime, y_train_sktime)
# Predict on the held-out part
y_pred_kneighbors = KNeighbors_clf.predict(X_test_sktime)

# Score the positive (failure) class with average='binary', as every other
# classifier in this notebook does. The previous 'weighted' averaging mixes in
# the majority class, so its numbers were not comparable with the other models.
accuracy = accuracy_score(y_test_sktime, y_pred_kneighbors)
f1 = f1_score(y_test_sktime, y_pred_kneighbors, average='binary')
recall = recall_score(y_test_sktime, y_pred_kneighbors, average='binary')
precision = precision_score(y_test_sktime, y_pred_kneighbors, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

## TimeMIL