In [2]:
import autorootcwd  # noqa
import pandas as pd
import numpy as np

from src.utils.add_signal_data import (
    add_signal_data_cwru,
    add_signal_data_lva,
    add_signal_data_ottawa,
    add_signal_data_hust,
)

from hamilton import driver
from logging import INFO

import hydra
from hydra import compose, initialize
from omegaconf import OmegaConf
from hydra.core.hydra_config import HydraConfig

from src.data.feature_engineering import feature_extraction_pipeline
from src.utils.logger import setup_logger
from src.data.pydantic_models import BearingDataset
from src.data.preprocess import (
    preprocess_cwru,
    preprocess_lva,
    preprocess_ottawa,
    preprocess_hust,
)

In [3]:
# Root folder holding one sub-directory per bearing dataset. Kept in a single
# place so the notebook can be pointed at another data location with one edit.
BEARING_DATA_ROOT = "/data/bearing_datasets"
METADATA_PATH_TEMPLATE = BEARING_DATA_ROOT + "/{dataset}/processed/files_metadata.bz2"

# NOTE: `pd.read_pickle` unpickles arbitrary Python objects, so these files must
# come from a trusted source.
paderborn_metadata = pd.read_pickle(METADATA_PATH_TEMPLATE.format(dataset="paderborn"))
cwru_metadata = pd.read_pickle(METADATA_PATH_TEMPLATE.format(dataset="cwru"))
ottawa_metadata = pd.read_pickle(METADATA_PATH_TEMPLATE.format(dataset="ottawa"))
hust_metadata = pd.read_pickle(METADATA_PATH_TEMPLATE.format(dataset="hust"))

In [4]:
# Display the Paderborn metadata table for a quick visual check.
paderborn_metadata

Unnamed: 0,waveform_id,bearing_id,bearing_manufacturer,repetition,load_torque,radial_force,rotational_speed,fault_label,severity,type_of_failure,...,date,bpfo,bpfi,bsf,ftf,multiclass_label,inner,outer,ball,fs
0,N15_M07_F04_KA08_16,KA08,IBU,16,0.7,400,1500,outer,2,artificial,...,2014-12-29 12:14:25,3.0704,4.9296,2.036,0.3838,2,0,1,0,64000.0
1,N15_M07_F04_KA08_9,KA08,IBU,9,0.7,400,1500,outer,2,artificial,...,2014-12-29 12:10:56,3.0704,4.9296,2.036,0.3838,2,0,1,0,64000.0
2,N15_M07_F10_KA08_12,KA08,IBU,12,0.7,1000,1500,outer,2,artificial,...,2014-12-29 11:08:33,3.0704,4.9296,2.036,0.3838,2,0,1,0,64000.0
3,N15_M01_F10_KA08_4,KA08,IBU,4,0.1,1000,1500,outer,2,artificial,...,2014-12-29 11:47:00,3.0704,4.9296,2.036,0.3838,2,0,1,0,64000.0
4,N09_M07_F10_KA08_5,KA08,IBU,5,0.7,1000,900,outer,2,artificial,...,2014-12-29 11:28:31,3.0704,4.9296,2.036,0.3838,2,0,1,0,64000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,N15_M01_F10_KB23_19,KB23,MTK,19,0.1,1000,1500,combination,2,real,...,2015-03-25 13:14:38,3.0704,4.9296,2.036,0.3838,,1,1,0,64000.0
2556,N15_M01_F10_KB23_20,KB23,MTK,20,0.1,1000,1500,combination,2,real,...,2015-03-25 13:16:11,3.0704,4.9296,2.036,0.3838,,1,1,0,64000.0
2557,N15_M07_F10_KB23_17,KB23,MTK,17,0.7,1000,1500,combination,2,real,...,2015-03-25 12:39:15,3.0704,4.9296,2.036,0.3838,,1,1,0,64000.0
2558,N15_M07_F04_KB23_18,KB23,MTK,18,0.7,400,1500,combination,2,real,...,2015-03-25 13:46:12,3.0704,4.9296,2.036,0.3838,,1,1,0,64000.0


In [42]:
# Location of the Hydra configuration used by this notebook.
# `config_path` is the folder containing the .yaml files.
# NOTE(review): Hydra resolves a relative `config_path` against the caller's
# location, so "../configs" presumably depends on where the notebook lives — confirm.
config_path = "../configs"
config_name = "feature_engineering"  # Name of the YAML file, without its extension


# Survives re-execution of this cell: reuse the logger created by an earlier run, if any.
_FEATURE_ENG_LOGGER = globals().get("_FEATURE_ENG_LOGGER")


def _get_feature_engineering_logger():
    """Return the shared "Feature Engineering" logger, creating it on first use.

    ``setup_logger`` is deliberately called only once instead of on every
    ``get_configs`` call: the recorded cell outputs show each log record printed
    many times with the same timestamp, which suggests that every call attaches
    another handler (NOTE(review): confirm against ``setup_logger``).
    """
    global _FEATURE_ENG_LOGGER
    if _FEATURE_ENG_LOGGER is None:
        _FEATURE_ENG_LOGGER = setup_logger(
            name="Feature Engineering",
            log_file="logs/feature_engineering.log",
            format_str="%(asctime)s | %(levelname)s | %(message)s",
            level=INFO,
        )
    return _FEATURE_ENG_LOGGER


def get_configs(dataset="paderborn", segment=None, resample=False, overlap_pct=None):
    """Compose the feature-engineering Hydra config for one dataset as a plain dict.

    The config named by the module-level ``config_name`` is composed from
    ``config_path`` with the override
    ``+experiment/feature_eng=extract_features_{dataset}``, resolved into a
    container, and then extended with runtime objects and the call arguments.

    Args:
        dataset: Suffix of the experiment override to load.
        segment: Stored unchanged under the ``"segment"`` key.
        resample: Stored unchanged under the ``"resample"`` key.
        overlap_pct: Stored under the ``"overlap_pct"`` key; ``None`` becomes ``0.0``.

    Returns:
        dict: The resolved config, with ``"datasets"`` and ``"feature_pipeline"``
        replaced by their instantiated objects, plus the ``"logger"``,
        ``"segment"``, ``"resample"`` and ``"overlap_pct"`` entries.
    """
    overrides = [f"+experiment/feature_eng=extract_features_{dataset}"]

    with initialize(config_path=config_path, version_base="1.3"):
        cfg_hydra = compose(
            config_name=config_name,
            overrides=overrides,
            return_hydra_config=True,
        )
        # Emulate @hydra.main: register the composed config (including the
        # `hydra` node) on the HydraConfig singleton before the second compose.
        HydraConfig.instance().set_config(cfg_hydra)
        cfg = compose(config_name=config_name, overrides=overrides)
        # Resolve inside the `with` block, while Hydra is still initialised.
        cfg_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)

    datasets = hydra.utils.instantiate(cfg.datasets)
    feature_pipeline = hydra.utils.instantiate(cfg.feature_pipeline, _convert_="partial")

    # Replace the config nodes with live objects and attach the runtime options.
    cfg_dict["datasets"] = datasets
    cfg_dict["feature_pipeline"] = feature_pipeline
    cfg_dict["logger"] = _get_feature_engineering_logger()
    cfg_dict["segment"] = segment
    cfg_dict["resample"] = resample
    cfg_dict["overlap_pct"] = 0.0 if overlap_pct is None else overlap_pct

    return cfg_dict


In [43]:
# Hamilton driver built over the feature-extraction pipeline module, with an
# empty driver config.
dr = (
    driver.Builder()
    .with_config({})
    .with_modules(feature_extraction_pipeline)
    .build()
)

In [44]:
# Evaluates the driver object; the trailing ";" suppresses the cell's output.
dr;

In [59]:
# NOTE(review): in the visible part of this notebook `df` is only assigned
# inside the feature-extraction loop of a later cell, so on a fresh kernel run
# top to bottom this cell would presumably raise NameError — confirm, then move
# or remove it.
df

Unnamed: 0,waveform_id,bearing_id,fault_type,severity,load,rpm,fs,bpfo,bpfi,bsf,ftf,inner,outer,ball,cage,duration,vibration,waveform_id_seg
0,16_Healthy_0,16,Healthy,0,400,1780,42000,3.05,4.95,1.986,0.381,0,0,0,0,10.0,"[-7.696933, -7.632468, -7.664701, -7.858096, -...",16_Healthy_0_1
1,16_Healthy_0,16,Healthy,0,400,1780,42000,3.05,4.95,1.986,0.381,0,0,0,0,10.0,"[1.811675, 1.102559, 1.005861, 1.102559, 1.263...",16_Healthy_0_2
2,16_Healthy_0,16,Healthy,0,400,1780,42000,3.05,4.95,1.986,0.381,0,0,0,0,10.0,"[6.87219, 6.163073, 4.615909, 3.294374, 2.4240...",16_Healthy_0_3
3,16_Healthy_0,16,Healthy,0,400,1780,42000,3.05,4.95,1.986,0.381,0,0,0,0,10.0,"[10.417773, 10.192145, 10.353308, 10.836797, 1...",16_Healthy_0_4
4,16_Healthy_0,16,Healthy,0,400,1780,42000,3.05,4.95,1.986,0.381,0,0,0,0,10.0,"[11.159122, 11.223588, 11.320285, 11.12689, 10...",16_Healthy_0_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,4_Inner_2,4,Inner,2,400,1792,42000,3.05,4.95,1.986,0.381,1,0,0,0,10.0,"[81.490652, 69.725747, 71.466308, 76.333433, 6...",4_Inner_2_2096
2096,4_Inner_2,4,Inner,2,400,1792,42000,3.05,4.95,1.986,0.381,1,0,0,0,10.0,"[75.495385, 75.978875, 69.629049, 63.375923, 6...",4_Inner_2_2097
2097,4_Inner_2,4,Inner,2,400,1792,42000,3.05,4.95,1.986,0.381,1,0,0,0,10.0,"[77.139249, 83.714703, 95.286216, 101.539346, ...",4_Inner_2_2098
2098,4_Inner_2,4,Inner,2,400,1792,42000,3.05,4.95,1.986,0.381,1,0,0,0,10.0,"[40.523009, 40.426311, 48.097669, 53.802839, 5...",4_Inner_2_2099


In [71]:
# Paderborn bearings kept for feature extraction; rows of any other bearing are dropped.
paderborn_bearings = [
    "K001", "K002", "K003", "K004", "K005", "K006",
    "KA04", "KA15", "KA16", "KA22", "KA30",
    "KI04", "KI14", "KI16", "KI18", "KI21", "KI17",
]

# One feature table per dataset, appended in the order of the loop below.
all_features = []
for dataset in ("paderborn", "cwru", "ottawa"):
    # Paderborn is the only dataset given an explicit overlap; the others pass None.
    overlap_pct = 0.04 if dataset == "paderborn" else None
    cfg_dict = get_configs(
        dataset=dataset,
        segment=True,
        resample=False,
        overlap_pct=overlap_pct,
    )

    temp_out = dr.execute(["preprocess_data"], inputs=cfg_dict)
    preprocessed = temp_out["preprocess_data"][0]
    df = preprocessed.metadata

    # Narrow the preprocessed metadata in place so the override passed to the
    # extraction step only carries the selected rows.
    if dataset == "paderborn":
        keep_rows = df["bearing_id"].isin(paderborn_bearings)
        preprocessed.metadata = df[keep_rows].reset_index(drop=True)
    elif dataset == "cwru":
        filtered_cwru = pd.read_csv("data/splits/cwru/filtered_cwru.csv")
        keep_rows = df["waveform_id"].isin(filtered_cwru["waveform_id"])
        preprocessed.metadata = df[keep_rows].reset_index(drop=True)

    features = dr.execute(
        ["extract_features"],
        inputs=cfg_dict,
        overrides={"preprocess_data": temp_out["preprocess_data"]},
    )
    all_features.append(features["extract_features"][0].metadata)


2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadata files
2025-09-19 10:24:08,989 | INFO | Reading metadat

Segmenting dataset Paderborn. Original size: 2560


2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Paderborn
2025-09-19 10:24:11,119 | INFO | Extracting features for Pader

Segmented dataset Paderborn. New size: 10240
Dataset shape before feature extraction: (5440, 23)
Dataset shape after feature extraction: (5440, 38)
Dropping columns: ['bearing_id', 'bearing_manufacturer', 'repetition', 'load_torque', 'radial_force', 'rotational_speed', 'fault_label', 'severity', 'type_of_failure', 'duration', 'date', 'bpfo', 'bpfi', 'bsf', 'ftf', 'multiclass_label', 'inner', 'outer', 'ball', 'fs', 'vibration']


2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadata files
2025-09-19 10:24:47,121 | INFO | Reading metadat

Segmenting dataset CWRU. Original size: 411
Segmented dataset CWRU. New size: 6782
Dataset shape before feature extraction: (1440, 19)


2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadata files
2025-09-19 10:25:04,065 | INFO | Reading metadat

Dataset shape after feature extraction: (1440, 44)
Dropping columns: ['fault_location', 'load', 'rpm', 'fault_type', 'fault_size', 'signal_location', 'fs', 'duration', 'bpfo', 'bpfi', 'bsf', 'ftf', 'inner', 'outer', 'ball', 'multiclass_label', 'signal']


2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extracting features for Ottawa
2025-09-19 10:25:08,649 | INFO | Extract

Segmenting dataset Ottawa. Original size: 60
Segmented dataset Ottawa. New size: 600
Dataset shape before feature extraction: (600, 18)
Dataset shape after feature extraction: (600, 43)
Dropping columns: ['bearing_id', 'fault_type', 'severity', 'load', 'rpm', 'fs', 'bpfo', 'bpfi', 'bsf', 'ftf', 'inner', 'outer', 'ball', 'cage', 'duration', 'vibration']


In [72]:
# Number of distinct source waveforms in each of the three feature tables.
tuple(all_features[i].waveform_id.nunique() for i in range(3))

(1360, 144, 60)

In [73]:
# 4 segments per signal for paderborn, 10 segments per signal for cwru,
# 10 segments per signal for ottawa.
tuple(all_features[i].shape for i in range(3))

((5440, 17), (1440, 27), (600, 27))

In [None]:
# Output files, listed in the same positional order as `all_features`.
segmented_feature_files = (
    'data/features/paderborn_features_segmented.pkl',
    'data/features/cwru_features_segmented.pkl',
    'data/features/ottawa_features_segmented.pkl',
)
for position, target_file in enumerate(segmented_feature_files):
    all_features[position].to_pickle(target_file)