In [1]:
import autorootcwd  # noqa
import pandas as pd
import numpy as np

from src.utils.add_signal_data import (
    add_signal_data_cwru,
    add_signal_data_lva,
    add_signal_data_ottawa,
    add_signal_data_hust,
)

from hamilton import driver
from logging import INFO

import hydra
from hydra import compose, initialize
from omegaconf import OmegaConf
from hydra.core.hydra_config import HydraConfig

from src.data.feature_engineering import feature_extraction_pipeline
from src.utils.logger import setup_logger
from src.data.pydantic_models import BearingDataset
from src.data.preprocess import (
    preprocess_cwru,
    preprocess_lva,
    preprocess_ottawa,
    preprocess_hust,
)

In [2]:
# Load the pickled file-level metadata table of each bearing dataset, in the
# order paderborn, cwru, ottawa, hust.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# point it at files from a trusted source.
paderborn_metadata, cwru_metadata, ottawa_metadata, hust_metadata = map(
    pd.read_pickle,
    (
        "/data/bearing_datasets/paderborn/processed/files_metadata.bz2",
        "/data/bearing_datasets/cwru/processed/files_metadata.bz2",
        "/data/bearing_datasets/ottawa/processed/files_metadata.bz2",
        "/data/bearing_datasets/hust/processed/files_metadata.bz2",
    ),
)

In [3]:
# Inspect the CWRU file-level metadata table loaded above.
cwru_metadata

Unnamed: 0,waveform_id,fault_location,load,rpm,fault_type,fault_size,signal_location,fs,duration,bpfo,bpfi,bsf,ftf,inner,outer,ball,multiclass_label
0,OR@3_7_DE_DE_48_3HP,DE,3,1730,OR@3,7,DE,48000,10.129667,3.5848,5.4152,4.7135,0.39828,0,1,0,2
1,B_21_FE_FE_12_3HP,FE,3,1730,B,21,FE,12000,10.066750,3.0530,4.9469,3.9874,0.38170,0,0,1,3
2,B_7_DE_BA_12_3HP,DE,3,1730,B,7,BA,12000,10.129667,,,,,0,0,0,0
3,OR@3_7_FE_FE_12_3HP,FE,3,1730,OR@3,7,FE,12000,10.143250,3.0530,4.9469,3.9874,0.38170,0,1,0,2
4,OR@6_14_DE_FE_12_3HP,DE,3,1730,OR@6,14,FE,12000,10.165917,3.0530,4.9469,3.9874,0.38170,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,OR@6_7_DE_DE_48_1HP,DE,1,1772,OR@6,7,DE,48000,10.141750,3.5848,5.4152,4.7135,0.39828,0,1,0,2
407,OR@12_7_FE_FE_12_1HP,FE,1,1772,OR@12,7,FE,12000,10.082000,3.0530,4.9469,3.9874,0.38170,0,1,0,2
408,IR_28_DE_DE_12_1HP,DE,1,1772,IR,28,DE,12000,10.112583,3.5848,5.4152,4.7135,0.39828,1,0,0,1
409,OR@3_21_FE_FE_12_1HP,FE,1,1772,OR@3,21,FE,12000,10.051417,3.0530,4.9469,3.9874,0.38170,0,1,0,2


In [4]:
# Point Hydra at the configuration: `config_path` is the folder containing the
# YAML files and `config_name` is the YAML file name without its extension.
config_path = "../configs"
config_name = "feature_engineering"

# Load the configuration through Hydra's Compose API.
with initialize(config_path=config_path, version_base="1.3"):
    # First composition requests the Hydra config as well, so it can be
    # registered on the HydraConfig singleton (simulates @hydra.main).
    cfg_hydra = compose(
        config_name=config_name,
        overrides=["+experiment=extract_features_cwru"],
        return_hydra_config=True,
    )
    HydraConfig.instance().set_config(cfg_hydra)

    # Second composition, without the Hydra config, is the one used below.
    cfg = compose(
        config_name=config_name,
        overrides=["+experiment=extract_features_cwru"],
    )
    # Plain-container copy with interpolations resolved; missing values raise.
    cfg_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)

# Instantiate the objects described by the config.
datasets = hydra.utils.instantiate(cfg.datasets)
feature_pipeline = hydra.utils.instantiate(cfg.feature_pipeline, _convert_="partial")
# update_feature_store = hydra.utils.instantiate(cfg.update_feature_store)

logger = setup_logger(
    name="Feature Engineering",
    log_file="logs/feature_engineering.log",
    format_str="%(asctime)s | %(levelname)s | %(message)s",
    level=INFO,
)

# Replace the config entries with the instantiated objects and attach the
# logger, so the dict can be passed straight to the pipeline as inputs.
cfg_dict.update(
    datasets=datasets,
    feature_pipeline=feature_pipeline,
    logger=logger,
)

In [32]:
# Override run options on the input dict: turn on segmentation and resampling,
# and set overlap_pct to 0.53 (presumably the fractional overlap between
# segments; confirm against the pipeline).
cfg_dict.update(segment=True, resample=True, overlap_pct=0.53)

In [33]:
# Build a Hamilton driver from the feature-extraction pipeline module, using an
# empty build-time config.
dr = (
    driver.Builder()
    .with_config({})
    .with_modules(feature_extraction_pipeline)
    .build()
)

In [34]:
# Display the driver. In the recorded run the display could not execute
# Graphviz's `dot`, so only the plain object repr was shown.
dr

ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

<hamilton.driver.Driver at 0x7f1fdd200c50>

In [35]:
#test = dr.execute(
#    ["preprocess_data"],
#    inputs={
#        "datasets": [
#            BearingDataset(
#                name="CWRU",
#                add_signal_function=add_signal_data_cwru,
#                signal_column="signal",
#                label_column=["inner", "outer", "ball"],
#                preprocess_function=preprocess_cwru,
#                split_function=None,
#            )
#        ],
#        "resample": True,
#        "segment": True,
#        "segment_size": 12000,
#        "overlap_pct": 0.5,
#        "logger": logger,
#    },
#)

In [36]:
# Request only the `preprocess_data` output, feeding the assembled config dict
# as the pipeline inputs.
f = dr.execute(
    ["preprocess_data"],
    inputs=cfg_dict,
)

2025-09-16 13:07:35,396 | INFO | Reading metadata files
2025-09-16 13:07:35,396 INFO: Reading metadata files


Segmenting dataset CWRU. Original size: 411
Segmented dataset CWRU. New size: 7846


In [37]:
# Metadata table of the first preprocessed dataset.
df = f["preprocess_data"][0].metadata

In [38]:
# Shallow copy of the input dict with a copy of the preprocessed metadata
# stored under "preprocess_data".
tst = {**cfg_dict, "preprocess_data": df.copy()}

In [39]:
# Keep only the rows whose waveform_id is listed in
# data/splits/cwru/filtered_cwru.csv. The `pd` alias imported at the top of the
# notebook is used instead of re-importing pandas in the middle of the notebook.
filtered = pd.read_csv("data/splits/cwru/filtered_cwru.csv")
# Filtering always starts from `df`, so re-running this cell gives the same
# result; reset_index(drop=True) renumbers the surviving rows from 0.
f["preprocess_data"][0].metadata = (
    df[df["waveform_id"].isin(filtered["waveform_id"])]
    .reset_index(drop=True)
)

In [42]:
# Number of waveform_ids that do not have exactly 20 rows; 0 means every
# remaining waveform has exactly 20.
f["preprocess_data"][0].metadata["waveform_id"].value_counts().ne(20).sum()

np.int64(0)

In [43]:
# Resume from the preprocessing stage: the filtered `preprocess_data` result is
# passed as an override for that node while requesting `extract_features`.
test_v2 = dr.execute(
    ["extract_features"],
    inputs=cfg_dict,
    overrides={"preprocess_data": f["preprocess_data"]},
)

2025-09-16 13:08:10,012 | INFO | Extracting features for CWRU
2025-09-16 13:08:10,012 INFO: Extracting features for CWRU


Dataset shape before feature extraction: (2880, 19)
Dataset shape after feature extraction: (2880, 44)
Dropping columns: ['fault_location', 'load', 'rpm', 'fault_type', 'fault_size', 'signal_location', 'fs', 'duration', 'bpfo', 'bpfi', 'bsf', 'ftf', 'inner', 'outer', 'ball', 'multiclass_label', 'signal']


In [44]:
# List the columns of the first dataset's metadata after feature extraction.
test_v2['extract_features'][0].metadata.columns

Index(['waveform_id', 'waveform_id_seg', 'acceleration/rms/global',
       'acceleration/pk-pk/global', 'acceleration/kurt/global',
       'acceleration/skewness/global', 'acceleration/fc/global',
       'envelope/spectralPeak/1.0x-bpfo/500-6000',
       'envelope/spectralPeak/2.0x-bpfo/500-6000',
       'envelope/spectralPeak/3.0x-bpfo/500-6000',
       'envelope/spectralPeak/4.0x-bpfo/500-6000',
       'envelope/spectralPeak/5.0x-bpfo/500-6000',
       'envelope/spectralPeak/1.0x-bpfi/500-6000',
       'envelope/spectralPeak/2.0x-bpfi/500-6000',
       'envelope/spectralPeak/3.0x-bpfi/500-6000',
       'envelope/spectralPeak/4.0x-bpfi/500-6000',
       'envelope/spectralPeak/5.0x-bpfi/500-6000',
       'envelope/spectralPeak/1.0x-bsf/500-6000',
       'envelope/spectralPeak/2.0x-bsf/500-6000',
       'envelope/spectralPeak/3.0x-bsf/500-6000',
       'envelope/spectralPeak/4.0x-bsf/500-6000',
       'envelope/spectralPeak/5.0x-bsf/500-6000',
       'envelope/spectralPeak/1.0x-ftf/500-

In [45]:
# Feature table: the metadata of the first dataset returned by `extract_features`.
features = test_v2["extract_features"][0].metadata

In [46]:
# Inspect the extracted feature table.
features

Unnamed: 0,waveform_id,waveform_id_seg,acceleration/rms/global,acceleration/pk-pk/global,acceleration/kurt/global,acceleration/skewness/global,acceleration/fc/global,envelope/spectralPeak/1.0x-bpfo/500-6000,envelope/spectralPeak/2.0x-bpfo/500-6000,envelope/spectralPeak/3.0x-bpfo/500-6000,...,envelope/spectralPeak/1.0x-bsf/500-6000,envelope/spectralPeak/2.0x-bsf/500-6000,envelope/spectralPeak/3.0x-bsf/500-6000,envelope/spectralPeak/4.0x-bsf/500-6000,envelope/spectralPeak/5.0x-bsf/500-6000,envelope/spectralPeak/1.0x-ftf/500-6000,envelope/spectralPeak/2.0x-ftf/500-6000,envelope/spectralPeak/3.0x-ftf/500-6000,envelope/spectralPeak/4.0x-ftf/500-6000,envelope/spectralPeak/5.0x-ftf/500-6000
0,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_21,0.159637,2.302379,3.835579,0.033265,7.666698,0.014211,0.011194,0.019861,...,0.010014,0.019861,0.010374,0.006619,0.004782,0.012108,0.009188,0.008307,0.008868,0.011724
1,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_22,0.160692,2.302379,3.795937,0.006127,7.616353,0.012431,0.009888,0.019129,...,0.009013,0.019129,0.013292,0.006377,0.005737,0.012652,0.006775,0.009449,0.010828,0.007354
2,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_23,0.157640,1.241838,2.937076,-0.015038,4.256873,0.009511,0.009050,0.013859,...,0.008534,0.013859,0.009910,0.005741,0.005406,0.013294,0.009442,0.007023,0.004656,0.008733
3,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_24,0.181250,2.924314,7.932640,0.042621,8.264780,0.012809,0.011590,0.014341,...,0.012212,0.014341,0.011365,0.007985,0.004376,0.023940,0.007411,0.011768,0.010512,0.008210
4,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_25,0.218782,3.834453,12.325305,0.052584,8.773079,0.015604,0.012148,0.016463,...,0.017917,0.016463,0.010903,0.007703,0.006293,0.064142,0.018574,0.024286,0.028592,0.015193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,OR@3_21_FE_FE_12_1HP,OR@3_21_FE_FE_12_1HP_7822,0.160258,1.493132,3.971796,-0.065656,4.715474,0.043181,0.018959,0.011801,...,0.007267,0.005674,0.010887,0.006646,0.007024,0.007359,0.002009,0.030043,0.004676,0.006500
2876,OR@3_21_FE_FE_12_1HP,OR@3_21_FE_FE_12_1HP_7823,0.160571,1.435247,3.877006,-0.049626,4.557658,0.044693,0.019704,0.009350,...,0.009748,0.006899,0.010834,0.007569,0.008723,0.006458,0.004063,0.028381,0.007362,0.006086
2877,OR@3_21_FE_FE_12_1HP,OR@3_21_FE_FE_12_1HP_7824,0.159074,1.392972,3.917184,-0.051443,4.421965,0.039492,0.024137,0.011257,...,0.010545,0.005455,0.010605,0.006388,0.010172,0.007826,0.004886,0.005795,0.033248,0.008002
2878,OR@3_21_FE_FE_12_1HP,OR@3_21_FE_FE_12_1HP_7825,0.160724,1.486384,3.970759,-0.032371,4.999982,0.037540,0.027608,0.010441,...,0.010012,0.007878,0.012475,0.006317,0.008152,0.010660,0.002650,0.007202,0.032262,0.004745


In [47]:
# Discard any rows whose waveform_id is missing.
features = features.dropna(subset=["waveform_id"])

In [48]:
# Persist the segmented CWRU feature table as a pickle file.
features.to_pickle("data/features/cwru_features_segmented.pkl")

In [27]:
# NOTE(review): this cell's execution count (27) is lower than that of the cells
# above, so its recorded output comes from an earlier run; that output shows
# rows with missing waveform_id values. Re-run it after the cells above to
# refresh the display.
test_v2["extract_features"][0].metadata

Unnamed: 0,waveform_id,waveform_id_seg,acceleration/rms/global,acceleration/pk-pk/global,acceleration/kurt/global,acceleration/skewness/global,acceleration/fc/global,envelope/spectralPeak/1.0x-bpfo/500-6000,envelope/spectralPeak/2.0x-bpfo/500-6000,envelope/spectralPeak/3.0x-bpfo/500-6000,...,envelope/spectralPeak/1.0x-bsf/500-6000,envelope/spectralPeak/2.0x-bsf/500-6000,envelope/spectralPeak/3.0x-bsf/500-6000,envelope/spectralPeak/4.0x-bsf/500-6000,envelope/spectralPeak/5.0x-bsf/500-6000,envelope/spectralPeak/1.0x-ftf/500-6000,envelope/spectralPeak/2.0x-ftf/500-6000,envelope/spectralPeak/3.0x-ftf/500-6000,envelope/spectralPeak/4.0x-ftf/500-6000,envelope/spectralPeak/5.0x-ftf/500-6000
10,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_11,0.159637,2.302379,3.835579,0.033265,7.666698,0.002767,0.002683,0.004591,...,0.010450,0.005570,0.003547,0.003840,0.003315,0.004958,0.008653,0.002633,0.005053,0.003902
11,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_12,0.157101,1.241838,2.917226,-0.013452,4.271469,0.004135,0.003246,0.003309,...,0.009329,0.004939,0.004699,0.003078,0.002907,0.004202,0.008112,0.003352,0.005384,0.003243
12,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_13,0.222830,3.834453,11.934818,0.055976,8.613718,0.004194,0.002610,0.005627,...,0.008587,0.004437,0.005476,0.004586,0.003139,0.004516,0.008756,0.001661,0.007023,0.003449
13,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_14,0.170758,2.330427,4.989846,0.009150,7.274491,0.003281,0.002899,0.005695,...,0.009988,0.004305,0.003941,0.003608,0.003049,0.005288,0.001890,0.003120,0.003764,0.002675
14,B_21_FE_FE_12_3HP,B_21_FE_FE_12_3HP_15,0.170553,2.969029,6.038084,0.060255,8.923800,0.004007,0.003542,0.004837,...,0.008876,0.004311,0.004521,0.003517,0.003688,0.004218,0.003293,0.005023,0.005262,0.004294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,,,,,,,,0.042158,0.022042,0.009583,...,0.009126,0.007643,0.010981,0.005718,0.008094,0.007370,0.002956,0.004124,0.032898,0.004995
1436,,,,,,,,0.042579,0.024297,0.009471,...,0.009215,0.009962,0.012953,0.006387,0.006383,0.007829,0.032352,0.006486,0.009452,0.004049
1437,,,,,,,,0.043348,0.018854,0.012007,...,0.007685,0.005528,0.011022,0.006272,0.006996,0.006941,0.003681,0.003939,0.003752,0.030507
1438,,,,,,,,0.040041,0.023629,0.011498,...,0.010298,0.005749,0.010505,0.006497,0.010317,0.007718,0.005157,0.005896,0.032706,0.007716
