In [None]:
import os
from zipfile import ZipFile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer

import sys
sys.path.append('../')
from vibrodiagnostics import (
    pumps,
    extraction
)

In [None]:
# When True, the extraction cells recompute features from the raw archive and
# overwrite the cached CSV files; when False they only read the cached CSVs.
EXTRACT = False

# Filesystem layout (relative to the notebook's working directory).
PATH = '../datasets'
DATASET_PATH = os.path.join(PATH, 'FluidPump.zip')
FEATURES_PATH = os.path.join(PATH, 'features')

# Cached feature table per domain key ('TD' / 'FD').
FEATURES = {
    domain: os.path.join(FEATURES_PATH, filename)
    for domain, filename in (
        ('TD', 'PUMPS_TD.csv'),
        ('FD', 'PUMPS_FD.csv'),
    )
}

# Default `parts` value forwarded to pumps.features_by_domain
# (presumably the number of segments per recording - TODO confirm).
PARTS = 12
# `window` value used for frequency-domain extraction
# (presumably the FFT window length in samples - TODO confirm).
FFT_WINDOW = 1 << 14

In [None]:
def features_time_domain(dataset: ZipFile, filename: str, parts: int = PARTS) -> pd.DataFrame:
    """Compute time-domain features for one file of the dataset archive.

    Thin adapter that delegates to ``pumps.features_by_domain`` using
    ``extraction.time_features_calc`` as the feature calculator.

    Args:
        dataset: Open archive handed through to ``pumps.features_by_domain``.
        filename: File identifier handed through unchanged (presumably a
            member name inside ``dataset`` - TODO confirm).
        parts: Forwarded as the ``parts`` keyword; defaults to ``PARTS``.

    Returns:
        Whatever ``pumps.features_by_domain`` returns (annotated as a
        DataFrame).
    """
    calculator = extraction.time_features_calc
    return pumps.features_by_domain(calculator, dataset, filename, parts=parts)


def features_frequency_domain(dataset: ZipFile, filename: str, parts: int = PARTS) -> pd.DataFrame:
    """Compute frequency-domain features for one file of the dataset archive.

    Thin adapter that delegates to ``pumps.features_by_domain`` using
    ``extraction.frequency_features_calc`` as the feature calculator and
    ``FFT_WINDOW`` as the ``window`` keyword.

    Args:
        dataset: Open archive handed through to ``pumps.features_by_domain``.
        filename: File identifier handed through unchanged (presumably a
            member name inside ``dataset`` - TODO confirm).
        parts: Forwarded as the ``parts`` keyword; defaults to ``PARTS``.

    Returns:
        Whatever ``pumps.features_by_domain`` returns (annotated as a
        DataFrame).
    """
    options = {'window': FFT_WINDOW, 'parts': parts}
    return pumps.features_by_domain(
        extraction.frequency_features_calc, dataset, filename, **options
    )

In [None]:
# Time-domain features: recompute them from the raw archive when EXTRACT is
# set, otherwise reuse the CSV written by an earlier extraction run.
if EXTRACT:
    # The context manager closes the archive as soon as extraction is done,
    # so the file handle is not leaked for the lifetime of the kernel.
    with ZipFile(DATASET_PATH) as dataset:
        features = extraction.load_files_split(dataset, features_time_domain)

    # DataFrame.to_csv does not create missing directories.
    cache_dir = os.path.dirname(FEATURES['TD'])
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    features.to_csv(FEATURES['TD'], index=False)
else:
    features = pd.read_csv(FEATURES['TD'])
features

In [None]:
# Frequency-domain features: recompute them from the raw archive when EXTRACT
# is set, otherwise reuse the CSV written by an earlier extraction run.
if EXTRACT:
    # The context manager closes the archive as soon as extraction is done,
    # so the file handle is not leaked for the lifetime of the kernel.
    with ZipFile(DATASET_PATH) as dataset:
        features = extraction.load_files_split(dataset, features_frequency_domain)

    # DataFrame.to_csv does not create missing directories.
    cache_dir = os.path.dirname(FEATURES['FD'])
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    features.to_csv(FEATURES['FD'], index=False)
else:
    features = pd.read_csv(FEATURES['FD'])
features

In [None]:
# Label counts: number of rows for every (device, position) combination.
# NOTE: `features` is whichever table the most recently executed extraction
# cell assigned (the frequency-domain one when the notebook is run top to bottom).
features.value_counts(subset=['device', 'position']).to_frame()

In [None]:
# Assemble one labelled feature table for every (domain, dim) combination.
domains = ('TD', 'FD')
dimensions = (1, 3)
# Feature-source columns to load for each `dim` value
# (presumably the z axis alone vs. all bearing axes - TODO confirm).
columns = {
    1: ['z'],
    3: pumps.BEARINGS_COLUMNS
}

datasets = []
for domain in domains:
    features_csv = FEATURES[domain]
    for dim in dimensions:
        # Load the selected columns plus the label columns, then derive labels.
        df = pumps.assign_labels(
            extraction.load_features(features_csv, columns[dim], pumps.LABEL_COLUMNS)
        )
        datasets.append({'domain': domain, 'dim': dim, 'data': df})

# Overview table; the 'data' column holds the DataFrame objects themselves.
datasets_domains = pd.DataFrame.from_records(datasets)
datasets_domains

In [None]:
# Range of values in features: one boxplot per feature column, one figure per
# entry of `datasets`.
for row in datasets:
    df = row['data'].drop(columns=['label'])
    print(row['domain'], row['dim'])

    if df.shape[1] == 0:
        # plt.subplots rejects a grid with zero columns; nothing to draw here.
        continue

    # TODO: consider grouping the boxplots by label.
    # squeeze=False keeps `axes` a 2-D array even when there is a single
    # feature column; the default would return a bare Axes that cannot be
    # indexed.
    fig, axes = plt.subplots(1, len(df.columns), figsize=(20, 4), squeeze=False)
    for ax, col in zip(axes.flat, df.columns):
        df.boxplot([col], ax=ax)
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures on
    # backends that keep them alive after show().
    plt.close(fig)

In [None]:
# Range of values in features - Power transform: same boxplots as the raw
# feature ranges, drawn after a standardized Yeo-Johnson transform.
for row in datasets:
    df_raw = row['data'].drop(columns=['label'])
    print(row['domain'], row['dim'])

    if df_raw.empty:
        # The transformer cannot be fitted without rows or columns, and
        # plt.subplots rejects a grid with zero columns.
        continue

    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    # Build a new table rather than overwriting columns in place, keeping the
    # original column names and index.
    df = pd.DataFrame(
        pt.fit_transform(df_raw),
        columns=df_raw.columns,
        index=df_raw.index,
    )

    # squeeze=False keeps `axes` a 2-D array even when there is a single
    # feature column; the default would return a bare Axes that cannot be
    # indexed.
    fig, axes = plt.subplots(1, len(df.columns), figsize=(20, 4), squeeze=False)
    for ax, col in zip(axes.flat, df.columns):
        df.boxplot([col], ax=ax)
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures on
    # backends that keep them alive after show().
    plt.close(fig)