In [None]:
import os
from zipfile import ZipFile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer

import mafaulda
import extraction

In [None]:
# Set EXTRACT to True to recompute the features from the raw archive;
# when False, the cached feature CSV files listed below are read instead.
EXTRACT = False

# Dataset locations, relative to the notebook's working directory.
PATH = '../datasets'
FEATURES_PATH = os.path.join(PATH, 'features')
DATASET_PATH = os.path.join(PATH, 'MAFAULDA.zip')
LABELED_DATASET_PATH = os.path.join(FEATURES_PATH, 'MAFAULDA_LABEL.csv')

# Feature CSV file per domain key: 'TD' (time domain) and 'FD' (frequency domain).
FEATURES = {
    domain: os.path.join(FEATURES_PATH, f'MAFAULDA_{domain}.csv')
    for domain in ('TD', 'FD')
}

# Default value of the `parts` argument of the feature extraction wrappers.
PARTS = 1
# Value passed as `window` to the frequency-domain feature extraction (2**15).
FFT_WINDOW = 1 << 15

In [None]:
def features_time_domain(dataset: ZipFile, filename: str, parts: int = PARTS) -> pd.DataFrame:
    """Compute time-domain features for one file of the dataset archive.

    Thin wrapper that hands `extraction.time_features_calc` to
    `mafaulda.features_by_domain`.

    Args:
        dataset: Opened dataset archive.
        filename: Name of the file inside the archive to process.
        parts: Forwarded as the `parts` keyword (defaults to PARTS).

    Returns:
        Whatever `mafaulda.features_by_domain` returns (annotated as a DataFrame).
    """
    calculator = extraction.time_features_calc
    return mafaulda.features_by_domain(calculator, dataset, filename, parts=parts)


def features_frequency_domain(dataset: ZipFile, filename: str, parts: int = PARTS) -> pd.DataFrame:
    """Compute frequency-domain features for one file of the dataset archive.

    Thin wrapper that hands `extraction.frequency_features_calc` to
    `mafaulda.features_by_domain`, using FFT_WINDOW as the `window` keyword.

    Args:
        dataset: Opened dataset archive.
        filename: Name of the file inside the archive to process.
        parts: Forwarded as the `parts` keyword (defaults to PARTS).

    Returns:
        Whatever `mafaulda.features_by_domain` returns (annotated as a DataFrame).
    """
    calculator = extraction.frequency_features_calc
    options = {'window': FFT_WINDOW, 'parts': parts}
    return mafaulda.features_by_domain(calculator, dataset, filename, **options)

In [None]:
# Time-domain features: recompute them from the raw archive when EXTRACT is set,
# otherwise load the previously saved CSV.
if EXTRACT:
    # Make sure the output folder exists before the extraction starts, so the
    # result is not lost to a missing directory when it is written.
    os.makedirs(os.path.dirname(FEATURES['TD']), exist_ok=True)
    # The context manager closes the archive handle once extraction is done.
    with ZipFile(DATASET_PATH) as dataset:
        features = extraction.load_files_split(dataset, features_time_domain)
    features.to_csv(FEATURES['TD'], index=False)
else:
    features = pd.read_csv(FEATURES['TD'])
features

In [None]:
# Frequency-domain features: recompute them from the raw archive when EXTRACT is
# set, otherwise load the previously saved CSV.
if EXTRACT:
    # Make sure the output folder exists before the extraction starts, so the
    # result is not lost to a missing directory when it is written.
    os.makedirs(os.path.dirname(FEATURES['FD']), exist_ok=True)
    # The context manager closes the archive handle once extraction is done.
    with ZipFile(DATASET_PATH) as dataset:
        features = extraction.load_files_split(dataset, features_frequency_domain)
    features.to_csv(FEATURES['FD'], index=False)
else:
    features = pd.read_csv(FEATURES['FD'])
features

In [None]:
# Display example severities for bearing 'A', based on the time-domain features.
# NOTE(review): the trailing positional True presumably switches on the example
# output inside mafaulda.label_severity - confirm against its signature.
df = mafaulda.label_severity(
    extraction.load_features(FEATURES['TD'], mafaulda.BEARING_A_COLUMNS, mafaulda.LABEL_COLUMNS),
    'A',
    0.5,
    True,
)

In [None]:
# Generate the feature-set variants: every combination of domain, number of
# original columns, bearing ('A', 'B', 'A+B') and labeling scheme.
datasets = []
domains = ('TD', 'FD')
dimensions = (1, 3)
# Source columns per bearing, keyed by how many original columns are used.
columns = {
    'A': {1: ['ay'], 3: mafaulda.BEARING_A_COLUMNS},
    'B': {1: ['by'], 3: mafaulda.BEARING_B_COLUMNS},
}
# Labeling schemes as (severity flag, labeling function); the plain labels are
# generated before the severity labels for each (domain, dim) pair.
labelers = (
    (False, lambda frame, bearing: mafaulda.assign_labels(frame, bearing)),
    (True, lambda frame, bearing: mafaulda.label_severity(frame, bearing, 0.5)),
)

for domain in domains:
    for dim in dimensions:
        for severity, labeler in labelers:
            # The features are loaded anew for every variant, so each labeling
            # function receives its own frame.
            a = labeler(
                extraction.load_features(FEATURES[domain], columns['A'][dim], mafaulda.LABEL_COLUMNS),
                'A',
            )
            b = labeler(
                extraction.load_features(FEATURES[domain], columns['B'][dim], mafaulda.LABEL_COLUMNS),
                'B',
            )
            # Rows of both bearings stacked under a fresh 0..n-1 index.
            ab = pd.concat([a, b]).reset_index(drop=True)

            for bearing, data in (('A', a), ('B', b), ('A+B', ab)):
                datasets.append({
                    'domain': domain,
                    'dim': dim,
                    'bearing': bearing,
                    'severity': severity,
                    'data': data,
                })


# Table of the single-domain feature sets; also used by the analysis cells below.
datasets_domains = pd.DataFrame.from_records(datasets)

# Join columns of features in time and frequency domain
for (dim, bearing, severity), group in datasets_domains.groupby(by=['dim', 'bearing', 'severity']):
    frames = list(group['data'])
    # Feature columns of every domain, side by side, matched row by row.
    df = None  # keep no stale frame around if the concat below fails
    joined = pd.concat(
        [frame.drop(columns=['label']).reset_index(drop=True) for frame in frames],
        axis=1,
    )
    # The feature frames were re-indexed to 0..n-1 above, so the labels must be
    # re-indexed the same way. Assigning the Series with its original index
    # aligns on index values and misplaces labels (or yields NaN) whenever the
    # labeled frame does not carry a default index.
    joined['label'] = frames[0]['label'].reset_index(drop=True)
    df = joined
    datasets.append({'domain': 'TD+FD', 'dim': dim, 'bearing': bearing, 'severity': severity, 'data': joined})


# From here on `datasets` is a DataFrame with one row per feature set.
datasets = pd.DataFrame.from_records(datasets)

In [None]:
# Show the overview table of all generated feature sets
# (columns: domain, dim, bearing, severity, data).
datasets

In [None]:
# Find the number of samples in each class - this is affected by severity and bearing.
# domain only affects the number of feature columns, dim only how many original
# columns the features were computed from.
# Rows    - bearing, severity
# Columns - number of samples of each class
label_counts = []
for (severity, bearing), group in datasets_domains.groupby(by=['severity', 'bearing']):
    # The first feature set of the group is used for the counts.
    labels = group['data'].values[0]['label']
    counts = labels.value_counts().to_dict()
    label_counts.append({
        'bearing': bearing,
        'severity': severity,
        **counts,
        'sum': sum(counts.values()),
    })

pd.DataFrame.from_records(label_counts)

In [None]:
# Range of values in features: one row of box plots (one per feature column)
# for the first feature set of every (domain, dim, bearing) group.
for name, group in datasets_domains.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].values[0].drop(columns=['label'])

    print(name)
    if df.shape[1] == 0:
        # plt.subplots rejects a grid with zero columns; nothing to draw.
        continue

    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when there is only a single feature column.
    fig, axes = plt.subplots(1, len(df.columns), figsize=(20, 4), squeeze=False)
    for axis, col in zip(axes[0], df.columns):
        df.boxplot([col], ax=axis)
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly; many figures are created in this loop.
    plt.close(fig)

In [None]:
# Range of values in features - Power transform: same box plots as for the raw
# features, drawn after a standardized Yeo-Johnson transform of every column.
for name, group in datasets_domains.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].values[0].drop(columns=['label'])

    print(name)
    if df.shape[1] == 0:
        # Neither the transformer nor plt.subplots accepts zero columns.
        continue

    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    # Build a new frame from the transformed array, keeping labels and index.
    df = pd.DataFrame(pt.fit_transform(df), columns=df.columns, index=df.index)

    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when there is only a single feature column.
    fig, axes = plt.subplots(1, len(df.columns), figsize=(20, 4), squeeze=False)
    for axis, col in zip(axes[0], df.columns):
        df.boxplot([col], ax=axis)
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly; many figures are created in this loop.
    plt.close(fig)

In [None]:
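# TODO:
# - Feature EDA: correlation of the features with rpm.
#
# - EDA of the raw signals (later): prepare helper functions first.
#
# - PCA-based EDA of the features.
#
# - Apply the power transform (store the result in a new column next to the data column).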