In [None]:
import pandas as pd
import os
import sys
sys.path.append('../')
from vibrodiagnostics import selection, models

# Directory that holds the metadata file and the extracted feature tables.
FEATURES_PATH = '../../datasets/features_data/'

# Fault names matched against the 'fault' column of the metadata (keys) and the
# class name each one is assigned to (values); several raw faults share a class.
# The mapping is also handed to models.fault_labeling.
FAULT_CLASSES = {
    'normal': 'normal',
    'imbalance': 'imbalance',
    'horizontal-misalignment': 'misalignment',
    'vertical-misalignment': 'misalignment',
    'overhang-cage_fault': 'cage fault',
    'underhang-cage_fault': 'cage fault',
    'underhang-ball_fault': 'ball fault',
    'overhang-ball_fault': 'ball fault',
    'overhang-outer_race': 'outer race fault',
    'underhang-outer_race': 'outer race fault',
}

# Path of the MAFAULDA zip archive (not referenced by the other cells shown here).
MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'
# Metadata table, read with 'filename' as index and filtered on 'fault' and 'rpm'.
MAFAULDA_METADATA = os.path.join(FEATURES_PATH, selection.MAFAULDA_METADATA)

# Feature tables; the file names themselves come from the selection module.
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)
TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)

In [None]:
# Centre of the rotational-speed window used by load_dataset_info.
RPM = 2500
# Half-width of that window: rows with rpm in [RPM - RPM_RANGE, RPM + RPM_RANGE] are kept.
RPM_RANGE = 500

def load_dataset_info(anomaly=None):
    """Load the metadata rows of known faults inside the RPM window and label them.

    A row is kept when its ``fault`` value is a key of ``FAULT_CLASSES`` and its
    ``rpm`` lies between ``RPM - RPM_RANGE`` and ``RPM + RPM_RANGE`` (both bounds
    included). A copy of the selection is passed to ``models.fault_labeling``.

    Args:
        anomaly: Optional value forwarded as ``anomaly_severity``. When it is
            ``None`` the keyword is not passed at all.

    Returns:
        The result of ``models.fault_labeling`` for the selected rows.
    """
    metadata = pd.read_csv(MAFAULDA_METADATA, index_col='filename')

    known_fault = metadata['fault'].isin(FAULT_CLASSES)
    rpm_low, rpm_high = RPM - RPM_RANGE, RPM + RPM_RANGE
    rpm_in_window = metadata['rpm'].between(rpm_low, rpm_high, inclusive='both')
    selected = metadata[known_fault & rpm_in_window].copy()

    # Keep the two call shapes apart so fault_labeling's own default applies
    # whenever no anomaly value was requested.
    if anomaly is None:
        return models.fault_labeling(selected, FAULT_CLASSES)
    return models.fault_labeling(selected, FAULT_CLASSES, anomaly_severity=anomaly)



# Labelled metadata restricted to the RPM window (no anomaly_severity passed);
# reused by the RPM-limited count and histogram cells further down.
filenames = load_dataset_info()
filenames

Fault counts without RPM limit

In [None]:
# Label the complete metadata table (no RPM filtering), then report the number
# of rows, the percentage per fault and the absolute count per fault.
metadata_all = pd.read_csv(MAFAULDA_METADATA, index_col='filename')
files_all = models.fault_labeling(metadata_all, FAULT_CLASSES)
fault_counts_all = files_all['fault'].value_counts()

print(len(files_all))
print(fault_counts_all / len(files_all) * 100)
fault_counts_all

Fault counts with RPM limit

In [None]:
# Percentage and absolute number of rows per fault in the RPM-limited selection.
fault_counts_limited = filenames['fault'].value_counts()
print(fault_counts_limited / len(filenames) * 100)
fault_counts_limited

In [None]:
# RPM histogram (100 bins) per fault group of the RPM-limited selection.
# observed=True matches the grouping used for the feature-table histogram in
# this notebook: if 'fault' is categorical, only faults that actually occur are
# plotted and pandas does not warn about the changing default.
filenames.groupby(by='fault', observed=True)['rpm'].plot.hist(bins=100)

In [None]:
# Time-domain feature table exactly as stored on disk, shown before any labelling.
# `features` is reassigned by the following cells.
features = pd.read_csv(TD_FEATURES)
features

In [None]:
# Time-domain features after models.fault_labeling has been applied with the
# FAULT_CLASSES mapping.
td_table = pd.read_csv(TD_FEATURES)
features = models.fault_labeling(td_table, FAULT_CLASSES)
features

In [None]:
# Number of rows per fault value in the current `features` table.
features['fault'].value_counts()

In [None]:
# RPM histogram (100 bins) per fault group of the current `features` table.
features.groupby(by='fault', observed=True)['rpm'].plot.hist(bins=100)

In [None]:
# correlation of features to rpm
x = features[features.columns[~features.columns.isin(selection.METADATA_COLUMNS_ALL) | features.columns.isin(('rpm',))]]
st = x.corr()['rpm'].abs().sort_values(ascending=False)
st.describe()
#st
#.hist(bins=100)

In [None]:
# The 50 largest absolute correlations to rpm (st is sorted in descending order).
st.head(50)

In [None]:
# Frequency-domain features after models.fault_labeling has been applied with
# the FAULT_CLASSES mapping.
fd_table = pd.read_csv(FD_FEATURES)
features = models.fault_labeling(fd_table, FAULT_CLASSES)
features

In [None]:
# Correlation of every column of the current `features` table to rpm.
# Keep all columns that are not metadata, plus 'rpm' itself as the target.
non_metadata = ~features.columns.isin(selection.METADATA_COLUMNS_ALL)
is_rpm = features.columns.isin(('rpm',))
x = features[features.columns[non_metadata | is_rpm]]

# Absolute correlation with rpm, strongest first ('rpm' itself is included).
st = x.corr()['rpm'].abs().sort_values(ascending=False)
st.describe()

Anomaly counts

In [None]:
# Counts of the 'anomaly' column for the time-domain features when labelling
# with anomaly_severity=0.6.
td_table = pd.read_csv(TD_FEATURES)
features = models.fault_labeling(td_table, FAULT_CLASSES, anomaly_severity=0.6)
features['anomaly'].value_counts()

In [None]:
# Counts of the 'anomaly' column for the time-domain features when labelling
# with anomaly_severity=0.9.
td_table = pd.read_csv(TD_FEATURES)
features = models.fault_labeling(td_table, FAULT_CLASSES, anomaly_severity=0.9)
features['anomaly'].value_counts()

In [None]:
# Counts of the 'anomaly' column for the frequency-domain features when
# labelling with anomaly_severity=0.6.
# NOTE(review): the counts are divided by 5 without explanation in this
# notebook; presumably the frequency-domain table holds 5 rows per recording.
# TODO confirm and replace the literal with a named constant.
features = pd.read_csv(FD_FEATURES)
features = models.fault_labeling(features, FAULT_CLASSES, anomaly_severity=0.6)
features['anomaly'].value_counts() / 5

In [None]:
# Counts of the 'anomaly' column for the frequency-domain features when
# labelling with anomaly_severity=0.9.
# NOTE(review): the counts are divided by 5 without explanation in this
# notebook; presumably the frequency-domain table holds 5 rows per recording.
# TODO confirm and replace the literal with a named constant.
features = pd.read_csv(FD_FEATURES)
features = models.fault_labeling(features, FAULT_CLASSES, anomaly_severity=0.9)
features['anomaly'].value_counts() / 5

Anomaly counts with RPM limit

In [None]:
# Counts of the 'anomaly' column for the RPM-limited metadata selection,
# labelled with anomaly_severity=0.6 via load_dataset_info.
features = load_dataset_info(anomaly=0.6)
features['anomaly'].value_counts()

In [None]:
# Counts of the 'anomaly' column for the RPM-limited metadata selection,
# labelled with anomaly_severity=0.9 via load_dataset_info.
features = load_dataset_info(anomaly=0.9)
features['anomaly'].value_counts()