Feature exploratory data analysis

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif, f_classif

import sys
sys.path.append('../../')
from feature.selection import load_td_feat
from feature.models import (
    fault_labeling,
    pipeline_v1,
    cross_cuts_3d,
    cross_cuts_3d_anomalies
)


# Relative directory handed to the feature loader as its `path` argument.
FEATURES_PATH = '../../datasets/features_data/'

# Fault class name -> short code; passed to fault_labeling() when labeling features.
FAULT_CLASSES = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM',
}

TD: Features from one sensor position: (ax, ay, az)

In [None]:
# Load the features for the ax / ay / az channels from FEATURES_PATH, then run
# them through fault_labeling() with the FAULT_CLASSES name -> code mapping.
# The labeled result (td_features) is the input to every TD cell below.
features = load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
td_features = fault_labeling(features, FAULT_CLASSES)

In [None]:
# Histogram of every numeric column in the full labeled TD table (no split yet).
td_features.hist(
    bins=100,
    figsize=(20, 8),
    color='grey',
    edgecolor='black',
)
plt.tight_layout()
plt.show()

TD-F: Feature selection with F statistic

In [None]:
# Run the project pipeline on the TD table with the ANOVA F-statistic as the
# scoring function. `train=0.6` / `nfeat=3` are presumably the training
# fraction and the number of features kept — confirm in feature.models.
X_train, X_test, y_train, y_test = pipeline_v1(
    td_features,
    func_select=f_classif,
    train=0.6,
    nfeat=3,
)

print()  # blank line before the info() summary
X_train.info()
X_train.head()

TD-F: Statistical distribution of features in training set

In [None]:
# One histogram per column of the current training frame, laid out in a single row.
X_train.hist(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    edgecolor='black',
)
plt.show()

TD-F: Cross-sectional plots in pairs of axes

In [None]:
# Hand the current training features and their labels to the project plotting
# helper. When the notebook is run top to bottom, X_train / y_train here come
# from the f_classif pipeline call on td_features.
# (presumably the labels are used to distinguish classes in the plots — confirm in feature.models)
cross_cuts_3d(X_train, y_train)
plt.show()

TD-MI: Feature selection with Mutual information

In [None]:
# Same pipeline as the F-statistic run, but scoring features with mutual
# information. This overwrites X_train / X_test / y_train / y_test.
# NOTE(review): mutual_info_classif is randomized unless a random_state is
# fixed; whether pipeline_v1 seeds it is not visible here — confirm if
# run-to-run reproducibility of the selected features matters.
X_train, X_test, y_train, y_test = pipeline_v1(
    td_features,
    func_select=mutual_info_classif,
    train=0.6,
    nfeat=3,
)

print()  # blank line before the info() summary
X_train.info()
X_train.head()

TD-MI: Statistical distribution of features in training set

In [None]:
# One histogram per column of the current training frame, laid out in a single row.
X_train.hist(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    edgecolor='black',
)
plt.show()

TD-MI: Cross-sectional plots in pairs of axes

In [None]:
# Hand the current training features and their labels to the project plotting
# helper. When the notebook is run top to bottom, X_train / y_train here come
# from the mutual_info_classif pipeline call on td_features.
# (presumably the labels are used to distinguish classes in the plots — confirm in feature.models)
cross_cuts_3d(X_train, y_train)
plt.show()

TD-MI: 3D distribution of data points

In [None]:
# 3D scatter of the training set using its first three columns as x / y / z.
x_name, y_name, z_name = X_train.columns[:3]

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')
ax.scatter(X_train[x_name], X_train[y_name], X_train[z_name], s=1)
# Zoom out slightly from the default box.
ax.set_box_aspect(aspect=None, zoom=0.85)
ax.set(xlabel=x_name, ylabel=y_name, zlabel=z_name)
plt.show()

TD-MI: Cross-sectional plots of anomalies

In [None]:
# Re-run the mutual-information pipeline on the TD table with multiclass=True,
# then plot the training features together with their anomaly flags.
X_train, X_test, y_train, y_test = pipeline_v1(
    td_features,
    train=0.6,
    nfeat=3,
    multiclass=True,
    func_select=mutual_info_classif,
)

# X_train.index holds row labels, so the matching anomaly flags are looked up
# by label with .loc. Positional .iloc only gives the same rows when
# td_features has a default 0..n-1 index.
# (assumes pipeline_v1 preserves td_features' index labels — TODO confirm)
train_anomalies = td_features.loc[X_train.index, 'anomaly']
cross_cuts_3d_anomalies(X_train, train_anomalies)

# Share of rows flagged as anomalous in the full (unsplit) TD table.
percentage = td_features['anomaly'].eq(True).mean()
print(f'Percentage of anomalies: {percentage * 100:.2f} %')

FD: Features from one sensor position: (ax, ay, az)

In [None]:
# NOTE(review): this FD section loads its data with load_td_feat — the exact
# call used for the TD section — so fd_features is built from the same loader
# output as td_features. If feature.selection provides a separate loader for
# FD (presumably frequency-domain) features, it was probably intended here —
# TODO confirm.
features = load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
fd_features = fault_labeling(features, FAULT_CLASSES)

FD: Statistical distribution of all features (before the train/test split)

In [None]:
# Histogram of every numeric column in the full labeled FD table (no split yet).
fd_features.hist(
    bins=100,
    figsize=(20, 8),
    color='grey',
    edgecolor='black',
)
plt.tight_layout()
plt.show()

FD: Feature selection with Pearson correlation
- **TODO**

FD-F: Feature selection with F statistic

In [None]:
# Run the project pipeline on the FD table with the ANOVA F-statistic as the
# scoring function. `train=0.6` / `nfeat=3` are presumably the training
# fraction and the number of features kept — confirm in feature.models.
X_train, X_test, y_train, y_test = pipeline_v1(
    fd_features,
    train=0.6,
    nfeat=3,
    func_select=f_classif,
)

print()  # blank line before the info() summary
X_train.info()
X_train.head()

In [None]:
# One histogram per column of the current training frame, laid out in a single row.
X_train.hist(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    edgecolor='black',
)
plt.show()

FD-F: Cross-sectional plots in pairs of axes

In [None]:
# Hand the current training features and their labels to the project plotting
# helper. When the notebook is run top to bottom, X_train / y_train here come
# from the f_classif pipeline call on fd_features.
# (presumably the labels are used to distinguish classes in the plots — confirm in feature.models)
cross_cuts_3d(X_train, y_train)
plt.show()

FD-F: Cross-sectional plots of anomalies

In [None]:
# Re-run the F-statistic pipeline on the FD table with multiclass=False, then
# plot the training features together with their anomaly flags.
X_train, X_test, y_train, y_test = pipeline_v1(
    fd_features,
    train=0.6,
    nfeat=3,
    multiclass=False,
    func_select=f_classif,
)

# X_train.index holds row labels, so the matching anomaly flags are looked up
# by label with .loc. Positional .iloc only gives the same rows when
# fd_features has a default 0..n-1 index.
# (assumes pipeline_v1 preserves fd_features' index labels — TODO confirm)
cross_cuts_3d_anomalies(X_train, fd_features.loc[X_train.index, 'anomaly'])

FD-MI: Feature selection with Mutual information

In [None]:
# Same pipeline as the FD F-statistic run, but scoring features with mutual
# information. This overwrites X_train / X_test / y_train / y_test.
# NOTE(review): mutual_info_classif is randomized unless a random_state is
# fixed; whether pipeline_v1 seeds it is not visible here — confirm if
# run-to-run reproducibility of the selected features matters.
X_train, X_test, y_train, y_test = pipeline_v1(
    fd_features,
    train=0.6,
    nfeat=3,
    func_select=mutual_info_classif,
)

print()  # blank line before the info() summary
X_train.info()
X_train.head()

In [None]:
# One histogram per column of the current training frame, laid out in a single row.
X_train.hist(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    edgecolor='black',
)
plt.show()

FD-MI: Cross-sectional plots in pairs of axes

In [None]:
# Hand the current training features and their labels to the project plotting
# helper. When the notebook is run top to bottom, X_train / y_train here come
# from the mutual_info_classif pipeline call on fd_features.
# (presumably the labels are used to distinguish classes in the plots — confirm in feature.models)
cross_cuts_3d(X_train, y_train)
plt.show()

FD-MI: Cross-sectional plots of anomalies

In [None]:
# Re-run the mutual-information pipeline on the FD table with multiclass=False,
# then plot the training features together with their anomaly flags.
X_train, X_test, y_train, y_test = pipeline_v1(
    fd_features,
    train=0.6,
    nfeat=3,
    multiclass=False,
    func_select=mutual_info_classif,
)

# X_train.index holds row labels, so the matching anomaly flags are looked up
# by label with .loc. Positional .iloc only gives the same rows when
# fd_features has a default 0..n-1 index.
# (assumes pipeline_v1 preserves fd_features' index labels — TODO confirm)
cross_cuts_3d_anomalies(X_train, fd_features.loc[X_train.index, 'anomaly'])