# Feature exploratory data analysis

In [None]:
import sys
sys.path.append('../../')
from feature.selection import load_td_feat, load_fd_feat

# Candidate feature loaders and candidate axis subsets.
# NOTE(review): "td"/"fd" presumably stand for time/frequency domain — confirm in feature.selection.
SOURCES = [load_td_feat, load_fd_feat]
AXIS_SELECT = (['az'], ['ax', 'ay', 'az'])


# Active configuration as a (SOURCES index, AXIS_SELECT index) pair.
# Valid pairs: (0, 0), (0, 1), (1, 0), (1, 1)
SOURCE, AXIS = SOURCES[1], AXIS_SELECT[1]

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif, f_classif

from feature.selection import corr_classif
from feature.models import (
    fault_labeling,
    pipeline_v1,
    cross_cuts_3d,
    cross_cuts_3d_anomalies
)


# Relative path of the directory the feature loaders read from.
FEATURES_PATH = '../../datasets/features_data/'

# Fault class name -> short label, handed to fault_labeling.
FAULT_CLASSES = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM',
}

## Features from one sensor position: (ax, ay, az)

In [None]:
# Load the features for the configured source/axes, then run them through
# fault_labeling together with the class-name mapping.
loaded_features = SOURCE(AXIS, path=FEATURES_PATH)
features = fault_labeling(loaded_features, FAULT_CLASSES)
# Last expression of the cell: preview the first five rows.
features.head(5)

In [None]:
# Histograms of the full feature table.
hist_options = {
    'bins': 100,
    'figsize': (20, 8),
    'color': 'grey',
    'ec': 'black',
}
features.hist(**hist_options)
plt.tight_layout()
plt.show()

### C
Feature subsets for multiclass classification

In [None]:
# For each subset size, print the columns kept by correlation-based selection
# (multiclass mode). The upper bound caps the subset size at 9.
subset_limit = min(len(features.columns) + 1, 10)
for n_selected in range(1, subset_limit):
    # Element 0 of the pipeline result is the training feature table.
    train_table = pipeline_v1(features, 0.6, n_selected, corr_classif, multiclass=True)[0]
    print(n_selected, list(train_table.columns))

### C
Feature selection with correlation

In [None]:
# Run the pipeline with correlation-based selection of 3 features.
# NOTE(review): train=0.6 is presumably the training fraction — confirm in feature.models.
split = pipeline_v1(
    features,
    train=0.6,
    nfeat=3,
    func_select=corr_classif,
)
X_train, X_test, y_train, y_test = split

print()
X_train.info()
# Last expression of the cell: preview the first five training rows.
X_train.head(5)

### C
Statistical distribution of features in training set

In [None]:
# Histograms of the selected training features, arranged in one row of three panels.
train_hist_options = dict(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    ec='black',
)
X_train.hist(**train_hist_options)
plt.show()

### C
Cross-sectional plots for each pair of axes

In [None]:
# Plot the training features (selected by correlation) against their labels
# with the project's cross_cuts_3d helper, then render the figure.
cross_cuts_3d(X_train, y_train)
plt.show()

### F
Feature subsets for multiclass classification

In [None]:
# For each subset size, print the columns kept by F-statistic selection
# (multiclass mode, anomaly_ratio=0.1). Subset size is capped at 9.
largest_k = min(len(features.columns) + 1, 10)
for size in range(1, largest_k):
    result = pipeline_v1(features, 0.6, size, f_classif, multiclass=True, anomaly_ratio=0.1)
    # Element 0 of the pipeline result is the training feature table.
    chosen_columns = list(result[0].columns)
    print(size, chosen_columns)

### F
Feature subsets for binary classification (anomaly)

In [None]:
# For each subset size, print the columns kept by F-statistic selection
# (binary/anomaly mode, anomaly_ratio=0.1). Subset size is capped at 9.
largest_k = min(len(features.columns) + 1, 10)
for size in range(1, largest_k):
    result = pipeline_v1(features, 0.6, size, f_classif, multiclass=False, anomaly_ratio=0.1)
    # Element 0 of the pipeline result is the training feature table.
    chosen_columns = list(result[0].columns)
    print(size, chosen_columns)

### F
Feature selection with F statistic

In [None]:
# Run the pipeline with F-statistic selection of 3 features.
# NOTE(review): train=0.6 is presumably the training fraction — confirm in feature.models.
split = pipeline_v1(
    features,
    train=0.6,
    nfeat=3,
    func_select=f_classif,
)
X_train, X_test, y_train, y_test = split

print()
X_train.info()
# Last expression of the cell: preview the first five training rows.
X_train.head(5)

### F

Statistical distribution of features in training set

In [None]:
# Histograms of the selected training features, arranged in one row of three panels.
train_hist_options = dict(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    ec='black',
)
X_train.hist(**train_hist_options)
plt.show()

### F
Cross-sectional plots for each pair of axes

In [None]:
# Plot the training features (selected by F statistic) against their labels
# with the project's cross_cuts_3d helper, then render the figure.
cross_cuts_3d(X_train, y_train)
plt.show()

### MI
Feature subsets for multiclass classification

In [None]:
# For each subset size, print the columns kept by mutual-information selection
# (multiclass mode, anomaly_ratio=0.1). Subset size is capped at 9.
stop = min(len(features.columns) + 1, 10)
for n_feat in range(1, stop):
    # Element 0 of the pipeline result is the training feature table.
    picked = pipeline_v1(
        features, 0.6, n_feat, mutual_info_classif,
        multiclass=True, anomaly_ratio=0.1,
    )[0]
    print(n_feat, list(picked.columns))

### MI
Feature subsets for binary classification (anomaly)

In [None]:
# For each subset size, print the columns kept by mutual-information selection
# (binary/anomaly mode, anomaly_ratio=0.1). Subset size is capped at 9.
stop = min(len(features.columns) + 1, 10)
for n_feat in range(1, stop):
    # Element 0 of the pipeline result is the training feature table.
    picked = pipeline_v1(
        features, 0.6, n_feat, mutual_info_classif,
        multiclass=False, anomaly_ratio=0.1,
    )[0]
    print(n_feat, list(picked.columns))

### MI
Feature selection with Mutual information

In [None]:
# Run the pipeline with mutual-information selection of 3 features.
# NOTE(review): train=0.6 is presumably the training fraction — confirm in feature.models.
split = pipeline_v1(
    features,
    train=0.6,
    nfeat=3,
    func_select=mutual_info_classif,
)
X_train, X_test, y_train, y_test = split

print()
X_train.info()
# Last expression of the cell: preview the first five training rows.
X_train.head(5)

### MI
Statistical distribution of features in training set

In [None]:
# Histograms of the selected training features, arranged in one row of three panels.
train_hist_options = dict(
    bins=100,
    figsize=(15, 3),
    layout=(1, 3),
    color='grey',
    ec='black',
)
X_train.hist(**train_hist_options)
plt.show()

### MI
Cross-sectional plots for each pair of axes

In [None]:
# Plot the training features (selected by mutual information) against their
# labels with the project's cross_cuts_3d helper, then render the figure.
cross_cuts_3d(X_train, y_train)
plt.show()

### MI

3D distribution of data points

In [None]:
# 3D scatter plot of the first three columns of the training set.
x_name = X_train.columns[0]
y_name = X_train.columns[1]
z_name = X_train.columns[2]
# One coordinate column per axis, in x, y, z order.
coords = [X_train.loc[:, name] for name in (x_name, y_name, z_name)]

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')
ax.scatter(*coords, s=1)
ax.set_box_aspect(aspect=None, zoom=0.85)

# Label each axis with the name of the column it shows.
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)
plt.show()

### MI
Cross-sectional plots of anomalies

In [None]:
# Re-run the pipeline (multiclass, mutual-information selection of 3 features)
# and plot the training features against the 'anomaly' column.
X_train, X_test, y_train, y_test = pipeline_v1(
    features,
    train=0.6,
    nfeat=3,
    multiclass=True,
    func_select=mutual_info_classif,
)

# X_train.index holds index *labels*, so the matching anomaly flags are looked
# up by label with .loc. Positional .iloc is only right when `features` has a
# 0..n-1 RangeIndex; otherwise it picks the wrong rows or raises IndexError.
# NOTE(review): assumes pipeline_v1 keeps the original row labels of `features`
# on X_train — confirm in feature.models.
train_anomaly_flags = features['anomaly'].loc[X_train.index]
cross_cuts_3d_anomalies(X_train, train_anomaly_flags)

# Share of anomalous rows in the whole feature table (not only the training split).
anomaly_mask = features['anomaly'] == True
percentage = len(features[anomaly_mask]) / len(features)
print(f'Percentage of anomalies: {percentage * 100:.2f} %')