In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

import warnings
# Escalate every warning to an exception, so any warning raised by a later
# cell aborts execution instead of being printed and ignored.
warnings.filterwarnings('error')

import sys
# Extend the import path two directories up; presumably this is where the
# project-local `feature` package lives relative to this notebook.
sys.path.append('../../')
from feature import selection as sel

from sklearn.feature_selection import VarianceThreshold

# Location handed to the sel.load_td_feat / sel.load_fd_feat loaders as `path=`.
FEATURES_PATH =  '../../datasets/features_data/'

Feature selection: Pearson correlation
- Calculate the correlation between each fault (and later its severity) and the features, per axis
- Transform the fault label into one binary vector per fault (e.g. is unbalance / is not unbalance), i.e. one-hot encoding

Time domain

In [None]:
def show_time_domain_correlation(corr_table, n=None, figsize=(20, 10)):
    """Draw one bar chart of feature correlation per fault, on a two-row grid.

    Args:
        corr_table: Long-format DataFrame with a categorical 'fault' column
            and 'feature' and 'corr' columns.
        n: If given, only the `n` features with the highest 'corr' are drawn
            for each fault; ``None`` draws all of them.
        figsize: Size of the whole figure, forwarded to ``plt.subplots``.

    The grid always has two rows. The number of columns is rounded up so an
    odd number of faults still gets one panel each; panels left without data
    are hidden.
    """
    num_of_faults = len(corr_table['fault'].cat.categories)
    # Ceiling division; at least one column so subplots() is never asked for an empty grid
    num_of_cols = max(1, (num_of_faults + 1) // 2)
    # squeeze=False keeps `axes` 2-D even with a single column, so axes[:, 0] below is valid
    fig, axes = plt.subplots(2, num_of_cols, figsize=figsize, squeeze=False)
    panels = axes.flatten()

    used = 0
    for i, (fault, rows) in enumerate(corr_table.groupby(by='fault', observed=True)):
        x = rows.sort_values(by='corr', ascending=False)
        if n is not None:
            x = x.head(n)
        sel.plot_bar_chart(panels[i], x['feature'], x['corr'], f'Fault: {fault}')
        used = i + 1

    # Hide spare panels (odd fault count, or categories with no observed rows)
    for ax in panels[used:]:
        ax.set_visible(False)

    # Label only the outer edge of the grid
    for ax in axes[-1]:
        ax.set_xlabel('Feature')
    for ax in axes[:, 0]:
        ax.set_ylabel('Correlation with fault')

    fig.tight_layout()

Ordered importance of features to each fault, based on their Pearson correlation coefficient

In [None]:
# Load the time-domain feature table restricted to ['az'] and pick the
# columns that remain after the metadata columns are filtered out.
features = sel.load_td_feat(['az'], all=True, path=FEATURES_PATH)
columns = sel.filter_out_metadata_columns(features).columns
# Correlation table; later cells read its 'fault', 'feature' and 'corr' columns.
df = sel.corr_features_to_fault(features, columns)
show_time_domain_correlation(df)
# Bare last expression: render the table itself as the cell output.
df

In [None]:
# Reload the time-domain features so this cell does not rely on earlier state
features = sel.load_td_feat(['az'], all=True, path=FEATURES_PATH)
columns = sel.filter_out_metadata_columns(features).columns
X, y = features[columns], features['fault']

# Pair each feature name with its score, then order from highest to lowest
corr = pd.DataFrame(zip(columns, sel.corr_classif(X, y)), columns=['feature', 'corr'])
corr = corr.set_index('feature').sort_values(by='corr', ascending=False)

corr.plot.bar(grid=True, legend=False)
plt.show()

Correlations between faults and time-domain features (unordered)

In [None]:
# Reshape the long correlation table `df` (built in an earlier cell) into a
# fault x feature matrix: one row per fault, one column per feature.
corr_to_class = df.pivot(index='fault', columns='feature', values='corr')
fig, ax = plt.subplots(figsize=(10, 5))
# annot=True prints each correlation value inside its heatmap cell
sb.heatmap(corr_to_class, annot=True, cmap='Greens', ax=ax)
plt.show()

Rank order of features averaged among all fault types
- *Less is better*

In [None]:
# Fault x feature matrix of correlations taken from the long table `df`
corr_fault_to_feat = df.pivot(index='fault', columns='feature', values='corr')

# Rank features within each fault row; ascending=False gives rank 1 to the
# highest correlation, and method='dense' leaves no gaps after ties.
feature_ranks = corr_fault_to_feat.rank(axis='columns', method='dense', ascending=False)
# Average each feature's rank over all faults and sort so the best (lowest) comes first
common_rank = feature_ranks.mean().sort_values().to_frame(name='rank')
common_rank.plot.bar(grid=True, legend=False)
plt.show()

Frequency domain

In [None]:
def show_freq_domain_correlation(corr_table, figsize=(25, 40)):
    """Draw a fault x window grid of bar charts of feature correlation.

    Args:
        corr_table: Long-format DataFrame with categorical 'fault' and
            'window' columns and 'feature' and 'corr' columns.
        figsize: Size of the whole figure, forwarded to ``plt.subplots``.

    Rows of the grid follow the order of the 'fault' categories and columns
    follow the order of the 'window' categories. Each chart is placed by its
    category position, so a (fault, window) pair with no rows leaves its own
    panel hidden instead of shifting the charts that follow it.
    """
    faults = corr_table['fault'].cat.categories
    windows = corr_table['window'].cat.categories

    # squeeze=False keeps `axes` 2-D even for a single fault or a single window
    fig, axes = plt.subplots(
        max(1, len(faults)), max(1, len(windows)), figsize=figsize, squeeze=False
    )

    drawn = set()
    for (fault, win), rows in corr_table.groupby(by=['fault', 'window'], observed=True):
        row, col = faults.get_loc(fault), windows.get_loc(win)
        x = rows.sort_values(by='corr', ascending=False)
        sel.plot_bar_chart(axes[row, col], x['feature'], x['corr'], f'Fault: {fault} \n(Window = {win})')
        drawn.add((row, col))

    # Hide panels for combinations that have no data
    for row in range(axes.shape[0]):
        for col in range(axes.shape[1]):
            if (row, col) not in drawn:
                axes[row, col].set_visible(False)

    # Label only the outer edge of the grid
    for ax in axes[-1]:
        ax.set_xlabel('Feature')
    for ax in axes[:, 0]:
        ax.set_ylabel('Correlation with fault')

    fig.tight_layout()

Overall feature relevance in all windows

In [None]:
# Load the frequency-domain feature table restricted to ['az'] and pick the
# columns that remain after the metadata columns are filtered out.
features = sel.load_fd_feat(['az'], all=True, path=FEATURES_PATH)
columns = sel.filter_out_metadata_columns(features).columns
# NOTE: this rebinds `df`, so the rank cell that follows works on frequency-domain data
df = sel.corr_features_to_fault(features, columns)
# The per-fault plotting helper is reused here despite its time-domain name;
# n=15 keeps only the 15 highest-correlation features per fault.
show_time_domain_correlation(df, n=15)

Rank - less is better

In [None]:
# Fault x feature matrix of correlations taken from the long table `df`
corr_fault_to_feat = df.pivot(index='fault', columns='feature', values='corr')

# Rank features within each fault row; ascending=False gives rank 1 to the
# highest correlation, and method='dense' leaves no gaps after ties.
feature_ranks = corr_fault_to_feat.rank(axis='columns', method='dense', ascending=False)
# Average each feature's rank over all faults and sort so the best (lowest) comes first
common_rank = feature_ranks.mean().sort_values().to_frame(name='rank')
# Wider figure than the time-domain version of this plot
common_rank.plot.bar(grid=True, legend=False, figsize=(20, 5))
plt.show()

How does the importance of features change with window size?
- Columns (→): increasing window size
- Rows (↓): different faults

In [None]:
# Load the frequency-domain feature table restricted to ['az']
features = sel.load_fd_feat(['az'], all=True, path=FEATURES_PATH)
# Rebinds `features` to the helper's output; the plotting function below
# reads its 'fault', 'window', 'feature' and 'corr' columns.
features = sel.calc_corr_in_fft_windows(features)
show_freq_domain_correlation(features)

Most important features by fault type (rank averaged by window size)
- *Less is better*

In [None]:
def weighted_rank_features_corr(features, index, weighted, values='corr'):
    """Rank features per fault and average the ranks over the `index` column.

    For every observed fault, the rows are pivoted into an `index` x feature
    table of `values`. Within each pivot row the features are dense-ranked in
    descending order (rank 1 = largest value), and the ranks are then averaged
    over the pivot rows.

    Args:
        features: Long-format DataFrame with 'fault' and 'feature' columns
            plus the columns named by `index` and `values`.
        index: Column whose distinct values become the pivot rows.
        weighted: If true, every rank is multiplied by the value it was
            derived from before averaging.
        values: Column holding the score to rank. Defaults to 'corr'.

    Returns:
        DataFrame indexed by feature with a 'rank' column (ascending within
        each fault) and a categorical 'fault' column. If `features` has no
        fault groups, an empty frame with those two columns is returned.
    """
    per_fault = []
    for fault, rows in features.groupby(by='fault', observed=True):
        corr_fault_to_feat = rows.pivot(index=index, columns='feature', values=values)
        feature_ranks = corr_fault_to_feat.rank(axis='columns', method='dense', ascending=False)

        if weighted:
            feature_ranks = feature_ranks * corr_fault_to_feat

        common_rank = feature_ranks.mean().sort_values().to_frame(name='rank')
        common_rank['fault'] = fault
        per_fault.append(common_rank)

    if not per_fault:
        return pd.DataFrame({
            'rank': pd.Series(dtype='float64'),
            'fault': pd.Series(dtype='category'),
        })

    # Concatenate once: growing a frame from an empty pd.DataFrame() inside the
    # loop is quadratic and can emit a FutureWarning on recent pandas versions.
    df_ranks = pd.concat(per_fault)
    df_ranks['fault'] = df_ranks['fault'].astype('category')
    return df_ranks


# `features` is the per-window correlation table produced by the previous
# cell (output of sel.calc_corr_in_fft_windows); work on a copy so it is not modified.
features_renamed = features.copy()
# Replace each 'feature' label with the name returned by sel.fd_extract_feature_name
features_renamed['feature'] = sel.fd_extract_feature_name(features['feature'])
# Unweighted mean rank of every feature per fault, averaged over the 'window' values
ranks = weighted_rank_features_corr(features_renamed, index='window', weighted=False)
sel.plot_ranked_features(ranks)