In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif, f_classif

# F score

## Time domain

In [None]:
# Mutual information (MI) is zero if and only if the two random variables are
# independent; higher values mean stronger dependency.
def _score_features(fmetric, frame, columns):
    """Apply `fmetric` to `frame[columns]` against the 'fault' column and return the scores only."""
    target = frame['fault']
    # A categorical target is passed as integer codes (the original behaviour);
    # any other dtype is handed to the metric unchanged instead of failing on `.cat`.
    if isinstance(target.dtype, pd.CategoricalDtype):
        target = target.cat.codes
    result = fmetric(frame[columns], target)
    # Some metrics return a tuple (e.g. scores plus p-values); keep the first element.
    return result[0] if isinstance(result, tuple) else result


def calc_feature_selection_metric(fmetric, dataset, columns, summary=True):
    """Score feature columns against the 'fault' target with a feature-selection metric.

    Parameters
    ----------
    fmetric : callable
        Called as ``fmetric(X, y)``. It must return one score per column of ``X``,
        either directly or as the first element of a tuple.
    dataset : pandas.DataFrame
        Must contain `columns` and a 'fault' column; an 'axis' column is also
        required when ``summary`` is false.
    columns : iterable of str
        Names of the feature columns to score.
    summary : bool, default True
        If true, score the whole dataset at once. Otherwise score each
        ``dataset.groupby('axis')`` group separately.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'feature'. With ``summary=True`` it has a single 'stat' column
        sorted in descending order; otherwise it has one column per 'axis' value,
        with rows in the order of `columns`.
    """
    columns = list(columns)
    feature_index = pd.Index(columns, name='feature')

    if summary:
        scores = _score_features(fmetric, dataset, columns)
        return (pd.DataFrame({'stat': scores}, index=feature_index)
                  .sort_values(by='stat', ascending=False))

    # One score vector per axis; the target is still the full 'fault' column of each group.
    per_axis = {
        axis: _score_features(fmetric, group, columns)
        for axis, group in dataset.groupby('axis')
    }
    return pd.DataFrame(per_axis, index=feature_index)

def calc_f_stat(dataset, columns, summary=True):
    """Score `columns` with `f_classif` via `calc_feature_selection_metric`.

    `dataset`, `columns` and `summary` are forwarded unchanged; see
    `calc_feature_selection_metric` for the shape of the returned frame.
    """
    return calc_feature_selection_metric(
        fmetric=f_classif,
        dataset=dataset,
        columns=columns,
        summary=summary,
    )

def calc_mutual_information(dataset, columns, summary=True, random_state=None):
    """Score `columns` with `mutual_info_classif` via `calc_feature_selection_metric`.

    Parameters
    ----------
    dataset, columns, summary
        Forwarded unchanged to `calc_feature_selection_metric`.
    random_state : int, RandomState instance or None, default None
        Forwarded to `mutual_info_classif`. The estimator is randomised, so pass
        a fixed value to get the same scores on every run. ``None`` keeps the
        previous, unseeded behaviour.
    """
    def seeded_mutual_info(X, y):
        # Bind the seed here so the generic helper can keep calling fmetric(X, y).
        return mutual_info_classif(X, y, random_state=random_state)

    return calc_feature_selection_metric(seeded_mutual_info, dataset, columns, summary)

### Unnormalized features

In [None]:
# Rank the time-domain features by F statistic on the unscaled values.
features = load_time_domain_features(['ax', 'ay', 'az', 'bx', 'by', 'bz'])
fscore = calc_f_stat(features, sel.TD_COLUMNS, summary=True)
fscore.plot(
    kind='bar',
    figsize=(8, 5),
    grid=True,
    xlabel='Feature',
    ylabel='F statistic',
    legend=False,
)
plt.show()

### Normalized features (Result found: F score is independent of scaling)

In [None]:
features = load_time_domain_features(['ax', 'ay', 'az', 'bx', 'by', 'bz'])

standard_transformer = Pipeline(
    steps=[('standard', StandardScaler())]
)
# Alternative scaler. It is not wired into the preprocessor below; swap it in
# for `standard_transformer` to repeat the comparison with min-max scaling.
minmax_transformer = Pipeline(
    steps=[('minmax', MinMaxScaler())]
)

# Scale only the time-domain feature columns; all other columns are passed through.
# The column list is referenced as `sel.TD_COLUMNS`, matching the other cells.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('std', standard_transformer, sel.TD_COLUMNS)
    ],
    verbose_feature_names_out=False
)
features_normalized = preprocessor.fit_transform(features)

# Rebuild a DataFrame using the column names reported by the transformer,
# then restore the categorical dtype of 'fault' on the rebuilt frame.
features_normalized = pd.DataFrame(features_normalized, columns=preprocessor.get_feature_names_out())
features_normalized['fault'] = features_normalized['fault'].astype('category')
fscore = calc_f_stat(features_normalized, sel.TD_COLUMNS, summary=True)
fscore.plot.bar(figsize=(8, 5), grid=True, xlabel='Feature', ylabel='F statistic', legend=False)
plt.show()

### F statistic between each feature, per axis, and the target fault state

In [None]:
# Annotated heatmap of the per-axis F statistics (rows: features, columns: axis values).
fig, ax = plt.subplots(figsize=(10, 5))
sb.heatmap(
    calc_f_stat(features, sel.TD_COLUMNS, summary=False),
    annot=True,
    cmap="Greens",
    ax=ax,
)
plt.show()