In [None]:
# Labeling from extracted features,  .assign(rpm = lambda x: 1500)  # TODO: set according to the device type
# Count classes
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from typing import List, Set, Tuple, Dict
from river import feature_selection, stream, preprocessing
from tqdm.notebook import tqdm
from enum import Enum, auto

from sklearn.metrics import silhouette_score

import re
import os
import sys
sys.path.append('../')
from vibrodiagnostics import selection, models, ranking


# Root directory of the datasets, relative to this notebook
PATH_PREFIX = '../../datasets/'
# Directory with the feature files read by pd.read_csv further below
FEATURES_PATH =  os.path.join(PATH_PREFIX, 'features_data')

# One feature file per domain; the file names are constants of the project's `selection` module
TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

In [None]:
# Per placement: values of the 'fault' column that are kept (the keys) and the
# class name each one is mapped to (the values). load_source filters rows by the
# keys and passes the mapping to models.fault_labeling.
faults = {
    'A': {
        'normal': 'normal',
        'imbalance': 'imbalance',
        'horizontal-misalignment': 'misalignment',
        'vertical-misalignment': 'misalignment',
        'underhang-outer_race': 'outer race fault',
        'underhang-cage_fault': 'cage fault',
        'underhang-ball_fault': 'ball fault'
    },
    'B': {
        'normal': 'normal',
        'imbalance': 'imbalance',
        'horizontal-misalignment': 'misalignment',
        'vertical-misalignment': 'misalignment',
        'overhang-cage_fault': 'cage fault',
        'overhang-ball_fault': 'ball fault',
        'overhang-outer_race': 'outer race fault'
    }
}

# Column-name prefixes of the three axes recorded at each placement
placements = {
    'A': ['ax', 'ay', 'az'],
    'B': ['bx', 'by', 'bz']
}


# Feature domain label -> path of its feature file
domains = {'temporal': TD_FEATURES, 'spectral': FD_FEATURES}
# Predicted variables; an optional numeric suffix (e.g. 'anomaly_80') is parsed by
# load_source as the anomaly severity in percent
target = ['fault', 'anomaly_80']
placement = ['A', 'B']
# False = batch hold-out, True = online (sequenced) hold-out, see load_source
online = [False, True]
# When True, the experiment cells re-run and overwrite the CSV files under best_set/
GENERATE = False


def get_features_list(domains):
    """List the domain-prefixed feature names found in the headers of feature CSV files.

    Column names are expected to look like ``<two-letter axis>_<feature name>``
    (for example ``ax_kurtosis``); the part after the axis prefix is the feature
    name. Columns that do not match the pattern are ignored.

    Parameters
    ----------
    domains : dict
        Maps a domain label (e.g. ``'temporal'``) to the path of a CSV file.

    Returns
    -------
    list of str
        Names formatted as ``'<domain label>_<feature name>'``, in order of first
        appearance within each file.
    """
    features = []
    for dname, dataset in domains.items():
        # Only the header row is needed, so skip parsing the data rows
        header = pd.read_csv(dataset, nrows=0).columns
        names = header.str.extract(r'([a-z]{2})_([a-z\_\-]+)')[1].unique()
        features.extend(
            f'{dname}_{col.strip("_")}' for col in names if not pd.isnull(col)
        )

    return features


# Domain-prefixed feature names ('temporal_<name>' / 'spectral_<name>') read from the
# headers of the two feature files; used below to select columns of the result tables
temporal_columns = get_features_list({'temporal': TD_FEATURES})
spectral_columns = get_features_list({'spectral': FD_FEATURES})
all_columns = temporal_columns + spectral_columns

In [None]:
def load_source(dataset: str, domain: str, row: dict, all: bool = False, window_size=2**14):
    """Load one feature file and return a train/test split of per-feature magnitudes.

    Parameters
    ----------
    dataset : path of the feature CSV file (missing values are replaced by 0).
    domain : 'temporal' or 'spectral'; for 'spectral' only columns ending in
        ``_<window_size>`` are kept and that suffix is removed from their names.
    row : experiment condition. ``row['online']`` is always read; ``row['target']``
        and ``row['placement']`` are read only when ``all`` is False.
    all : when True, skip labeling and placement selection and use placement 'A'
        with the raw 'fault' column as the predicted variable.
    window_size : window-size suffix selecting the spectral columns.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        80 % / 20 % hold-out split. X holds one column per feature with the
        Euclidean norm of its three axis components.

    Notes
    -----
    The online branch sorts and groups by a 'severity_level' column, which is not
    created in this function - presumably models.fault_labeling adds it (TODO confirm).
    """
    features = pd.read_csv(dataset).fillna(0)

    if not all:
        # Labeling anomaly severity levels
        # 'anomaly_80' -> label 'anomaly', severity 0.80; without a number the severity is 0.60
        target = re.search(r'([a-z]+)_?(\d+)?', row['target'])
        anomaly_severity = target.group(2) or '60'
        anomaly_severity = float(anomaly_severity) / 100

        # Choose measurement placement: A or B
        place = row['placement']
        axis = placements[place]
        # Keep only the rows whose fault is listed for this placement
        features = features[features['fault'].isin(tuple(faults[place]))]
        features = models.fault_labeling(features, faults[place], anomaly_severity)

        # Feature columns are those prefixed by one of the placement's axis names
        columns = features.columns.str.startswith(tuple(axis))
        X = features[features.columns[columns]]

        # Select predicted variable column
        label = target.group(1)
        Y = features[label].astype('category')
    else:
        axis = placements['A']
        columns = features.columns.str.startswith(tuple(axis))
        X = features[features.columns[columns]]
        Y = features['fault'].astype('category')


    # Filter columns in feature domain with window size 2**14
    if domain == 'spectral':
        X = X.loc[:,X.columns.str.endswith(f'_{window_size}')]
        # Drop the trailing '_<window size>' part of each column name
        X.columns = X.columns.str.extract(r'(\w+)_\w+$')[0]

    # Calculate feature magnitudes from 3D vector
    feature_names = get_features_list({domain: dataset})
    result = pd.DataFrame()
    for name in feature_names:              
        # Remove prefix: temporal, spectral
        name = re.search(r'[a-z]+_([\w\_]+)', name).group(1)
        vector_dims = [f'{dim}_{name}' for dim in axis]
        # Row-wise Euclidean norm over the three axis columns of this feature
        result[name] = X[vector_dims].apply(np.linalg.norm, axis=1)
    X = result

    # Batch / Online hold-out (balance and event sequencing)
    train_size = 0.8
    if row['online']:
        # Shuffle order within severity level and order event with increasing severity
        groups = [
            df.sample(frac=1, random_state=10)
            for i, df in (
                features.sort_values(by='severity_level').groupby('severity_level')
            )
        ]
        rows = list(pd.concat(groups).index)
        X = X.loc[rows].reset_index(drop=True)
        Y = Y.loc[rows].reset_index(drop=True)

        # Random, unstratified split; sorting by index afterwards restores the event
        # order established above inside both the train and the test part
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, train_size=train_size, random_state=10
        )   
        X_train, X_test, Y_train, Y_test = (
            X_train.sort_index(), X_test.sort_index(),
            Y_train.sort_index(), Y_test.sort_index()
        )

    else:
        # Oversample every class except the majority one, then split with stratification
        oversample = RandomOverSampler(sampling_strategy='not majority', random_state=10)
        X, Y = oversample.fit_resample(X, Y.to_numpy())
        X.reset_index(drop=True, inplace=True)
        Y = pd.Series(Y)

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, train_size=train_size, stratify=Y, random_state=10
        )

    return X_train, X_test, Y_train, Y_test


def run_experiments(conditions: List[dict], exp_output: ranking.ExperimentOutput, pc=3) -> pd.DataFrame:
    """Run one kind of experiment for every condition and collect the results.

    Parameters
    ----------
    conditions : list of dict
        Experiment conditions with the keys 'target', 'placement' and 'online';
        each one is passed to load_source for every entry of the global `domains`.
    exp_output : ranking.ExperimentOutput
        Selects what is computed:
        COUNTS - train/test sample counts (first domain only, one row per condition);
        PCA - explained variances of `pc` components (one row per condition and domain);
        SILHOUETTE - silhouette scores of the 3 best features (one row per condition and domain);
        BEST_SET / BEST_CORR / BEST_F_STAT / BEST_MI - best 3-feature subset, ranked with the
        ranking function's own default, 'corr', 'f_stat' or 'mi' respectively;
        RANKS - rank of every feature.
    pc : int
        Number of principal components handed to the PCA and silhouette helpers.

    Returns
    -------
    pd.DataFrame
        One record per experiment. For the subset and rank outputs each record holds
        the condition plus one '<domain>_<feature>' column per reported feature.

    Raises
    ------
    ValueError
        If `exp_output` is not one of the kinds listed above.
    """
    kinds = ranking.ExperimentOutput
    # These kinds emit one record per (condition, domain) instead of one per condition
    per_domain_kinds = (kinds.PCA, kinds.SILHOUETTE)

    def rank_features(X_train, Y_train, is_online, *method):
        # Online and batch rankers are called the same way; an empty `method`
        # leaves the choice of ranking method to the ranking function itself
        ranker = ranking.online_feature_ranking if is_online else ranking.batch_feature_ranking
        return ranker(X_train, Y_train, *method)

    def best_subset(X_train, Y_train, is_online, *method):
        # Rank features, then choose 3 of them given the highly correlated ones
        ranks = rank_features(X_train, Y_train, is_online, *method)
        synonyms = ranking.compute_correlations(X_train, corr_above=0.95)
        return ranking.best_subset(ranks, synonyms, n=3)

    experiments = []

    for row in tqdm(conditions):
        experiment = row.copy()

        for domain_label, dataset in domains.items():
            X_train, X_test, Y_train, Y_test = load_source(dataset, domain_label, row)

            # Count samples
            if exp_output == kinds.COUNTS:
                n_train, n_test = len(X_train), len(X_test)
                # The total is derived from the split itself; it used to read an
                # undefined/global `X`
                experiment.update({'n_train': n_train, 'n_test': n_test, 'sum': n_train + n_test})
                # Sample counts are only taken from the first domain
                break

            elif exp_output == kinds.PCA:
                experiment = row.copy()
                experiment.update({'domain': domain_label})
                experiment.update(ranking.pca_explained_variances(X_train, pc))
                experiments.append(experiment)
                continue

            elif exp_output == kinds.SILHOUETTE:
                synonyms = ranking.compute_correlations(X_train, corr_above=0.95)
                ranks = rank_features(X_train, Y_train, row['online'])
                best_features = ranking.best_columns(ranks, synonyms, n=3)
                scores = ranking.silhouette_scores(X_train, X_test, Y_train, Y_test, best_features, pc)
                experiment = row.copy()
                experiment.update({'domain': domain_label})
                experiment.update(scores)
                experiments.append(experiment)
                continue

            elif exp_output == kinds.BEST_SET:
                output = best_subset(X_train, Y_train, row['online'])

            elif exp_output == kinds.BEST_CORR:
                output = best_subset(X_train, Y_train, row['online'], 'corr')

            elif exp_output == kinds.BEST_F_STAT:
                output = best_subset(X_train, Y_train, row['online'], 'f_stat')

            elif exp_output == kinds.BEST_MI:
                output = best_subset(X_train, Y_train, row['online'], 'mi')

            elif exp_output == kinds.RANKS:
                output = rank_features(X_train, Y_train, row['online'])

            else:
                raise ValueError(f'Unsupported experiment output: {exp_output!r}')

            # Flatten the (feature, rank) table into '<domain>_<feature>' -> rank columns
            output = output.reset_index()
            experiment.update({
                f'{domain_label}_{feature}': rank
                for feature, rank in zip(output['feature'], output['rank'])
            })

        if exp_output not in per_domain_kinds:
            experiments.append(experiment)

    return pd.DataFrame.from_records(experiments)

In [None]:
# Every combination of predicted variable, sensor placement and learning mode,
# stored as one dict per experiment condition
column_names = ['target', 'placement', 'online']
initial_conditions = list(map(
    lambda combination: dict(zip(column_names, combination)),
    itertools.product(target, placement, online),
))

#### Majority voting: feature in subsets

#### 3 member sets

In [None]:
if GENERATE:
    # DataFrame.to_csv does not create missing directories
    os.makedirs('best_set', exist_ok=True)

    # Experiment kind -> file receiving its best-subset membership table
    membership_outputs = [
        (ranking.ExperimentOutput.BEST_SET, 'best_set/rank_product.csv'),
        (ranking.ExperimentOutput.BEST_CORR, 'best_set/corr.csv'),
        (ranking.ExperimentOutput.BEST_F_STAT, 'best_set/fstat.csv'),
        (ranking.ExperimentOutput.BEST_MI, 'best_set/mi.csv'),
    ]
    for membership_kind, membership_csv in membership_outputs:
        membership = run_experiments(initial_conditions, membership_kind)
        membership.to_csv(membership_csv, index=False)

In [None]:
# Globally best features (batch and online)
def globally_best_batch_features(filename):
    """Plot how often each feature is a member of a best subset.

    Reads the membership table from `filename`, keeps the experiments whose
    target is 'fault' and draws one bar chart for the temporal and one for the
    spectral features. The bars show the number of experiments in which the
    feature's membership flag is True, in descending order.
    """
    best_set_membership = pd.read_csv(filename)

    # Only fault classification is counted here (an anomaly target such as
    # 'anomaly_90' would be the alternative)
    group = best_set_membership[best_set_membership['target'] == 'fault']
    for columns in (temporal_columns, spectral_columns):
        fig, ax = plt.subplots(figsize=(12, 5))
        # Cells that are not True become NaN, so count() yields the number of memberships
        counts = group[columns][group == True].count(axis=0).sort_values(ascending=False)
        ax.grid()
        # Strip the domain prefix ('temporal_' / 'spectral_') from the tick labels
        ax.bar([re.search(r'[a-z]+_(\w+)', s).group(1) for s in counts.index], counts)
        ax.set_xlabel('Feature')
        ax.set_ylabel('Count of best subset memberships')
        plt.show()

In [None]:
# Membership counts for the subsets stored in rank_product.csv (BEST_SET experiments)
globally_best_batch_features('best_set/rank_product.csv')

In [None]:
# Membership counts for the subsets stored in corr.csv (BEST_CORR experiments)
globally_best_batch_features('best_set/corr.csv')

In [None]:
# Membership counts for the subsets stored in fstat.csv (BEST_F_STAT experiments)
globally_best_batch_features('best_set/fstat.csv')

In [None]:
# Membership counts for the subsets stored in mi.csv (BEST_MI experiments)
globally_best_batch_features('best_set/mi.csv')

In [None]:
# Three most frequent best-subset members per domain, one column per learning mode
best_set_membership = pd.read_csv('best_set/rank_product.csv')
agg = pd.DataFrame()
# Group by the scalar key: with a one-element list pandas >= 2.0 yields 1-tuple
# keys such as (False,), which would become the column labels of `agg`
for key, group in best_set_membership.groupby(by='online'):
    t_situation = group[temporal_columns][group == True].count(axis=0).sort_values(ascending=False).head(3)
    f_situation = group[spectral_columns][group == True].count(axis=0).sort_values(ascending=False).head(3)
    agg[key] = pd.concat([t_situation, f_situation]).index
agg

In [None]:
# Three most frequent best-subset members per domain for every (online, target) combination
best_set_membership = pd.read_csv('best_set/rank_product.csv')
agg = pd.DataFrame()
for key, group in best_set_membership.groupby(by=['online', 'target']):
    # Cells that are not True become NaN, so count() is the number of memberships
    t_situation = group[temporal_columns][group == True].count(axis=0).sort_values(ascending=False).head(3)
    f_situation = group[spectral_columns][group == True].count(axis=0).sort_values(ascending=False).head(3)
    # Rows 0-2 hold temporal features, rows 3-5 spectral features
    agg[key] = pd.concat([t_situation, f_situation]).index
agg

#### Rank product: feature ordering

In [None]:
if GENERATE:
    # DataFrame.to_csv does not create missing directories
    os.makedirs('best_set', exist_ok=True)
    best_set_ranks = run_experiments(initial_conditions, ranking.ExperimentOutput.RANKS)    # 6 minutes
    best_set_ranks.to_csv('best_set/ranks.csv', index=False)
    # A bare expression inside an `if` block is not rendered by the notebook
    display(best_set_ranks.head())

In [None]:
best_set_ranks = pd.read_csv('best_set/ranks.csv')
# Globally best features (lower rank is better): batch experiments first, then online
for is_online in (False, True):
    group = best_set_ranks[best_set_ranks['online'] == is_online]
    fig, ax = plt.subplots(1, 2, figsize=(20, 4))
    for i, col in enumerate([temporal_columns, spectral_columns]):
        # Geometric mean of each feature's rank over the experiments of this mode
        graph = group[col].apply(gmean, axis=0).sort_values(ascending=True)
        print(graph)
        ax[i].grid()
        # Strip the domain prefix ('temporal_' / 'spectral_') from the tick labels
        ax[i].bar([re.search(r'[a-z]+_(\w+)', s).group(1) for s in graph.index], graph)
    # The two figures are otherwise indistinguishable
    fig.suptitle('Online' if is_online else 'Batch')
    plt.show()

In [None]:
# Summary: geometric mean of every feature's rank, one column per learning mode
best_set_ranks = pd.read_csv('best_set/ranks.csv')
agg = pd.DataFrame()
# Group by the scalar key: with a one-element list pandas >= 2.0 yields 1-tuple
# keys such as (False,), which would become the column labels of `agg`
for key, group in best_set_ranks.groupby(by='online'):
    agg[key] = group[all_columns].apply(gmean, axis=0)
agg

In [None]:
# Three best-ranked features per domain (lowest geometric-mean rank), one column per learning mode
best_set_ranks = pd.read_csv('best_set/ranks.csv')
agg = pd.DataFrame()
# Group by the scalar key: with a one-element list pandas >= 2.0 yields 1-tuple
# keys such as (False,), which would become the column labels of `agg`
for key, group in best_set_ranks.groupby(by='online'):
    t_situation = group[temporal_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    f_situation = group[spectral_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    agg[key] = pd.concat([t_situation, f_situation]).index
agg

In [None]:
# Summary: geometric mean of every feature's rank for each (online, target) combination
best_set_ranks = pd.read_csv('best_set/ranks.csv')
agg = pd.DataFrame()
for key, group in best_set_ranks.groupby(by=['online', 'target']):
    # One column per group, indexed by the domain-prefixed feature names
    agg[key] = group[all_columns].apply(gmean, axis=0)
agg

In [None]:
# Three best-ranked features per domain for each (online, target) combination
best_set_ranks = pd.read_csv('best_set/ranks.csv')
agg = pd.DataFrame()
for key, group in best_set_ranks.groupby(by=['online', 'target']):
    # Lower rank is better, so the best features are the first ones in ascending
    # order (descending order selected the three worst-ranked features)
    t_situation = group[temporal_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    f_situation = group[spectral_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    agg[key] = pd.concat([t_situation, f_situation]).index
agg

### Best features by experiment
- Majority voting
- Ranking

In [None]:
def extract_feature_names(feature_set):
    """Return the index labels of `feature_set` without their domain prefix.

    Every label is expected to look like '<lowercase prefix>_<name>', e.g.
    'temporal_kurtosis' -> 'kurtosis'. A label that does not match raises
    AttributeError because re.search returns None.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence
    return [re.search(r'[a-z]+_(\w+)', s).group(1) for s in feature_set.index]

def best_featue_set_methods(filename):
    """Tabulate the most frequent best-subset members for every experiment condition.

    The membership table in `filename` is grouped by placement, online flag and
    target. For each group the three temporal and the three spectral features
    with the most True membership flags are listed (prefix removed, sorted
    alphabetically). The result is indexed by (placement, online, target).
    """
    indexer = ['placement', 'online', 'target']
    memberships = pd.read_csv(filename)

    def top_members(frame, columns):
        # Cells that are not True become NaN, so count() is the number of memberships
        flagged = frame[columns][frame == True]
        return flagged.count(axis=0).sort_values(ascending=False).head(3)

    records = []
    for (place, is_online, label), frame in memberships.groupby(by=indexer):
        temporal_best = top_members(frame, temporal_columns)
        spectral_best = top_members(frame, spectral_columns)
        records.append({
            'placement': place,
            'online': is_online,
            'target': label,
            'temporal': sorted(extract_feature_names(temporal_best)),
            'spectral': sorted(extract_feature_names(spectral_best)),
        })

    return pd.DataFrame.from_records(records).set_index(indexer)

In [None]:
# Most frequent subset members per condition, from rank_product.csv (BEST_SET experiments)
best_featue_set_methods('best_set/rank_product.csv')

In [None]:
# Most frequent subset members per condition, from corr.csv (BEST_CORR experiments)
best_featue_set_methods('best_set/corr.csv')

In [None]:
# Most frequent subset members per condition, from fstat.csv (BEST_F_STAT experiments)
best_featue_set_methods('best_set/fstat.csv')

In [None]:
# Most frequent subset members per condition, from mi.csv (BEST_MI experiments)
best_featue_set_methods('best_set/mi.csv')

In [None]:
# Best-ranked features per experiment condition.
# extract_feature_names is defined in the cell that also defines
# best_featue_set_methods; it is not redefined here.
best_set_ranks = pd.read_csv('best_set/ranks.csv')
feature_sets = []
indexer = ['placement', 'online', 'target']
for key, group in best_set_ranks.groupby(by=indexer):
    # Lower rank is better, so the best features are the first ones in ascending
    # order (descending order selected the three worst-ranked features)
    t_situation = group[temporal_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    f_situation = group[spectral_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)

    # Extract feature names
    temporal = sorted(extract_feature_names(t_situation))
    spectral = sorted(extract_feature_names(f_situation))

    fset = {'placement': key[0], 'online': key[1], 'target': key[2], 'temporal': temporal, 'spectral': spectral}
    feature_sets.append(fset)

pd.DataFrame.from_records(feature_sets).set_index(indexer)

#### PCA explained variance (batch only)

In [None]:
# Batch-only conditions: every target/placement combination with online=False
column_names = ['target', 'placement', 'online']
batch_initial_conditions = list(map(
    lambda combination: dict(zip(column_names, combination)),
    itertools.product(target, placement, [False]),
))
pca_vars = run_experiments(batch_initial_conditions, ranking.ExperimentOutput.PCA)
pca_vars

In [None]:
def plot_explained_variances(pca_vars_in):
    """Draw stacked bars of the PC1-PC3 explained variances in a 2x2 grid.

    Rows of the grid are the feature domains (temporal, spectral), columns are
    the placements (A, B). Within a panel there is one stacked bar per target.
    `pca_vars_in` needs the columns 'placement', 'domain', 'target', 'PC1',
    'PC2' and 'PC3'.
    """
    selected_columns = ['target', 'PC1', 'PC2', 'PC3']
    groupby_columns = ['target']

    # Select the data of all four panels before any figure is created
    panels = []
    for grid_row, domain in enumerate(('temporal', 'spectral')):
        for grid_col, place in enumerate(('A', 'B')):
            selection_mask = (pca_vars_in['placement'] == place) & (pca_vars_in['domain'] == domain)
            panel_data = pca_vars_in[selection_mask][selected_columns].set_index(groupby_columns)
            panel_title = f'{domain.capitalize()} features, Placement: {place}'
            panels.append((grid_row, grid_col, panel_data, panel_title))

    fig, ax = plt.subplots(2, 2, figsize=(8, 8))
    for grid_row, grid_col, panel_data, panel_title in panels:
        panel_data.plot.bar(
            stacked=True,
            grid=True,
            ax=ax[grid_row][grid_col],
            title=panel_title,
            xlabel='',
            ylabel='Explained variance',
        )
    plt.tight_layout()
    plt.show()

# No RPM limit
# Plot the explained variances returned by the PCA experiment run
plot_explained_variances(pca_vars)

#### Silhouette scores

In [None]:
# ExperimentOutput is only reachable through the imported `ranking` module
scores = run_experiments(batch_initial_conditions, ranking.ExperimentOutput.SILHOUETTE)
scores

In [None]:
def plot_silhouette_scores(scores, cols):
    """Draw grouped bars of silhouette scores in a 2x2 grid.

    Rows of the grid are the feature domains (temporal, spectral), columns are
    the placements (A, B). Within a panel there is one group of bars per target
    and one bar per entry of `cols` (e.g. ['train', 'test']).

    The figure is not shown here, so the caller can still add a title before
    calling plt.show().
    """
    selected_columns = ['target'] + cols
    groupby_columns = ['target']

    # Select the data of all four panels before any figure is created
    panels = []
    for grid_row, domain in enumerate(('temporal', 'spectral')):
        for grid_col, place in enumerate(('A', 'B')):
            selection_mask = (scores['placement'] == place) & (scores['domain'] == domain)
            panel_data = scores[selection_mask][selected_columns].set_index(groupby_columns)
            panel_title = f'{domain.capitalize()} features, Placement: {place}'
            panels.append((grid_row, grid_col, panel_data, panel_title))

    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    for grid_row, grid_col, panel_data, panel_title in panels:
        panel_data.plot.bar(
            grid=True,
            ax=ax[grid_row][grid_col],
            title=panel_title,
            xlabel='',
            # Every panel shows scores; three of them were labelled 'Silhouette variance'
            ylabel='Silhouette score',
        )
    plt.tight_layout()

# Silhouette scores from the 'train' and 'test' columns of the experiment output
plot_silhouette_scores(scores, ['train', 'test'])
#plt.suptitle('Best features, all RPM')
plt.show()

In [None]:
# Silhouette scores from the 'train_pca' and 'test_pca' columns of the experiment output
plot_silhouette_scores(scores, ['train_pca', 'test_pca'])
plt.suptitle('PCA, all RPM')
plt.show()

#### Dataset sizes

In [None]:
# ExperimentOutput is only reachable through the imported `ranking` module
counters = run_experiments(initial_conditions, ranking.ExperimentOutput.COUNTS)
counters.head()

Feature distribution in different classes

In [None]:
def boxplot_features(conditions: List[dict]):
    """Print variance shares and draw boxplots of the training features.

    For every condition and every feature domain the training split is loaded
    with load_source. The function then prints each feature's share of the total
    variance (features are not scaled) and shows two figures: a boxplot of all
    features, and one boxplot per feature split by the predicted variable.
    """
    for row in tqdm(conditions):
        for domain_label, dataset in domains.items():
            X_train, X_test, Y_train, Y_test = load_source(dataset, domain_label, row)

            # Share of the total variance per feature; var() equals the diagonal
            # of the covariance matrix, cov(A, A) = var(A)
            variance_share = X_train.var()
            variance_share = (variance_share / variance_share.sum()).sort_values(ascending=False)
            print(row, domain_label)
            print(variance_share)

            # Features only: drawn before the target is attached so that a
            # numeric target cannot appear as an extra box
            X_train.boxplot(figsize=(15, 5))
            plt.show()

            # Show boxplots split by predicted variable. assign() returns a new
            # frame, so the split returned by load_source is not modified
            labelled = X_train.assign(target=Y_train)
            labelled.boxplot(figsize=(20, 5), layout=(2, 6), by='target', sharey=False)
            plt.show()

# Batch-only conditions (online=False). This repeats the definition used by the
# PCA cell, so the cell also runs when that one was skipped.
column_names = ['target', 'placement', 'online']
batch_initial_conditions = [
    dict(zip(column_names, row)) 
    for row in itertools.product(target, placement, [False])
]
boxplot_features(batch_initial_conditions)

In [None]:
def load_whole_dataset(dataset: str, domain: str, window_size=2**14):
    """Load a whole feature file for placement 'A' without labeling or splitting.

    Missing values are replaced by 0. For the 'spectral' domain only columns
    ending in ``_<window_size>`` are used and that suffix is dropped.

    Returns
    -------
    X : pd.DataFrame
        One column per feature holding the Euclidean norm of its three axis components.
    Y : pd.Series
        The 'rpm' column as a categorical series.
    """
    features = pd.read_csv(dataset).fillna(0)

    axis = placements['A']
    axis_mask = features.columns.str.startswith(tuple(axis))
    X = features.loc[:, axis_mask]
    Y = features['rpm'].astype('category')

    # Filter columns in feature domain with window size 2**14
    if domain == 'spectral':
        window_suffix = f'_{window_size}'
        X = X.loc[:, X.columns.str.endswith(window_suffix)]
        X.columns = X.columns.str.extract(r'(\w+)_\w+$')[0]

    # Calculate feature magnitudes from 3D vector
    magnitudes = pd.DataFrame()
    for prefixed_name in get_features_list({domain: dataset}):
        # Remove prefix: temporal, spectral
        name = re.search(r'[a-z]+_([\w\_]+)', prefixed_name).group(1)
        components = X[[f'{dim}_{name}' for dim in axis]]
        magnitudes[name] = components.apply(np.linalg.norm, axis=1)

    return magnitudes, Y

In [None]:
# Shared box-plot styling (also used by the spectral cell)
boxprops = dict(linewidth=1, color='k')
medianprops = dict(linewidth=2, color='k')
X, Y = load_whole_dataset(TD_FEATURES, 'temporal')
X.plot(
    kind='box',
    subplots=True,
    sharey=False,
    figsize=(20, 5),
    grid=True,
    boxprops=boxprops,
    medianprops=medianprops,
    whiskerprops=boxprops,
    capprops=boxprops
)

# Correlation of every feature with Y (kept for inspection, not displayed)
corrs = {col: np.corrcoef(X[col], Y)[0, 1] for col in X.columns}

plt.subplots_adjust(wspace=0.6)
plt.show()

# Scale every feature to [0, 1] so that variances are comparable
x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

X_td = X.copy()
pca_td = PCA(n_components=10)
X_pca = pca_td.fit_transform(x_scaled)
print(pca_td.explained_variance_ratio_)
print(np.cumsum(pca_td.explained_variance_ratio_))

# Population variance (ddof=0) of every scaled feature as a one-column frame.
# Named `feature_variances` so the builtin vars() is not shadowed.
feature_variances = x_scaled.var(ddof=0).to_frame()

# Share of the total variance per feature, in percent
(100 * (feature_variances / feature_variances.sum())).sort_values(by=0, ascending=False)

In [None]:
WIN_SIZE = 2**14
X, Y = load_whole_dataset(FD_FEATURES, 'spectral', WIN_SIZE)
X.plot(
    kind='box',
    subplots=True,
    sharey=False,
    figsize=(20, 5),
    grid=True,
    boxprops=boxprops,
    medianprops=medianprops,
    whiskerprops=boxprops,
    capprops=boxprops
)

# Correlation of every feature with Y (kept for inspection, not displayed)
corrs = {col: np.corrcoef(X[col], Y)[0, 1] for col in X.columns}

plt.subplots_adjust(wspace=0.5)
plt.show()

# Scale every feature to [0, 1] so that variances are comparable
x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

X_fd = X.copy()
pca_fd = PCA(n_components=10)
X_pca = pca_fd.fit_transform(x_scaled)
print(pca_fd.explained_variance_ratio_)
print(np.cumsum(pca_fd.explained_variance_ratio_))

# Population variance (ddof=0) of every scaled feature as a one-column frame.
# Named `feature_variances` so the builtin vars() is not shadowed.
feature_variances = x_scaled.var(ddof=0).to_frame()

# Explained variances: share of the total variance per feature, in percent
(100 * (feature_variances / feature_variances.sum())).sort_values(by=0, ascending=False)

Explained variance by PCA components

In [None]:
# Cumulative explained variance of the temporal and spectral PCA models
fig, ax = plt.subplots(figsize=(8, 4))
for curve_label, fitted_pca in (('Temporal features', pca_td), ('Spectral features', pca_fd)):
    cumulative = 100 * np.cumsum(fitted_pca.explained_variance_ratio_)
    # x positions follow the number of fitted components instead of a fixed 1..10
    ax.plot(np.arange(1, len(cumulative) + 1), cumulative, marker='s', label=curve_label)
ax.set_xlabel('Number of principal components')
ax.set_ylabel('Percentage of explained variance')
ax.grid()
ax.legend()
plt.show()

Loadings plot
- https://www.jcchouinard.com/python-pca-biplots-machine-learning/
- https://support.minitab.com/en-us/minitab/21/help-and-how-to/statistical-modeling/multivariate/how-to/principal-components/interpret-the-results/key-results/

In [None]:
loadings = pca_td.components_
n_features = pca_td.n_features_in_
feature_names = X_td.columns
# components_ has one row per fitted component, so the labels are generated from
# its row count (the feature count only worked because zip() truncates)
pc_list = [f'PC{i}' for i in range(1, loadings.shape[0] + 1)]

# Match PC names to loadings
pc_loadings = dict(zip(pc_list, loadings))

# Table of component coefficients: one row per feature, one column per PC
loadings_df = pd.DataFrame.from_dict(pc_loadings)
loadings_df['feature_names'] = feature_names
loadings_df = loadings_df.set_index('feature_names')
loadings_df[['PC1', 'PC2']]

In [None]:
from adjustText import adjust_text

def loading_plot(loadings, feature_names, bottom, top):
    """Draw the loadings of the first two components as labelled arrows.

    `loadings[0]` and `loadings[1]` supply the x and y coordinates; entry i of
    each belongs to `feature_names[i]`. Both axes are limited to [bottom, top].
    The figure is shown before the function returns.
    """
    first_pc, second_pc = loadings[0], loadings[1]

    labels = []
    # One arrow from the origin per feature, with the feature name at its tip
    for position, feature in enumerate(feature_names):
        dx = first_pc[position]
        dy = second_pc[position]
        plt.arrow(0, 0, dx, dy, color='r', head_width=0.01)
        labels.append(plt.text(dx, dy, feature))

    # Let adjustText move overlapping labels, vertically only
    adjust_text(labels, only_move={'points':'y', 'texts':'y'})

    # Define the axes
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.xlim(bottom, top)
    plt.ylim(bottom, top)
    plt.grid()
    plt.show()

# Loadings of the temporal and of the spectral PCA model.
# NOTE: loading_plot already ends with plt.show(), so the calls below have nothing left to show.
loading_plot(pca_td.components_, X_td.columns, -0.5, 1)
plt.show()
loading_plot(pca_fd.components_, X_fd.columns, -0.5, 1)
plt.show()