In [None]:
import re
import os
import sys
import itertools
from typing import List, Set, Tuple, Dict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler

from tqdm.notebook import tqdm
from enum import Enum, auto

import sys
sys.path.append('../')
from vibrodiagnostics import mafaulda, visualize, ranking

In [None]:
def _rank_features(X_train, Y_train, online, *method):
    """Rank features with the online or the batch ranking helper.

    ``method`` is forwarded unchanged as an optional extra positional argument
    (for example ``'corr'``); when omitted the helper's own default is used.
    """
    rank = ranking.online_feature_ranking if online else ranking.batch_feature_ranking
    return rank(X_train, Y_train, *method)


def run_experiments(conditions: List[dict], exp_output: ranking.ExperimentOutput, pc=3) -> pd.DataFrame:
    """Run one kind of experiment for every condition and collect the results.

    Parameters
    ----------
    conditions
        One dict of settings per experiment. Each dict is passed to
        ``mafaulda.load_source``; its ``'online'`` entry selects online versus
        batch feature ranking.
    exp_output
        Member of ``ranking.ExperimentOutput`` selecting what is computed.
    pc
        Forwarded to ``ranking.pca_explained_variances`` and
        ``ranking.silhouette_scores``.

    Returns
    -------
    pd.DataFrame
        One record per condition. For ``PCA`` and ``SILHOUETTE`` there is one
        record per condition *and* domain, with an extra ``'domain'`` column.
        For the ranking outputs the feature columns are prefixed with the
        domain label (``TD_`` / ``FD_``).

    Raises
    ------
    ValueError
        If ``exp_output`` is not one of the handled experiment outputs.
    """
    modes = ranking.ExperimentOutput
    # Extra argument handed to the ranking helper by each best-subset mode;
    # an empty tuple keeps the helper's default.
    subset_rank_args = {
        modes.BEST_SET: (),
        modes.BEST_CORR: ('corr',),
        modes.BEST_F_STAT: ('f_stat',),
        modes.BEST_MI: ('mi',),
    }
    per_domain_modes = (modes.PCA, modes.SILHOUETTE)
    corr_above = 0.95   # correlation threshold passed to compute_correlations
    subset_size = 3     # number of features requested from best_columns / best_subset

    experiments = []
    for row in tqdm(conditions):
        experiment = row.copy()

        for domain_label in ('TD', 'FD'):
            X_train, X_test, Y_train, Y_test = mafaulda.load_source(domain_label, row)

            if exp_output == modes.COUNTS:
                # Sample counts are reported once per condition, from the first
                # domain only. The total is derived from the two splits; the
                # previous code referenced an undefined name ``X`` here.
                experiment.update({
                    'n_train': len(X_train),
                    'n_test': len(X_test),
                    'sum': len(X_train) + len(X_test),
                })
                break

            if exp_output == modes.PCA:
                record = row.copy()
                record['domain'] = domain_label
                record.update(ranking.pca_explained_variances(X_train, pc))
                experiments.append(record)
                continue

            if exp_output == modes.SILHOUETTE:
                synonyms = ranking.compute_correlations(X_train, corr_above=corr_above)
                ranks = _rank_features(X_train, Y_train, row['online'])
                best_features = ranking.best_columns(ranks, synonyms, n=subset_size)
                record = row.copy()
                record['domain'] = domain_label
                record.update(
                    ranking.silhouette_scores(X_train, X_test, Y_train, Y_test, best_features, pc)
                )
                experiments.append(record)
                continue

            if exp_output in subset_rank_args:
                ranks = _rank_features(
                    X_train, Y_train, row['online'], *subset_rank_args[exp_output]
                )
                synonyms = ranking.compute_correlations(X_train, corr_above=corr_above)
                table = ranking.best_subset(ranks, synonyms, n=subset_size)
            elif exp_output == modes.RANKS:
                table = _rank_features(X_train, Y_train, row['online'])
            else:
                raise ValueError(f'Unsupported experiment output: {exp_output!r}')

            # Flatten the (feature -> rank) table into columns of the record,
            # prefixing each feature with its domain so TD and FD do not clash.
            table = table.reset_index()
            experiment.update({
                f'{domain_label}_{feature}': rank
                for feature, rank in zip(table['feature'], table['rank'])
            })

        # PCA and SILHOUETTE already appended one record per domain above.
        if exp_output not in per_domain_modes:
            experiments.append(experiment)

    return pd.DataFrame.from_records(experiments)

In [None]:
# Every combination of placement ('A'/'B') and online flag, as one dict per experiment.
column_names = ['placement', 'online']
initial_conditions = [
    {name: value for name, value in zip(column_names, combo)}
    for combo in itertools.product(['A', 'B'], [False, True])
]

#### Majority voting: feature in subsets

#### 3 member sets

In [None]:
# Compute the best feature subsets once per ranking method and save each table as CSV.
best_set_outputs = {
    'best_set/rank_product.csv': ranking.ExperimentOutput.BEST_SET,
    'best_set/corr.csv': ranking.ExperimentOutput.BEST_CORR,
    'best_set/fstat.csv': ranking.ExperimentOutput.BEST_F_STAT,
    'best_set/mi.csv': ranking.ExperimentOutput.BEST_MI,
}
for csv_path, output_kind in best_set_outputs.items():
    membership = run_experiments(initial_conditions, output_kind)
    membership.to_csv(csv_path, index=False)

In [None]:
# Globally best features (batch and online)
def globally_best_batch_features(filename):
    """Plot how often each feature appears in a best subset, one chart per domain.

    Reads ``filename`` as CSV, treats every column whose name starts with
    ``'TD'`` or ``'FD'`` as a feature of that domain, counts the cells equal to
    ``True`` in each such column over all rows, and draws the counts as a bar
    chart sorted in descending order.

    Parameters
    ----------
    filename
        Path of the CSV file to read.
    """
    best_set_membership = pd.read_csv(filename)

    for domain in ('TD', 'FD'):
        cols = [col for col in best_set_membership if col.startswith(domain)]
        counts = (
            best_set_membership[cols]
            .eq(True)
            .sum(axis=0)
            .sort_values(ascending=False)
        )
        # Strip only the leading domain prefix. The previous greedy pattern
        # '[\\w]+_(\\w+)' kept just the text after the LAST underscore, so names
        # such as 'TD_shape_factor' and 'TD_crest_factor' both became 'factor'.
        labels = [col.partition('_')[2] or col for col in counts.index]

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.grid(True)
        ax.bar(labels, counts)
        ax.set_title(f'{domain} features')
        ax.set_xlabel('Feature')
        ax.set_ylabel('Count of best subset memberships')
        plt.show()

In [None]:
# Membership counts for the subsets saved from the BEST_SET run (rank_product.csv).
globally_best_batch_features('best_set/rank_product.csv')

In [None]:
# Membership counts for the subsets saved from the BEST_CORR run (corr.csv).
globally_best_batch_features('best_set/corr.csv')

In [None]:
# Membership counts for the subsets saved from the BEST_F_STAT run (fstat.csv).
globally_best_batch_features('best_set/fstat.csv')

In [None]:
# Membership counts for the subsets saved from the BEST_MI run (mi.csv).
globally_best_batch_features('best_set/mi.csv')

In [None]:
# Three most frequently selected temporal (TD) and spectral (FD) features,
# one column per value of the 'online' flag.
best_set_membership = pd.read_csv('best_set/rank_product.csv')
temporal_columns = [col for col in best_set_membership if col.startswith('TD')]
spectral_columns = [col for col in best_set_membership if col.startswith('FD')]

top_by_mode = {}
# Group by the bare column name: with a one-element list pandas >= 2.0 yields
# 1-tuple keys, which would label the result columns (False,) / (True,).
for key, group in best_set_membership.groupby('online'):
    t_situation = group[temporal_columns].eq(True).sum(axis=0).sort_values(ascending=False).head(3)
    f_situation = group[spectral_columns].eq(True).sum(axis=0).sort_values(ascending=False).head(3)
    top_by_mode[key] = pd.concat([t_situation, f_situation]).index
# Build the frame once instead of growing an empty DataFrame column by column.
agg = pd.DataFrame(top_by_mode)
agg

#### Rank product: feature ordering

In [None]:
# Rank the features for every condition (RANKS output), save the table as CSV
# and preview its first rows.
best_set_ranks = run_experiments(initial_conditions, ranking.ExperimentOutput.RANKS)
best_set_ranks.to_csv('best_set/ranks.csv', index=False)
best_set_ranks.head()

In [None]:
best_set_ranks = pd.read_csv('best_set/ranks.csv')

# Globally best features (lower rank is better): geometric mean of each feature's
# rank over all rows of one ranking mode, drawn once for batch and once for online.
for online, mode_label in ((False, 'Batch'), (True, 'Online')):
    group = best_set_ranks[best_set_ranks['online'] == online]
    fig, ax = plt.subplots(1, 2, figsize=(20, 4))
    for i, (domain, col) in enumerate([('TD', temporal_columns), ('FD', spectral_columns)]):
        graph = group[col].apply(gmean, axis=0).sort_values(ascending=True)
        print(graph)
        # Strip only the leading domain prefix; the former greedy regex kept just
        # the text after the last underscore, so multi-word names collided.
        labels = [s.partition('_')[2] or s for s in graph.index]
        ax[i].grid()
        ax[i].bar(labels, graph)
        ax[i].set_title(f'{mode_label} ranking: {domain} features')
        ax[i].set_ylabel('Geometric mean of ranks')
    plt.show()

In [None]:
# Geometric mean rank of every feature, one column per value of the 'online' flag.
best_set_ranks = pd.read_csv('best_set/ranks.csv')
all_columns = temporal_columns + spectral_columns
# Group by the bare column name so each key is the plain boolean; a one-element
# list yields 1-tuple keys under pandas >= 2.0.
agg = pd.DataFrame({
    key: group[all_columns].apply(gmean, axis=0)
    for key, group in best_set_ranks.groupby('online')
})
agg

In [None]:
# Three temporal and three spectral features with the lowest geometric mean rank,
# one column per value of the 'online' flag.
best_set_ranks = pd.read_csv('best_set/ranks.csv')
top_ranked = {}
# Group by the bare column name so each key is the plain boolean; a one-element
# list yields 1-tuple keys under pandas >= 2.0.
for key, group in best_set_ranks.groupby('online'):
    t_situation = group[temporal_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    f_situation = group[spectral_columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
    top_ranked[key] = pd.concat([t_situation, f_situation]).index
agg = pd.DataFrame(top_ranked)
agg

In [None]:
# Geometric mean rank of every feature, one column per value of the 'online' flag.
best_set_ranks = pd.read_csv('best_set/ranks.csv')
# Group by the bare column name so each key is the plain boolean; a one-element
# list yields 1-tuple keys under pandas >= 2.0.
agg = pd.DataFrame({
    key: group[all_columns].apply(gmean, axis=0)
    for key, group in best_set_ranks.groupby('online')
})
agg

In [None]:
# Three temporal and three spectral features with the HIGHEST geometric mean rank
# (descending sort), one column per value of the 'online' flag.
best_set_ranks = pd.read_csv('best_set/ranks.csv')
bottom_ranked = {}
# Group by the bare column name so each key is the plain boolean; a one-element
# list yields 1-tuple keys under pandas >= 2.0.
for key, group in best_set_ranks.groupby('online'):
    t_situation = group[temporal_columns].apply(gmean, axis=0).sort_values(ascending=False).head(3)
    f_situation = group[spectral_columns].apply(gmean, axis=0).sort_values(ascending=False).head(3)
    bottom_ranked[key] = pd.concat([t_situation, f_situation]).index
agg = pd.DataFrame(bottom_ranked)
agg

### Best features by experiment
- Majority voting
- Ranking

In [None]:
def extract_feature_names(feature_set):
    """Return the index labels of ``feature_set`` without their leading prefix.

    Everything up to and including the first underscore is dropped, so
    ``'TD_shape_factor'`` becomes ``'shape_factor'``. The previous greedy regex
    kept only the text after the last underscore (``'factor'``). A label that
    contains no underscore is returned unchanged.

    Parameters
    ----------
    feature_set
        Any object with an ``index`` of string labels (e.g. a pandas Series).

    Returns
    -------
    list of str
        The stripped labels, in index order.
    """
    return [label.partition('_')[2] or label for label in feature_set.index]

def best_feature_set_methods(filename):
    """Tabulate the three most frequently selected features per experiment.

    Reads ``filename`` as CSV and, for every (placement, online) group, counts
    the cells equal to ``True`` in each ``TD*`` and ``FD*`` column, keeps the
    three columns with the highest count per domain and lists their feature
    names in alphabetical order.

    Parameters
    ----------
    filename
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        Indexed by ``placement`` and ``online`` with the columns ``TD`` and
        ``FD``, each holding a sorted list of feature names.
    """
    best_set_membership = pd.read_csv(filename)
    # Take the feature columns from THIS file. Relying on column lists computed
    # from another CSV in a different cell fails with KeyError (or silently skips
    # columns) whenever the two files do not contain the same features.
    temporal = [col for col in best_set_membership if col.startswith('TD')]
    spectral = [col for col in best_set_membership if col.startswith('FD')]

    indexer = ['placement', 'online']
    feature_sets = []
    for (placement, online), group in best_set_membership.groupby(by=indexer):
        t_situation = group[temporal].eq(True).sum(axis=0).sort_values(ascending=False).head(3)
        f_situation = group[spectral].eq(True).sum(axis=0).sort_values(ascending=False).head(3)

        feature_sets.append({
            'placement': placement,
            'online': online,
            'TD': sorted(extract_feature_names(t_situation)),
            'FD': sorted(extract_feature_names(f_situation)),
        })

    return pd.DataFrame.from_records(feature_sets).set_index(indexer)

In [None]:
# Most frequently selected features per (placement, online) for the BEST_SET run.
best_feature_set_methods('best_set/rank_product.csv')

In [None]:
# Most frequently selected features per (placement, online) for the BEST_CORR run.
best_feature_set_methods('best_set/corr.csv')

In [None]:
# Most frequently selected features per (placement, online) for the BEST_F_STAT run.
best_feature_set_methods('best_set/fstat.csv')

In [None]:
# Most frequently selected features per (placement, online) for the BEST_MI run.
best_feature_set_methods('best_set/mi.csv')

In [None]:
# NOTE(review): ranks.csv is written from the RANKS run and holds rank values,
# whereas best_feature_set_methods counts cells equal to True — confirm that
# applying it to this file is intended.
best_feature_set_methods('best_set/ranks.csv')

#### PCA explained variance (batch only)

In [None]:
# Every combination of placement ('A'/'B') and online flag, then the PCA
# explained-variance experiment for each of them.
column_names = ['placement', 'online']
batch_initial_conditions = [
    {name: value for name, value in zip(column_names, combo)}
    for combo in itertools.product(['A', 'B'], [False, True])
]
pca_vars = run_experiments(batch_initial_conditions, ranking.ExperimentOutput.PCA)
pca_vars

In [None]:
def plot_stacked(df: pd.DataFrame, selected_columns: List[str], ylabel: str):
    """Draw a 2x2 grid of stacked bar charts, one panel per domain and placement.

    Rows of the grid are the domains ``TD`` and ``FD``; columns are the
    placements ``A`` and ``B``. Each panel stacks ``selected_columns`` for the
    rows of ``df`` that match its domain and placement.

    Parameters
    ----------
    df
        Frame with ``'placement'`` and ``'domain'`` columns plus the columns to plot.
    selected_columns
        Names of the columns stacked in every bar.
    ylabel
        Label of the y axis of every panel.
    """
    domains = ('TD', 'FD')
    placements = ('A', 'B')
    fig, axes = plt.subplots(len(domains), len(placements), figsize=(8, 8))

    panels = itertools.product(enumerate(domains), enumerate(placements))
    for (r, domain), (c, place) in panels:
        in_panel = (df['placement'] == place) & (df['domain'] == domain)
        df.loc[in_panel, selected_columns].plot.bar(
            stacked=True,
            grid=True,
            ax=axes[r][c],
            title=f'{domain} features, Placement: {place}',
            xlabel='',
            ylabel=ylabel,
        )

    plt.tight_layout()
    plt.show()

# Stacked PC1-PC3 explained variance, restricted to the rows with online == False.
plot_stacked(
    pca_vars[pca_vars['online'] == False],
    ['PC1', 'PC2', 'PC3'],
    'Explained variance'
)

#### Silhouette scores

In [None]:
# Silhouette experiment (SILHOUETTE output) for every condition; one row per
# condition and domain.
scores = run_experiments(batch_initial_conditions, ranking.ExperimentOutput.SILHOUETTE)
scores

In [None]:
# Stacked 'train' and 'test' score columns for the rows with online == False.
plot_stacked(scores[scores['online'] == False], ['train', 'test'], 'Silhouette score')

In [None]:
# Stacked 'train_pca' and 'test_pca' score columns for the rows with online == False.
plot_stacked(scores[scores['online'] == False], ['train_pca', 'test_pca'], 'Silhouette score')

Feature distribution in different classes

In [None]:
def boxplot_features(conditions: List[dict]):
    """Print per-feature variance shares and draw box plots for every condition.

    For each condition and each domain (``TD``, ``FD``) the training split is
    loaded with ``mafaulda.load_source``. The share of every feature in the
    total variance is printed in descending order, followed by two box plots:
    one over all columns and one split by the target column.

    Parameters
    ----------
    conditions
        One dict of settings per experiment, passed to ``mafaulda.load_source``.
    """
    domains = ('TD', 'FD')

    for row in tqdm(conditions):
        print(row)

        for domain_label in domains:
            X_train, _, Y_train, _ = mafaulda.load_source(domain_label, row)

            # Diagonal of the covariance matrix: cov(A, A) = var(A), normalised
            # so the values add up to 1.
            train_cov = X_train.cov()
            diagonal_cov = pd.Series(np.diag(train_cov), index=train_cov.columns)
            diagonal_cov = (diagonal_cov / diagonal_cov.sum()).sort_values(ascending=False)
            print(row, domain_label)
            print(diagonal_cov)

            # Add the target on a copy: the former ``X_train_scaled = X_train``
            # was only an alias, so the loaded frame itself was modified.
            labelled = X_train.assign(target=Y_train)
            # Show boxplots, then the same split by the predicted variable.
            labelled.boxplot(figsize=(15, 5))
            plt.show()
            labelled.boxplot(figsize=(20, 5), layout=(2, 6), by='target', sharey=False)
            plt.show()

# Box plots for every combination of placement ('A'/'B') and online flag.
column_names = ['placement', 'online']
batch_initial_conditions = [
    {name: value for name, value in zip(column_names, combo)}
    for combo in itertools.product(['A', 'B'], [False, True])
]
boxplot_features(batch_initial_conditions)

In [None]:
# Box plot of every TD feature for placement A with online == False, followed by
# a 10-component PCA and the per-feature variance shares of the scaled data.
boxprops = dict(linewidth=1, color='k')
medianprops = dict(linewidth=2, color='k')
X, _, Y, _ = mafaulda.load_source('TD', {'placement': 'A', 'online': False})
X.plot(
    kind='box',
    subplots=True,
    sharey=False,
    figsize=(20, 5),
    grid=True,
    boxprops=boxprops,
    medianprops=medianprops,
    whiskerprops=boxprops,
    capprops=boxprops,
)

# Min-max scale each feature and build the frame in one step instead of
# assigning several columns into an empty DataFrame.
x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

X_td = X.copy()
pca_td = PCA(n_components=10)
X_pca = pca_td.fit_transform(x_scaled)
print(pca_td.explained_variance_ratio_)
print(np.cumsum(pca_td.explained_variance_ratio_))

# Population variance (ddof=0, as np.var computes) of every scaled feature, kept
# in a one-column frame labelled 0. Named so that it no longer shadows the
# builtin ``vars``.
feature_variances = x_scaled.var(ddof=0).to_frame(name=0)

# Share of each feature in the total variance, in percent.
(100 * (feature_variances / feature_variances.sum())).sort_values(by=0, ascending=False)

In [None]:
# Box plot of every FD feature for placement A with online == False, followed by
# a 10-component PCA and the per-feature variance shares of the scaled data.
X, _, Y, _ = mafaulda.load_source('FD', {'placement': 'A', 'online': False})
X.plot(
    kind='box',
    subplots=True,
    sharey=False,
    figsize=(20, 5),
    grid=True,
    boxprops=boxprops,
    medianprops=medianprops,
    whiskerprops=boxprops,
    capprops=boxprops,
)

# Min-max scale each feature and build the frame in one step instead of
# assigning several columns into an empty DataFrame.
x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

X_fd = X.copy()
pca_fd = PCA(n_components=10)
X_pca = pca_fd.fit_transform(x_scaled)
print(pca_fd.explained_variance_ratio_)
print(np.cumsum(pca_fd.explained_variance_ratio_))

# Population variance (ddof=0, as np.var computes) of every scaled feature, kept
# in a one-column frame labelled 0. Named so that it no longer shadows the
# builtin ``vars``.
feature_variances = x_scaled.var(ddof=0).to_frame(name=0)

# Explained variances: share of each feature in the total variance, in percent.
(100 * (feature_variances / feature_variances.sum())).sort_values(by=0, ascending=False)

Explained variance by PCA components

In [None]:
# Cumulative percentage of variance explained as principal components are added,
# for the temporal (pca_td) and the spectral (pca_fd) model.
fig, ax = plt.subplots(figsize=(8, 4))
for pca_model, label in ((pca_td, 'Temporal features'), (pca_fd, 'Spectral features')):
    cumulative = 100 * np.cumsum(pca_model.explained_variance_ratio_)
    # Take the x positions from the number of fitted components rather than a
    # hard-coded 1..10, so the plot keeps working if n_components is changed.
    ax.plot(np.arange(1, len(cumulative) + 1), cumulative, marker='s', label=label)
ax.set_xlabel('Number of principal components')
ax.set_ylabel('Percentage of explained variance')
ax.grid()
ax.legend()
plt.show()

Loadings plot
- https://www.jcchouinard.com/python-pca-biplots-machine-learning/
- https://support.minitab.com/en-us/minitab/21/help-and-how-to/statistical-modeling/multivariate/how-to/principal-components/interpret-the-results/key-results/

In [None]:
# Table of the temporal PCA axes: one row per feature, one column per component.
loadings = pca_td.components_
feature_names = X_td.columns

# Name exactly as many components as were fitted. The labels used to be generated
# from the feature count and only matched because zip() dropped the surplus.
pc_list = [f'PC{i}' for i in range(1, loadings.shape[0] + 1)]

# components_ has one row per component, so transpose to get features as rows.
# These are the principal axes as returned by scikit-learn (the weight of each
# feature in each component), not correlation coefficients.
loadings_df = pd.DataFrame(
    loadings.T,
    columns=pc_list,
    index=pd.Index(feature_names, name='feature_names'),
)
loadings_df[['PC1', 'PC2']]

In [None]:
# Loading plots for the temporal (pca_td) and the spectral (pca_fd) PCA model.
# NOTE(review): the trailing -0.5 and 1 are presumably axis limits — confirm
# against visualize.loading_plot.
visualize.loading_plot(pca_td.components_, X_td.columns, -0.5, 1)
visualize.loading_plot(pca_fd.components_, X_fd.columns, -0.5, 1)