In [32]:
import itertools
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import plotly.express as px

from IPython.display import display, HTML
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [33]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

# Let's define some constants

datasets = [
    "kuhar",
    "motionsense",
    "uci",
    "wisdm",
    "realworld"
]


labels_activity = {
    0: "sit",
    1: "stand",
    2: "walk",
    3: "stair up",
    4: "stair down",
    5: "run",
    6: "stair up and down",
}

colors_map = {
    "plotly-blue": px.colors.qualitative.Plotly[0],
    "plotly-green": px.colors.qualitative.Plotly[2],
    "plotly-purple": px.colors.qualitative.Plotly[3],
    "plotly-orange": px.colors.qualitative.Plotly[4],
    "plotly-cyan": px.colors.qualitative.Plotly[5],
    "plotly-pink": px.colors.qualitative.Plotly[6],
    "plotly-lightgreen": px.colors.qualitative.Plotly[7],
    "plotly-lightpink": px.colors.qualitative.Plotly[8],
    "plotly-yellow": px.colors.qualitative.Plotly[9],
    "plotly-red": px.colors.qualitative.Plotly[1],
}

colors = list(colors_map.values())
markers = ["circle", "square", "diamond", "x", "triangle-up", "triangle-down", "pentagon"]

In [34]:
# Let's define some filters related to each of the columns

# --- Related to domain ---
def filter_domain_fft_only(df):
    return df[df["transforms"] == "fft"]

def filter_domain_time_only(df):
    return df.loc[df["transforms"] == "time"]


# --- Related to features ---
def filter_features_all(df):
    return df[
        df["in_use_features"].str.contains("accel-x")
        & df["in_use_features"].str.contains("accel-y")
        & df["in_use_features"].str.contains("accel-z")
        & df["in_use_features"].str.contains("gyro-x")
        & df["in_use_features"].str.contains("gyro-y")
        & df["in_use_features"].str.contains("gyro-z")
    ]

def filter_features_accelerometer_only(df):
    return df[
        df["in_use_features"].str.contains("accel-x")
        & df["in_use_features"].str.contains("accel-y")
        & df["in_use_features"].str.contains("accel-z")
        & ~df["in_use_features"].str.contains("gyro-x")
        & ~df["in_use_features"].str.contains("gyro-y")
        & ~df["in_use_features"].str.contains("gyro-z")
    ]

def filter_features_gyroscope_only(df):
    return df[
        ~df["in_use_features"].str.contains("accel-x")
        & ~df["in_use_features"].str.contains("accel-y")
        & ~df["in_use_features"].str.contains("accel-z")
        & df["in_use_features"].str.contains("gyro-x")
        & df["in_use_features"].str.contains("gyro-y")
        & df["in_use_features"].str.contains("gyro-z")
    ]

# --- Related to how reducer is executed ---
def filter_reducer_over_all(df):
    return df[df["reduce_on"] == "all"]

def filter_reducer_over_sensor(df):
    return df[df["reduce_on"] == "sensor"]

def filter_reducer_over_axis(df):
    return df[df["reduce_on"] == "axis"]

# --- Related to the estimator ---
def filter_estimator_rf(df):
    return df[df["estimator"].str.lower().str.contains("randomforest")]

def filter_estimator_svm(df):
    return df[df["estimator"].str.lower().str.contains("svm")]

def filter_estimator_knn(df):
    return df[df["estimator"].str.lower().str.contains("knn")]

# --- Related to the scaler ---
def filter_scaler_none(df):
    return df[df["scaler"] == "no scaler"]

def filter_scaler_minmax(df):
    return df[df["scaler"].str.lower().str.contains("minmaxscaler")]

def filter_scaler_std(df):
   return df[df["scaler"].str.lower().str.contains("standardscaler")]

# --- Related to the view ---
def filter_view_standardized_inter_balanced(df):
    return df[df["view"] == "standartized_intra_balanced"] # typo in the data intra -> inter

def filter_view_standardized_balanced(df):
    return df[(df["view"] == "standartized_balanced")]

def filter_view_raw(df):
    return df[df["view"] == "raw_balanced"]    

# --- Some other useful filters ---
def filter_only_reducer_dataset_equals_train_dataset(df):
    return df[df["reducer_datasets"] == df["train_datasets"]]

def filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df):
    return df[(df["reducer_datasets"] == df["train_datasets"]) | (df["reducer_datasets"] == "")]

def filter_only_reducer_dataset_diferent_from_train(df):
    return df[(df["reducer_datasets"] != df["train_datasets"]) & (df["reducer_datasets"] != "")]

def filter_same_train_test(df):
    return df[df["train_datasets"] == df["test_datasets"]]

def filter_different_train_test(df):
    return df[df["train_datasets"] != df["test_datasets"]]

In [35]:
# Preprocessing steps
def rename_datasets(
    df, columns: List[str] = ("reducer_datasets", "train_datasets", "test_datasets")
):
    def rename_row(row):
        for col in columns:
            names = set()
            for name in row[col].split(","):
                name = name.strip()
                names.add(name.split(".")[0])
            row[col] = ", ".join(sorted(names))
        return row

    df = df.apply(rename_row, axis=1)
    return df

def add_view_name(df, new_column_name: str = "view"):
    df[new_column_name] = df["config_id"].apply(lambda x: "_".join(x.split("_")[:-1]))
    return df

# def match_configs(df, new_column_name: str = "config_group"):
#     group_no = 0
#     for k, subdf in df.groupby(
#         [
#             "in_use_features",
#             "scale_on",
#             "reduce_on",
#             "transforms",
#             "scaler",
#             "reducer",
#             "umap components",
#             "reducer_datasets",
#             "train_datasets",
#             "test_datasets",
#             "estimator",
#         ]
#     ):
#         if len(subdf) == 2:
#             df.loc[subdf.index, new_column_name] = group_no
#             group_no += 1
#     return df

In [36]:
# First results file
results_file = Path("results2.csv")
results_1 = pd.read_csv(results_file).fillna("")
# We must filter realworld datasets from the results 1
results_1 = results_1[~(results_1["train_datasets"].str.contains("realworld"))]
results_1 = results_1[~(results_1["reducer_datasets"].str.contains("realworld"))]
# results_1["from"] = str(results_file)


# Second results file (the missing info)
results_file = Path("results_missing_v1.csv")
results_2 = pd.read_csv(results_file).fillna("")
# results_2["from"] = str(results_file)

# Finally, the final results
results = pd.concat([results_1, results_2])
results

Unnamed: 0,experiment_name,run_name,config_id,reduce_size,train_size,test_size,in_use_features,scale_on,reduce_on,transforms,scaler,reducer,umap components,reducer_datasets,train_datasets,test_datasets,estimator,accuracy (mean),accuracy (std),f1-score macro (mean),f1-score macro (std),f1-score weighted (mean),f1-score weighted (std)
0,reducer_comb_1,execution1,standartized_intra_balanced_03744,1794,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-4,4,"kuhar.standartized_intra_balanced[train], kuha...","wisdm.standartized_intra_balanced[train], wisd...",wisdm.standartized_intra_balanced[test],randomforest-100,0.590000,1.779513e-02,0.589906,1.835340e-02,0.589906,1.835340e-02
1,reducer_comb_1,execution1,standartized_intra_balanced_03744,1794,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-4,4,"kuhar.standartized_intra_balanced[train], kuha...","wisdm.standartized_intra_balanced[train], wisd...",wisdm.standartized_intra_balanced[test],KNN-5,0.508333,1.110223e-16,0.490085,0.000000e+00,0.490085,0.000000e+00
2,reducer_comb_1,execution1,standartized_intra_balanced_03744,1794,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-4,4,"kuhar.standartized_intra_balanced[train], kuha...","wisdm.standartized_intra_balanced[train], wisd...",wisdm.standartized_intra_balanced[test],SVM-rbf-C1.0,0.475000,5.551115e-17,0.443587,0.000000e+00,0.443587,0.000000e+00
6,reducer_comb_1,execution1,raw_balanced_02861,3978,3978,1062,"accel-x, accel-y, accel-z",train,axis,FFT-centered,,umap-20,20,"motionsense.raw_balanced[train], motionsense.r...","motionsense.raw_balanced[train], motionsense.r...",motionsense.raw_balanced[test],randomforest-100,0.873540,4.690173e-03,0.873424,4.649545e-03,0.873424,4.649545e-03
7,reducer_comb_1,execution1,raw_balanced_02861,3978,3978,1062,"accel-x, accel-y, accel-z",train,axis,FFT-centered,,umap-20,20,"motionsense.raw_balanced[train], motionsense.r...","motionsense.raw_balanced[train], motionsense.r...",motionsense.raw_balanced[test],KNN-5,0.843691,1.110223e-16,0.844248,0.000000e+00,0.844248,1.110223e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7600,reducer_comb_1_not_intra,execution1,standartized_balanced02167,10164,10164,2628,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,,umap-10,10,"realworld.standartized_balanced[train], realwo...","realworld.standartized_balanced[train], realwo...",realworld.standartized_balanced[test],KNN-5,0.517123,1.110223e-16,0.508283,0.000000e+00,0.508283,0.000000e+00
7601,reducer_comb_1_not_intra,execution1,standartized_balanced02167,10164,10164,2628,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,,umap-10,10,"realworld.standartized_balanced[train], realwo...","realworld.standartized_balanced[train], realwo...",realworld.standartized_balanced[test],SVM-rbf-C1.0,0.516743,1.110223e-16,0.501107,1.110223e-16,0.501107,1.110223e-16
7602,reducer_comb_1_not_intra,execution1,standartized_balanced02922,10164,10164,2628,"gyro-x, gyro-y, gyro-z",train,axis,FFT-centered,StandardScaler,umap-20,20,"realworld.standartized_balanced[train], realwo...","realworld.standartized_balanced[train], realwo...",realworld.standartized_balanced[test],randomforest-100,0.666438,1.140055e-02,0.666738,1.076361e-02,0.666738,1.076361e-02
7603,reducer_comb_1_not_intra,execution1,standartized_balanced02922,10164,10164,2628,"gyro-x, gyro-y, gyro-z",train,axis,FFT-centered,StandardScaler,umap-20,20,"realworld.standartized_balanced[train], realwo...","realworld.standartized_balanced[train], realwo...",realworld.standartized_balanced[test],KNN-5,0.556697,0.000000e+00,0.516618,1.110223e-16,0.516618,1.110223e-16


In [37]:
# Lets transform and add some useful information
results = rename_datasets(results)
results = add_view_name(results)
results.loc[results["view"] == "standartized", "view"] = "standartized_balanced"
results.loc[results["view"] == "standartized_intra", "view"] = "standartized_intra_balanced"
results.loc[results["view"] == "raw", "view"] = "raw_balanced"
# results = match_configs(results)
results

Unnamed: 0,experiment_name,run_name,config_id,reduce_size,train_size,test_size,in_use_features,scale_on,reduce_on,transforms,scaler,reducer,umap components,reducer_datasets,train_datasets,test_datasets,estimator,accuracy (mean),accuracy (std),f1-score macro (mean),f1-score macro (std),f1-score weighted (mean),f1-score weighted (std),view
0,reducer_comb_1,execution1,standartized_intra_balanced_03744,1794,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-4,4,kuhar,wisdm,wisdm,randomforest-100,0.590000,1.779513e-02,0.589906,1.835340e-02,0.589906,1.835340e-02,standartized_intra_balanced
1,reducer_comb_1,execution1,standartized_intra_balanced_03744,1794,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-4,4,kuhar,wisdm,wisdm,KNN-5,0.508333,1.110223e-16,0.490085,0.000000e+00,0.490085,0.000000e+00,standartized_intra_balanced
2,reducer_comb_1,execution1,standartized_intra_balanced_03744,1794,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-4,4,kuhar,wisdm,wisdm,SVM-rbf-C1.0,0.475000,5.551115e-17,0.443587,0.000000e+00,0.443587,0.000000e+00,standartized_intra_balanced
6,reducer_comb_1,execution1,raw_balanced_02861,3978,3978,1062,"accel-x, accel-y, accel-z",train,axis,FFT-centered,,umap-20,20,motionsense,motionsense,motionsense,randomforest-100,0.873540,4.690173e-03,0.873424,4.649545e-03,0.873424,4.649545e-03,raw_balanced
7,reducer_comb_1,execution1,raw_balanced_02861,3978,3978,1062,"accel-x, accel-y, accel-z",train,axis,FFT-centered,,umap-20,20,motionsense,motionsense,motionsense,KNN-5,0.843691,1.110223e-16,0.844248,0.000000e+00,0.844248,1.110223e-16,raw_balanced
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7600,reducer_comb_1_not_intra,execution1,standartized_balanced02167,10164,10164,2628,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,,umap-10,10,realworld,realworld,realworld,KNN-5,0.517123,1.110223e-16,0.508283,0.000000e+00,0.508283,0.000000e+00,standartized_balanced
7601,reducer_comb_1_not_intra,execution1,standartized_balanced02167,10164,10164,2628,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,,umap-10,10,realworld,realworld,realworld,SVM-rbf-C1.0,0.516743,1.110223e-16,0.501107,1.110223e-16,0.501107,1.110223e-16,standartized_balanced
7602,reducer_comb_1_not_intra,execution1,standartized_balanced02922,10164,10164,2628,"gyro-x, gyro-y, gyro-z",train,axis,FFT-centered,StandardScaler,umap-20,20,realworld,realworld,realworld,randomforest-100,0.666438,1.140055e-02,0.666738,1.076361e-02,0.666738,1.076361e-02,standartized_balanced
7603,reducer_comb_1_not_intra,execution1,standartized_balanced02922,10164,10164,2628,"gyro-x, gyro-y, gyro-z",train,axis,FFT-centered,StandardScaler,umap-20,20,realworld,realworld,realworld,KNN-5,0.556697,0.000000e+00,0.516618,1.110223e-16,0.516618,1.110223e-16,standartized_balanced


In [38]:
# Lets filter out some reduntant lines
results = results[
    ~(results["reducer_datasets"] == "")
    | (results["umap components"] == 0)
]

# As we only use umap, we can drop the reducer column
results = results.drop(columns=["reducer"])

# As we only use TIME and FFT domains, lets rename the values "" of column transforms to "TIME"
# and "FFT" to "FFT"
results.loc[results["transforms"] == "", "transforms"] = "time" 
results.loc[results["transforms"] == "FFT-centered", "transforms"] = "fft"

# If no scaler is used, lets change the value from "" to "no scaler"
results.loc[results["scaler"] == "", "scaler"] = "no scaler"

# As scaler is only used for the train dataset, we can drop the column scale_on
results = results.drop(columns=["scale_on"])

# Lets drop some meta-columns that we will not use
results = results.drop(columns=["experiment_name", "run_name", "config_id", "reduce_size", "train_size", "test_size"])
list(results.columns), results.shape

(['in_use_features',
  'reduce_on',
  'transforms',
  'scaler',
  'umap components',
  'reducer_datasets',
  'train_datasets',
  'test_datasets',
  'estimator',
  'accuracy (mean)',
  'accuracy (std)',
  'f1-score macro (mean)',
  'f1-score macro (std)',
  'f1-score weighted (mean)',
  'f1-score weighted (std)',
  'view'],
 (143856, 16))

In [39]:
# Grouping by this columns should result in only one row per group
# It is a unique tuple that describes each experiment
unique_exp_columns = [
    "in_use_features",
    "reduce_on",
    "transforms",
    "scaler",
    "umap components",
    "reducer_datasets",
    "train_datasets",
    "test_datasets",
    "estimator",
    "view",
]

# Remove duplicates
lines = []
for k, subdf in results.groupby(unique_exp_columns):
    lines.append(subdf.iloc[0])
results = pd.DataFrame(lines)

# Sanity check
results.groupby(unique_exp_columns).apply(lambda x: len(x) == 1).value_counts()

True    139158
dtype: int64

In [40]:
def best_of_each_estimator(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["estimator"]]

    best_of_each_dataset = []
    for key, subdf in df.groupby(experiment_uniq):
        best = subdf.sort_values(by="accuracy (mean)", ascending=False, inplace=False).iloc[0]
        best_of_each_dataset.append(best)
    return pd.DataFrame(best_of_each_dataset)

def best_of_each_umap(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["umap components"]]

    best_of_each_dataset = []
    for key, subdf in df.groupby(experiment_uniq):
        best = subdf.sort_values(by="accuracy (mean)", ascending=False, inplace=False).iloc[0]
        best_of_each_dataset.append(best)
    return pd.DataFrame(best_of_each_dataset)

def best_of_each_estimator_and_umap(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["estimator", "umap components"]]

    best_of_each_dataset = []
    for key, subdf in df.groupby(experiment_uniq):
        best = subdf.sort_values(by="accuracy (mean)", ascending=False, inplace=False).iloc[0]
        best_of_each_dataset.append(best)
    return pd.DataFrame(best_of_each_dataset)

In [41]:
def chart_bar_with_error_bars(df, improvement_col: str, groupby: str = "train_datasets"):
    xs, ys, error_y_min, error_y_max = [], [], [], []
    for key, subdf in df.groupby(groupby):
        sorted_improvement = sorted(subdf[improvement_col].values)
        # print(sorted_improvement)
        xs.append(key)
        ys.append(sorted_improvement[1])
        error_y_min.append(abs(sorted_improvement[0]-sorted_improvement[1]))
        error_y_max.append(abs(sorted_improvement[-1]-sorted_improvement[1]))

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=xs,
            y=ys,
            error_y=dict(
                type="data",
                symmetric=False,
                array=error_y_max,
                arrayminus=error_y_min
            )
        )
    )
    return fig

def chart_bar_with_side_by_side(df, improvement_col: str, groupby: str = "train_datasets", inner_group: str = "estimator"):
    fig = go.Figure()

    for i, (key, subdf) in enumerate(df.groupby(groupby)):
        xs, ys = [], []
        for inner_key, inner_subdf in subdf.groupby(inner_group):
            xs.append(str(inner_key))
            ys.append(inner_subdf[improvement_col].values[0])
        
        fig.add_trace(
            go.Bar(
                y=ys,
                x=xs,
                name=str(key),
                marker=dict(color=colors[i])
            )
        )
    return fig

## Pergunta 1

### a) Qual o impacto no desempenho dos modelos de ML quando o dado de entrada é transformado com o UMAP?

Compararemos, para cada tupla <dataset DS; domíno D; modelo de ML M>, qual é a diferença (razão) de desempenho do modelo M na tarefa de HAR com o dado do dataset DS no domínio D com e sem redução de dimensionalidade.

In [42]:
def get_umap_improvement_over_no_umap(df, new_column_name: str = "umap improvement"):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reducer_datasets", "umap components"]]

    zero_umap = {}
    for _, subdf in df[df["umap components"] == 0].iterrows():
        zero_umap[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df[new_column_name] = df.apply(lambda x: x["accuracy (mean)"] / zero_umap.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    return df

def filter_max_set_only(df, column: str = "umap improvement"):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reducer_datasets", "umap components"]]
    df = df.dropna(subset=[column], inplace=False)
    df = df[df["umap components"] > 0]
    lines = [subdf[subdf[column] == subdf[column].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    return df

In [43]:
df = results.copy()
df = filter_view_standardized_balanced(df)
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
df = get_umap_improvement_over_no_umap(df)
#df.loc[~df["umap improvement"].isna(), "umap components"].unique()
df = filter_max_set_only(df)

fig = chart_bar_with_error_bars(df, improvement_col="umap improvement", groupby="train_datasets")
fig.add_hline(y=1.0, line_dash="dash", line_color="magenta")
fig.update_layout(
    title="UMAP Improvement for each dataset (best UMAP over no UMAP, per dataset).<br>Error bars comes from different estimators",
    xaxis_title="Dataset",
    yaxis_title="Accuracy Improvement over no UMAP",
    width=800,
    height=600
)
fig.show()

fig = chart_bar_with_side_by_side(df, improvement_col="umap improvement", groupby="estimator", inner_group="train_datasets")
fig.add_hline(y=1.0, line_dash="dash", line_color="red")
fig.update_layout(
    title="UMAP Improvement for each dataset (best UMAP over no UMAP, per dataset)",
    xaxis_title="Dataset",
    yaxis_title="Accuracy Improvement over no UMAP",
    legend_title="Estimator",
    width=800,
    height=600
)
fig.show()

### b) Qual é o impacto da dimensionalidade alvo do UMAP no desempenho dos modelos de ML?

b .1) UMAP vs No-UMAP

In [44]:
# df = results.copy()
# df = filter_view_standardized_balanced(df)
# df = filter_domain_fft_only(df)
# df = filter_features_all(df)
# df = filter_reducer_over_all(df)
# df = filter_scaler_none(df)
# df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
# df = get_umap_improvement_over_no_umap(df)
# df = df[(df["umap components"] > 0) & (df["umap components"] <= 20)]

df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
df = get_umap_improvement_over_no_umap(df)
df = df[(df["umap components"] > 0) & (df["umap components"] <= 360)]
df


dset_markers = {
    name: markers[i]
    for i, name in enumerate(df["train_datasets"].unique())
}

estimator_colors = {
    name: colors[i]
    for i, name in enumerate(df["estimator"].unique())
}

fig = go.Figure()
for (estimator, dataset), subdf in df.groupby(["estimator", "train_datasets"]):
    subdf = subdf.sort_values(by="umap components")
    xs, ys = [], []
    for n_components in sorted(subdf["umap components"].unique()):
        xs.append(n_components)
        ys.append(subdf[subdf["umap components"] <= n_components]["umap improvement"].max())
    fig.add_trace(
        go.Scatter( 
            x=xs,
            y=ys,
            name=f"{estimator} {dataset}",
            mode="lines+markers",
            legendgroup=estimator,
            # legendgrouptitle="Estimator",
            marker=dict(
                color=estimator_colors[estimator],
                symbol=dset_markers[dataset],
                size=8,
            ),
        )
    )
    # subdf["best umap improvement so far"] = df["umap improvement"].expanding().max()

fig.update_layout(
    title="UMAP Improvement for each dataset (best UMAP over no UMAP, per dataset)",
    xaxis_title="Maximum UMAP Components",
    yaxis_title="Accuracy improvement over no UMAP",
    legend_title="Estimator",
    width=1200,
    height=600,
    # xaxis=dict(tickmode="linear")
)
fig.show()

b.2) UMAP vs best UMAP

In [45]:
def get_umap_improvement_over_best_umap(df, new_column_name: str = "umap improvement"):
    df = df[df["umap components"] != 0]
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reducer_datasets", "umap components"]]

    best_umap = {}
    for key, subdf in df.groupby(experiment_uniq):
        best_umap[key] = subdf["accuracy (mean)"].max()

    for key, subdf in df.groupby(experiment_uniq):
        df.loc[subdf.index, new_column_name] = subdf["accuracy (mean)"] / best_umap[key]

    return df

In [46]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
df = get_umap_improvement_over_best_umap(df)
df = df[(df["umap components"] > 0) & (df["umap components"] <= 360)]

dset_markers = {
    name: markers[i]
    for i, name in enumerate(df["train_datasets"].unique())
}

estimator_colors = {
    name: colors[i]
    for i, name in enumerate(df["estimator"].unique())
}

fig = go.Figure()
for (estimator, dataset), subdf in df.groupby(["estimator", "train_datasets"]):
    subdf = subdf.sort_values(by="umap components")
    xs, ys = [], []
    for n_components in sorted(subdf["umap components"].unique()):
        xs.append(n_components)
        ys.append(subdf[subdf["umap components"] <= n_components]["umap improvement"].max())
    fig.add_trace(
        go.Scatter( 
            x=xs,
            y=ys,
            name=f"{estimator} {dataset}",
            mode="lines+markers",
            legendgroup=estimator,
            # legendgrouptitle="Estimator",
            marker=dict(
                color=estimator_colors[estimator],
                symbol=dset_markers[dataset],
                size=7,
            ),
        )
    )
    # subdf["best umap improvement so far"] = df["umap improvement"].expanding().max()

fig.update_layout(
    title="UMAP Improvement for each dataset (UMAP over best UMAP, per dataset)",
    xaxis_title="Maximum UMAP Components",
    yaxis_title="Accuracy improvement best UMAP",
    legend_title="Estimator",
    width=1200,
    height=600,
    # xaxis=dict(tickmode="linear")
)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Pergunta 1.c

In [47]:
experiment_uniq = [c for c in unique_exp_columns if c not in ["train_datasets", "view"]]

df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)

raw_df = filter_view_raw(df.copy())
standartized_df = filter_view_standardized_balanced(df.copy())

# for view in ["raw_balanced", "standardized_balanced"]:

fig = go.Figure()
xs = datasets
no_umap_ys = []
umap_ys = []
for dset in datasets:
    # Filter raw dsets only
    best_acc = raw_df.loc[(raw_df["train_datasets"] == dset) & (raw_df["umap components"] == 0), "accuracy (mean)"]
    no_umap_ys.append(best_acc.sort_values().iloc[-1])
    
    best_acc = raw_df.loc[(raw_df["train_datasets"] == dset) & (raw_df["umap components"] > 0), "accuracy (mean)"]
    umap_ys.append(best_acc.sort_values().iloc[-1])
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=no_umap_ys,
        name="Raw (no umap)",
        marker=dict(color=px.colors.sequential.Plotly3[0]),
        legendgroup="Raw (no umap)",
    )
)
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=umap_ys,
        name="Raw (umap)",
        marker=dict(color=px.colors.sequential.Plotly3[3]),
        legendgroup="Raw (best umap)",
    )
)

no_umap_ys = []
umap_ys = []
for dset in datasets:
    # Filter raw dsets only
    best_acc = standartized_df.loc[(standartized_df["train_datasets"] == dset) & (standartized_df["umap components"] == 0), "accuracy (mean)"]
    no_umap_ys.append(best_acc.sort_values().iloc[-1])
    
    best_acc = standartized_df.loc[(standartized_df["train_datasets"] == dset) & (standartized_df["umap components"] > 0), "accuracy (mean)"]
    umap_ys.append(best_acc.sort_values().iloc[-1])
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=no_umap_ys,
        name="Standardized (no umap)",
        marker=dict(color=px.colors.sequential.Plasma_r[1]),
        legendgroup="Standardized (no umap)",
    )
)
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=umap_ys,
        name="Standardized (umap)",
        marker=dict(color=px.colors.sequential.Plasma_r[2]),
        legendgroup="Standardized (best umap)",
    )
)

fig.update_layout(
    title="Impact of UMAP on raw and standardized datasets (FFT)",
    xaxis_title="Dataset",
    yaxis_title="Accuracy (best classifier)",
)

fig.show()

In [48]:
experiment_uniq = [c for c in unique_exp_columns if c not in ["train_datasets", "view"]]

df = results.copy()
df = filter_domain_time_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)

raw_df = filter_view_raw(df.copy())
standartized_df = filter_view_standardized_balanced(df.copy())

# for view in ["raw_balanced", "standardized_balanced"]:

fig = go.Figure()
xs = datasets
no_umap_ys = []
umap_ys = []
for dset in datasets:
    # Filter raw dsets only
    best_acc = raw_df.loc[(raw_df["train_datasets"] == dset) & (raw_df["umap components"] == 0), "accuracy (mean)"]
    no_umap_ys.append(best_acc.sort_values().iloc[-1])
    
    best_acc = raw_df.loc[(raw_df["train_datasets"] == dset) & (raw_df["umap components"] > 0), "accuracy (mean)"]
    umap_ys.append(best_acc.sort_values().iloc[-1])
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=no_umap_ys,
        name="Raw (no umap)",
        marker=dict(color=px.colors.sequential.Plotly3[0]),
        legendgroup="Raw (no umap)",
    )
)
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=umap_ys,
        name="Raw (umap)",
        marker=dict(color=px.colors.sequential.Plotly3[3]),
        legendgroup="Raw (best umap)",
    )
)

no_umap_ys = []
umap_ys = []
for dset in datasets:
    # Filter raw dsets only
    best_acc = standartized_df.loc[(standartized_df["train_datasets"] == dset) & (standartized_df["umap components"] == 0), "accuracy (mean)"]
    no_umap_ys.append(best_acc.sort_values().iloc[-1])
    
    best_acc = standartized_df.loc[(standartized_df["train_datasets"] == dset) & (standartized_df["umap components"] > 0), "accuracy (mean)"]
    umap_ys.append(best_acc.sort_values().iloc[-1])
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=no_umap_ys,
        name="Standardized (no umap)",
        marker=dict(color=px.colors.sequential.Plasma_r[1]),
        legendgroup="Standardized (no umap)",
    )
)
    
fig.add_trace(
    go.Bar(
        x=xs,
        y=umap_ys,
        name="Standardized (umap)",
        marker=dict(color=px.colors.sequential.Plasma_r[2]),
        legendgroup="Standardized (best umap)",
    )
)

fig.update_layout(
    title="Impact of UMAP on raw and standardized datasets (Time)",
    xaxis_title="Dataset",
    yaxis_title="Accuracy (best classifier)",
)

fig.show()

## Pergunta 2

Como a escolha do dataset no treinamento do UMAP afeta os resultados da tarefa?

- Normalizar pelo melhor do dataset de teste
- Marcador por dataset
- Comparar MAE

In [49]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_inter_balanced(df)
# df = pd.concat([filter_only_reducer_dataset_diferent_from_train(df), filter_only_reducer_dataset_equals_train_dataset(df)])
# df = filter_only_reducer_dataset_diferent_from_train(df)
df = filter_same_train_test(df)
df = df[(df["umap components"] > 0) & (df["umap components"] <= 10)]
df = best_of_each_estimator_and_umap(df)

fig = go.Figure()

for i, dset in enumerate(datasets):
    x = df[df["train_datasets"] == dset].sort_values(by="accuracy (mean)")
    xs = list(range(len(x)))
    ys = []
    the_markers = []
    names = []
    for row_idx, row in x.iterrows():
        ys.append(row["accuracy (mean)"])
        the_markers.append("circle" if dset in row["reducer_datasets"] else "x")
        name = f'{row["reducer_datasets"]} ({row["umap components"]} components)'
        names.append(name)
        
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            marker=dict(
                symbol=the_markers,
                size=10,
                color=colors[i]
            ),
            hovertext=names,
            name=dset,
        )
    )
    
fig.update_layout(
    title="Using different datasets to train umap (train/test = same)<br>Circles = same dataset, X = different dataset",
    xaxis_title="Number of experiments",
    yaxis_title="Accuracy (best classifier and umap <= 10 components)",
    legend_title="Train/Test dataset",
)

fig.show()

In [50]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_inter_balanced(df)
# df = pd.concat([filter_only_reducer_dataset_diferent_from_train(df), filter_only_reducer_dataset_equals_train_dataset(df)])
# df = filter_only_reducer_dataset_diferent_from_train(df)
df = filter_same_train_test(df)
df = df[(df["umap components"] > 0) & (df["umap components"] <= 10)]
df = best_of_each_estimator_and_umap(df)

matrix = []
reducer_datasets = df["reducer_datasets"].unique()

the_datasets = datasets.copy()
the_datasets.remove("realworld")

for i, dset in enumerate(the_datasets):
    lines = [0] * len(reducer_datasets)
    for j, r_dset in enumerate(reducer_datasets):
        acc = df.loc[(df["reducer_datasets"] == r_dset) & (df["train_datasets"] == dset), "accuracy (mean)"].sort_values()
        if len(acc) == 0:
            continue
        else:
            lines[j] = acc.iloc[-1]
    matrix.append(lines)
    
matrix = np.array(matrix)

fig = px.imshow(matrix, text_auto=True, x=reducer_datasets, y=the_datasets)
fig.update_layout(
    title="Heatmap between datasets to train UMAP and train classifier<br>The color indicates the accuracy using the best classifier and UMAP <= 10",
    xaxis_title="Dataset(s) used to train UMAP",
    yaxis_title="Datasets used to train classifier"
)
fig.show()


# fig.show()

In [51]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_inter_balanced(df)
# df = pd.concat([filter_only_reducer_dataset_diferent_from_train(df), filter_only_reducer_dataset_equals_train_dataset(df)])
# df = filter_only_reducer_dataset_diferent_from_train(df)
df = filter_same_train_test(df)
df = df[(df["umap components"] > 0)]
df = best_of_each_estimator_and_umap(df)

fig = go.Figure()

for i, dset in enumerate(datasets):
    x = df[df["train_datasets"] == dset].sort_values(by="accuracy (mean)")
    xs = list(range(len(x)))
    ys = []
    the_markers = []
    names = []
    for row_idx, row in x.iterrows():
        ys.append(row["accuracy (mean)"])
        the_markers.append("circle" if dset in row["reducer_datasets"] else "x")
        name = f'{row["reducer_datasets"]} ({row["umap components"]} components)'
        names.append(name)
        
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            marker=dict(
                symbol=the_markers,
                size=10,
                color=colors[i]
            ),
            hovertext=names,
            name=dset,
        )
    )
    
fig.update_layout(
    title="Using different datasets to train umap (train/test = same)<br>Circles = same dataset, X = different dataset",
    xaxis_title="Number of experiments",
    yaxis_title="Accuracy (best classifier)",
    legend_title="Train/Test dataset",
)

fig.show()

In [64]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_inter_balanced(df)
# df = pd.concat([filter_only_reducer_dataset_diferent_from_train(df), filter_only_reducer_dataset_equals_train_dataset(df)])
# df = filter_only_reducer_dataset_diferent_from_train(df)
df = filter_same_train_test(df)
df = df[(df["umap components"] > 0)]
df = best_of_each_estimator_and_umap(df)

matrix = []
reducer_datasets = df["reducer_datasets"].unique().tolist()

the_datasets = datasets.copy()
the_datasets.remove("realworld")
reducer_datasets.remove("realworld")

for i, dset in enumerate(the_datasets):
    lines = [0] * len(reducer_datasets)
    for j, r_dset in enumerate(reducer_datasets):
        acc = df.loc[(df["reducer_datasets"] == r_dset) & (df["train_datasets"] == dset), "accuracy (mean)"].sort_values()
        if len(acc) == 0:
            continue
        else:
            lines[j] = acc.iloc[-1]
    matrix.append(lines)
    
matrix = np.array(matrix)

fig = px.imshow(matrix, text_auto=True, x=reducer_datasets, y=the_datasets)
fig.update_layout(
    title="Heatmap between datasets to train UMAP and train classifier<br>The color indicates the accuracy using the best classifier",
    xaxis_title="Dataset(s) used to train UMAP",
    yaxis_title="Datasets used to train classifier"
)
fig.show()


# fig.show()

## Pergunta 3

## Pergunta 4

## Pergunta 5

É mais vantajoso reduzir a dimensionalidade dos dados dos sensores de forma individual (aplicando o UMAP a um sensor, ou eixo, por vez) ou de forma agregada (como estamos fazendo)?

In [53]:
def get_reduce_on_improvement(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reduce_on", "umap components"]]

    all_values = {}
    for _, subdf in df[df["reduce_on"] == "all"].iterrows():
        all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    df = df.dropna(subset=["umap improvement"], inplace=False)
    df = df[df["reduce_on"] != "all"]
    lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    return df

# results = add_umap_improvement(results)
#df = only_standardized_view(no_scaler(using_all_features(only_fft(only_reduce_on_all(only_reducer_equals_train(df))))))

In [54]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
# df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
df = best_of_each_estimator(df)
df = filter_only_reducer_dataset_equals_train_dataset(df)

fig = make_subplots(rows=1, cols=3, shared_yaxes=False, column_titles=["Umap using all features", "Per-sensor UMAP", "Per-sensor-axis umap"])

xs = [3, 5, 6, 7, 8, 9, 10]
all_df = filter_reducer_over_all(df.copy())
sensor_df = filter_reducer_over_sensor(df.copy())
axis_df = filter_reducer_over_axis(df.copy())

vals = [
    (all_df,    [18, 30, 36, 42, 48, 54, 60 ]),
    (sensor_df, [9, 15, 18, 21, 24, 27, 30]),
    (axis_df,   [3, 5, 6, 7, 8, 9, 10])
]

for col_no, (df, the_range) in enumerate(vals):
    for d, dset in enumerate(datasets):
        ys = []
        for i in the_range:
            best_acc = all_df.loc[(all_df["umap components"] == i) & (all_df["train_datasets"] == dset), "accuracy (mean)"].sort_values().iloc[-1]
            ys.append(best_acc)
            
        fig.add_trace(
            go.Scatter(
                x=xs,
                y=ys,
                name=dset,
                hovertext=[f"{r} components" for r in the_range],
                legendgroup=dset,
                marker=dict(
                    color=colors[d]
                )
            ),
            row=1,
            col=col_no+1
        )
        
fig["layout"]["xaxis1"].update(title=f"UMAP components (x6)", tickmode="linear")
fig["layout"]["xaxis2"].update(title=f"UMAP components (x3)", tickmode="linear")
fig["layout"]["xaxis3"].update(title=f"UMAP components (x1)", tickmode="linear")
fig["layout"]["yaxis1"].update(title="Accuracy (best classifier)", range=[0.6, 1])
fig["layout"]["yaxis2"].update(title="Accuracy (best classifier)", range=[0.6, 1])
fig["layout"]["yaxis3"].update(title="Accuracy (best classifier)", range=[0.6, 1])
fig.update_layout(
    title="UMAP applied to train per sensor and per axis and using all features",
    legend_title="Dataset",
    # xaxis=dict(tickmode="linear"),
)


fig.show()


## Pergunta 6

Há uma mudança significativa entre os diferentes classificadores?

In [55]:
# def get_estimator_improvement(df):
#     experiment_uniq = [c for c in unique_exp_columns if c not in ["estimator"]]
#     df = df[df["umap components"] > 0]

#     all_values = {}
#     for _, subdf in df[df["reduce_on"] == "all"].iterrows():
#         all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

#     df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
#     df = df.dropna(subset=["umap improvement"], inplace=False)
#     df = df[df["reduce_on"] != "all"]
#     lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
#     df = pd.DataFrame(lines)
#     return df

# # results = add_umap_improvement(results)
# #df = only_standardized_view(no_scaler(using_all_features(only_fft(only_reduce_on_all(only_reducer_equals_train(df))))))

## Pergunta 7

Melhor no domínio da frequência ou tempo (condicionada à abordagem - manifold learning com UMAP)? (ou wavelet?)

- Usar barras de sobreposição
- Sem UMAP (tempo e frequencia)
- Best(<20) vs UMAP 10

In [56]:
df = results.copy()
# df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)

fft_df = filter_domain_fft_only(df.copy())
fft_df = best_of_each_umap(fft_df)

time_df = filter_domain_time_only(df.copy())
time_df = best_of_each_umap(time_df)

estimators = ["KNN-5", "SVM-rbf-C1.0", "randomforest-100"]

fig = go.Figure()
xs = datasets

ys_fft = []
ys_time = []
for dset in datasets:
    best_acc = fft_df.loc[(fft_df["train_datasets"] == dset) & (fft_df["estimator"] == estimator), "accuracy (mean)"].sort_values().iloc[-1]
    ys_fft.append(best_acc)
    best_acc = time_df.loc[(time_df["train_datasets"] == dset) & (time_df["estimator"] == estimator), "accuracy (mean)"].sort_values().iloc[-1]
    ys_time.append(best_acc)
fig.add_trace(
    go.Bar(
        x=xs,
        y=ys_fft,
        name=f"FFT",
        marker=dict(color=colors[0])
    )
)
fig.add_trace(
    go.Bar(
        x=xs,
        y=ys_time,
        name=f"TIME",
        width=[0.5, 0.5, 0.5, 0.5, 0.5],
        marker=dict(color=colors[1])
    )
)
    
    

# df = get_umap_improvement_over_no_umap(df)



# fig = chart_bar_with_side_by_side(df, improvement_col="accuracy (mean)", groupby="estimator", inner_group="train_datasets")
# # fig.add_hline(y=1.0, line_dash="dash", line_color="magenta")
# fig.update_layout(
#     title="Accuracy using FFT",
#     xaxis_title="Dataset",
#     yaxis_title="Accuracy (mean)",
#     legend_title="Estimator",
#     width=800,
#     height=600
# )
fig.update_layout(
    title="Accuracy using different non-parametric transforms",
    xaxis_title="Dataset",
    yaxis_title="Accuracy (best classifier/umap)",
    barmode="overlay"
)

fig.show()

In [57]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
# df = get_umap_improvement_over_no_umap(df)
df = best_of_each_umap(df)

fig = chart_bar_with_side_by_side(df, improvement_col="accuracy (mean)", groupby="estimator", inner_group="train_datasets")
# fig.add_hline(y=1.0, line_dash="dash", line_color="magenta")
fig.update_layout(
    title="Accuracy using FFT",
    xaxis_title="Dataset",
    yaxis_title="Accuracy (best umap)",
    legend_title="Estimator",
    width=800,
    height=600
)
fig.show()

In [58]:
df = results.copy()
df = filter_domain_time_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
# df = get_umap_improvement_over_no_umap(df)
df = best_of_each_umap(df)

fig = chart_bar_with_side_by_side(df, improvement_col="accuracy (mean)", groupby="estimator", inner_group="train_datasets")
# fig.add_hline(y=1.0, line_dash="dash", line_color="magenta")
fig.update_layout(
    title="Accuracy using TIME",
    xaxis_title="Dataset",
    yaxis_title="Accuracy (best umap)",
    legend_title="Estimator",
    width=800,
    height=600
)
fig.show()

## Pergunta 8

In [83]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_inter_balanced(df)
# df = pd.concat([filter_only_reducer_dataset_diferent_from_train(df), filter_only_reducer_dataset_equals_train_dataset(df)])
# df = filter_only_reducer_dataset_diferent_from_train(df)
df = pd.concat([filter_different_train_test(df), filter_only_reducer_dataset_equals_train_dataset(filter_same_train_test(df))])
df = df[(df["umap components"] > 0) ]
df = best_of_each_estimator_and_umap(df)

fig = go.Figure()

for i, dset in enumerate(datasets):
    x = df[df["train_datasets"] == dset].sort_values(by="accuracy (mean)")
    xs = list(range(len(x)))
    ys = []
    the_markers = []
    names = []
    for row_idx, row in x.iterrows():
        ys.append(row["accuracy (mean)"])
        the_markers.append("circle" if dset in row["test_datasets"] else "x")
        name = f'{row["test_datasets"]} ({row["umap components"]} components)'
        names.append(name)
        
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            marker=dict(
                symbol=the_markers,
                size=10,
                color=colors[i]
            ),
            hovertext=names,
            name=dset,
        )
    )
    
fig.update_layout(
    title="Using different datasets to train and test (UMAP applyied in dataset same as training)<br>Circles: same dataset to test, X: different dataset to test",
    xaxis_title="Number of experiments",
    yaxis_title="Accuracy (best classifier and best UMAP)",
    legend_title="Train/Test dataset",
)

fig.show()

In [89]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_inter_balanced(df)
# df = pd.concat([filter_only_reducer_dataset_diferent_from_train(df), filter_only_reducer_dataset_equals_train_dataset(df)])
# df = filter_only_reducer_dataset_diferent_from_train(df)
df = pd.concat([filter_different_train_test(df), filter_only_reducer_dataset_equals_train_dataset(filter_same_train_test(df))])
df = df[(df["umap components"] > 0) ]
df = best_of_each_estimator_and_umap(df)

matrix = []
the_datasets = sorted(datasets.copy())
test_datasets = sorted(df["test_datasets"].unique().tolist())

for i, dset in enumerate(the_datasets):
    lines = [0] * len(test_datasets)
    for j, t_dset in enumerate(test_datasets):
        acc = df.loc[(df["test_datasets"] == t_dset) & (df["train_datasets"] == dset), "accuracy (mean)"].sort_values()
        if len(acc) == 0:
            print(f"Missing {dset} {t_dset}")
        else:
            lines[j] = acc.iloc[-1]
    matrix.append(lines)
    
matrix = np.array(matrix)

fig = px.imshow(matrix, text_auto=True, x=test_datasets, y=the_datasets)
fig.update_layout(
    title="Heatmap between datasets to train UMAP and train classifier<br>The color indicates the accuracy using the best classifier",
    xaxis_title="Dataset(s) used to train",
    yaxis_title="Datasets used to test"
)
fig.show()


# fig.show()

## Pergunta 9

Normalização traz mais benefícios?

In [59]:
def get_scaler_improvement(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["scaler"]]

    all_values = {}
    for _, subdf in df[df["scaler"] == "no scaler"].iterrows():
        all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    df = df.dropna(subset=["umap improvement"], inplace=False)
    df = df[df["scaler"] != "no scaler"]
    
    lines = []
    for _, subdf in df.groupby(["train_datasets", "estimator", "scaler"]):
        lines.append(subdf.sort_values(by="accuracy (mean)").iloc[-1])
    
    # lines = [df[df["umap improvement"] == df["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]1
    df = pd.DataFrame(lines)
    return df

In [60]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
# df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
# df = best_of_each_umap(df)
df = filter_only_reducer_dataset_equals_train_dataset(df)
df = get_scaler_improvement(df)

fig = chart_bar_with_side_by_side(df, improvement_col="umap improvement", groupby="estimator", inner_group=["train_datasets", "scaler"])
fig.add_hline(y=1.0, line_dash="dash", line_color="red")
fig.update_layout(
    title="Improvement using different scalers",
    xaxis_title="dataset / scaler",
    yaxis_title="Improvement over using no scaler",
    legend_title="Estimator",
    width=1200,
    height=600
)
fig.show()


## Pergunta 10

Quais os sensores que ajudam mais: acelerômetro, giroscópio, ambos?


In [61]:
def get_in_use_features_improvement(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["in_use_features"]]

    all_values = {}
    for _, subdf in df[df["in_use_features"] == "accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z"].iterrows():
        all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    df = df.dropna(subset=["umap improvement"], inplace=False)
    df = df[df["in_use_features"] != "accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z"]
    
    lines = []
    for _, subdf in df.groupby(["train_datasets", "estimator", "in_use_features"]):
        lines.append(subdf.sort_values(by="accuracy (mean)").iloc[-1])
    
    # lines = [df[df["umap improvement"] == df["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]1
    df = pd.DataFrame(lines)
    return df

In [62]:
df = results.copy()
df = filter_domain_fft_only(df)
# df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_balanced(df)
# df = best_of_each_umap(df)
df = filter_only_reducer_dataset_equals_train_dataset(df)
df = get_in_use_features_improvement(df)
df["in_use_features"].value_counts()

fig = chart_bar_with_side_by_side(df, improvement_col="umap improvement", groupby="estimator", inner_group=["train_datasets", "in_use_features"])
fig.add_hline(y=1.0, line_dash="dash", line_color="red")
fig.update_layout(
    title="Improvement using different features",
    xaxis_title="dataset /  features",
    yaxis_title="Improvement over using all features",
    legend_title="Estimator",
    width=1200,
    height=600
)
fig.show()