In [1]:
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import plotly.express as px

from IPython.display import display, HTML
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [78]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

# Let's define some constants

datasets = [
    "kuhar",
    "motionsense",
    "uci",
    "wisdm",
    "realworld"
]


labels_activity = {
    0: "sit",
    1: "stand",
    2: "walk",
    3: "stair up",
    4: "stair down",
    5: "run",
    6: "stair up and down",
}

colors_map = {
    "plotly-blue": px.colors.qualitative.Plotly[0],
    "plotly-red": px.colors.qualitative.Plotly[1],
    "plotly-green": px.colors.qualitative.Plotly[2],
    "plotly-purple": px.colors.qualitative.Plotly[3],
    "plotly-orange": px.colors.qualitative.Plotly[4],
    "plotly-cyan": px.colors.qualitative.Plotly[5],
    "plotly-pink": px.colors.qualitative.Plotly[6],
    "plotly-lightgreen": px.colors.qualitative.Plotly[7],
    "plotly-lightpink": px.colors.qualitative.Plotly[8],
    "plotly-yellow": px.colors.qualitative.Plotly[9],
}

colors = list(colors_map.values())
markers = ["circle", "square", "diamond", "x", "triangle-up", "triangle-down", "pentagon"]

In [3]:
# Let's define some filters related to each of the columns

# --- Related to domain ---
def filter_domain_fft_only(df):
    return df[df["transforms"] == "fft"]

def filter_domain_time_only(df):
    return df.loc[df["transforms"] == "time"]


# --- Related to features ---
def filter_features_all(df):
    return df[
        df["in_use_features"].str.contains("accel-x")
        & df["in_use_features"].str.contains("accel-y")
        & df["in_use_features"].str.contains("accel-z")
        & df["in_use_features"].str.contains("gyro-x")
        & df["in_use_features"].str.contains("gyro-y")
        & df["in_use_features"].str.contains("gyro-z")
    ]

def filter_features_accelerometer_only(df):
    return df[
        df["in_use_features"].str.contains("accel-x")
        & df["in_use_features"].str.contains("accel-y")
        & df["in_use_features"].str.contains("accel-z")
        & ~df["in_use_features"].str.contains("gyro-x")
        & ~df["in_use_features"].str.contains("gyro-y")
        & ~df["in_use_features"].str.contains("gyro-z")
    ]

def filter_features_gyroscope_only(df):
    return df[
        ~df["in_use_features"].str.contains("accel-x")
        & ~df["in_use_features"].str.contains("accel-y")
        & ~df["in_use_features"].str.contains("accel-z")
        & df["in_use_features"].str.contains("gyro-x")
        & df["in_use_features"].str.contains("gyro-y")
        & df["in_use_features"].str.contains("gyro-z")
    ]

# --- Related to how reducer is executed ---
def filter_reducer_over_all(df):
    return df[df["reduce_on"] == "all"]

def filter_reducer_over_sensor(df):
    return df[df["reduce_on"] == "sensor"]

def filter_reducer_over_axis(df):
    return df[df["reduce_on"] == "axis"]

# --- Related to the estimator ---
def filter_estimator_rf(df):
    return df[df["estimator"].str.lower().str.contains("randomforest")]

def filter_estimator_svm(df):
    return df[df["estimator"].str.lower().str.contains("svm")]

def filter_estimator_knn(df):
    return df[df["estimator"].str.lower().str.contains("knn")]

# --- Related to the scaler ---
def filter_scaler_none(df):
    return df[df["scaler"] == "no scaler"]

def filter_scaler_minmax(df):
    return df[df["scaler"].str.lower().str.contains("minmaxscaler")]

def filter_scaler_std(df):
   return df[df["scaler"].str.lower().str.contains("standardscaler")]

# --- Related to the view ---
def filter_view_standardized_intra(df):
    return df[df["view"] == "standartized_intra_balanced"]

def filter_view_raw(df):
    return df[df["view"] == "raw_balanced"]    

# --- Some other useful filters ---
def filter_only_reducer_dataset_equals_train_dataset(df):
    return df[df["reducer_datasets"] == df["train_datasets"]]

def filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df):
    return df[(df["reducer_datasets"] == df["train_datasets"]) | (df["reducer_datasets"] == "")]

In [4]:
# Preprocessing steps
def rename_datasets(
    df, columns: List[str] = ("reducer_datasets", "train_datasets", "test_datasets")
):
    def rename_row(row):
        for col in columns:
            names = set()
            for name in row[col].split(","):
                name = name.strip()
                names.add(name.split(".")[0])
            row[col] = ", ".join(sorted(names))
        return row

    df = df.apply(rename_row, axis=1)
    return df

def add_view_name(df, new_column_name: str = "view"):
    df[new_column_name] = df["config_id"].apply(lambda x: "_".join(x.split("_")[:-1]))
    return df

def match_configs(df, new_column_name: str = "config_group"):
    group_no = 0
    for k, subdf in df.groupby(
        [
            "in_use_features",
            "scale_on",
            "reduce_on",
            "transforms",
            "scaler",
            "reducer",
            "umap components",
            "reducer_datasets",
            "train_datasets",
            "test_datasets",
            "estimator",
        ]
    ):
        if len(subdf) == 2:
            df.loc[subdf.index, new_column_name] = group_no
            group_no += 1
    return df

In [5]:
results_file = Path("results.csv")
results = pd.read_csv(results_file).fillna("")
# Lets transform and add some useful information
results = rename_datasets(results)
results = add_view_name(results)
results = match_configs(results)
results

Unnamed: 0,experiment_name,run_name,config_id,reduce_size,train_size,test_size,in_use_features,scale_on,reduce_on,transforms,scaler,reducer,umap components,reducer_datasets,train_datasets,test_datasets,estimator,accuracy (mean),accuracy (std),f1-score macro (mean),f1-score macro (std),f1-score weighted (mean),f1-score weighted (std),view,config_group
0,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,wisdm,wisdm,randomforest-100,0.670833,2.770128e-02,0.667083,3.039176e-02,0.667083,3.039176e-02,standartized_intra_balanced,
1,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,wisdm,wisdm,KNN-5,0.541667,1.110223e-16,0.540794,0.000000e+00,0.540794,0.000000e+00,standartized_intra_balanced,
2,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,wisdm,wisdm,SVM-rbf-C1.0,0.583333,1.110223e-16,0.521068,0.000000e+00,0.521068,0.000000e+00,standartized_intra_balanced,
3,reducer_comb_1,execution1,standartized_intra_balanced_00845,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,MinMaxScaler,umap-2,2,,kuhar,kuhar,randomforest-100,0.800694,1.812221e-02,0.795044,1.921549e-02,0.795044,1.921549e-02,standartized_intra_balanced,
4,reducer_comb_1,execution1,standartized_intra_balanced_00845,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,MinMaxScaler,umap-2,2,,kuhar,kuhar,KNN-5,0.493056,5.551115e-17,0.482479,0.000000e+00,0.482479,5.551115e-17,standartized_intra_balanced,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63172,reducer_comb_1,execution1,raw_balanced_02926,3978,3978,1062,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,FFT-centered,MinMaxScaler,umap-20,20,motionsense,motionsense,motionsense,KNN-5,0.883239,1.110223e-16,0.883064,1.110223e-16,0.883064,0.000000e+00,raw_balanced,3678.0
63173,reducer_comb_1,execution1,raw_balanced_02926,3978,3978,1062,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,FFT-centered,MinMaxScaler,umap-20,20,motionsense,motionsense,motionsense,SVM-rbf-C1.0,0.870998,1.110223e-16,0.871440,1.110223e-16,0.871440,1.110223e-16,raw_balanced,3679.0
63174,reducer_comb_1,execution1,standartized_intra_balanced_05531,0,1794,144,"accel-x, accel-y, accel-z",train,sensor,,StandardScaler,umap-5,5,,motionsense,motionsense,randomforest-100,0.798611,1.889093e-02,0.796915,1.948198e-02,0.796915,1.948198e-02,standartized_intra_balanced,
63175,reducer_comb_1,execution1,standartized_intra_balanced_05531,0,1794,144,"accel-x, accel-y, accel-z",train,sensor,,StandardScaler,umap-5,5,,motionsense,motionsense,KNN-5,0.562500,0.000000e+00,0.546329,0.000000e+00,0.546329,0.000000e+00,standartized_intra_balanced,


In [6]:
# Lets filter out some reduntant lines
results = results[
    ~(results["reducer_datasets"] == "")
    | (results["umap components"] == 0)
]

# As we only use umap, we can drop the reducer column
results = results.drop(columns=["reducer"])

# As we only use TIME and FFT domains, lets rename the values "" of column transforms to "TIME"
# and "FFT" to "FFT"
results.loc[results["transforms"] == "", "transforms"] = "time" 
results.loc[results["transforms"] == "FFT-centered", "transforms"] = "fft"

# If no scaler is used, lets change the value from "" to "no scaler"
results.loc[results["scaler"] == "", "scaler"] = "no scaler"

# As scaler is only used for the train dataset, we can drop the column scale_on
results = results.drop(columns=["scale_on"])

# Lets drop some meta-columns that we will not use
results = results.drop(columns=["experiment_name", "run_name", "config_id", "reduce_size", "train_size", "test_size"])
list(results.columns)

['in_use_features',
 'reduce_on',
 'transforms',
 'scaler',
 'umap components',
 'reducer_datasets',
 'train_datasets',
 'test_datasets',
 'estimator',
 'accuracy (mean)',
 'accuracy (std)',
 'f1-score macro (mean)',
 'f1-score macro (std)',
 'f1-score weighted (mean)',
 'f1-score weighted (std)',
 'view',
 'config_group']

In [7]:
# Grouping by this columns should result in only one row per group
# It is a unique tuple that describes each experiment
unique_exp_columns = [
    "in_use_features",
    "reduce_on",
    "transforms",
    "scaler",
    "umap components",
    "reducer_datasets",
    "train_datasets",
    "test_datasets",
    "estimator",
    "view",
]

# Sanity check
results.groupby(unique_exp_columns).apply(lambda x: len(x) == 1).value_counts()

True    54267
dtype: int64

In [15]:
def chart_bar_with_error_bars(df, improvement_col: str, groupby: str = "train_datasets"):
    xs, ys, error_y_min, error_y_max = [], [], [], []
    for key, subdf in df.groupby(groupby):
        sorted_improvement = sorted(subdf[improvement_col].values)
        # print(sorted_improvement)
        xs.append(key)
        ys.append(sorted_improvement[1])
        error_y_min.append(abs(sorted_improvement[0]-sorted_improvement[1]))
        error_y_max.append(abs(sorted_improvement[-1]-sorted_improvement[1]))

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=xs,
            y=ys,
            error_y=dict(
                type="data",
                symmetric=False,
                array=error_y_max,
                arrayminus=error_y_min
            )
        )
    )
    return fig

def chart_bar_with_side_by_side(df, improvement_col: str, groupby: str = "train_datasets", inner_group: str = "estimator"):
    fig = go.Figure()

    for key, subdf in df.groupby(groupby):
        xs, ys = [], []
        for i, (inner_key, inner_subdf) in enumerate(subdf.groupby(inner_group)):
            xs.append(str(inner_key))
            ys.append(inner_subdf[improvement_col].values[0])
        
        fig.add_trace(
            go.Bar(
                y=ys,
                x=xs,
                name=str(key),
            )
        )
    return fig

## Pergunta 1

### a) Qual o impacto no desempenho dos modelos de ML quando o dado de entrada é transformado com o UMAP?

Compararemos, para cada tupla <dataset DS; domíno D; modelo de ML M>, qual é a diferença (razão) de desempenho do modelo M na tarefa de HAR com o dado do dataset DS no domínio D com e sem redução de dimensionalidade.

In [70]:
def get_umap_improvement_over_no_umap(df, new_column_name: str = "umap improvement"):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reducer_datasets", "umap components"]]

    zero_umap = {}
    for _, subdf in df[df["umap components"] == 0].iterrows():
        zero_umap[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df[new_column_name] = df.apply(lambda x: x["accuracy (mean)"] / zero_umap.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    return df

def filter_max_set_only(df, column: str = "umap improvement"):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reducer_datasets", "umap components"]]
    df = df.dropna(subset=[column], inplace=False)
    df = df[df["umap components"] > 0]
    lines = [subdf[subdf[column] == subdf[column].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    return df

df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_intra(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
df = get_umap_improvement_over_no_umap(df)
df = filter_max_set_only(df)

fig = chart_bar_with_error_bars(df, improvement_col="umap improvement", groupby="train_datasets")
fig.add_hline(y=1.0, line_dash="dash", line_color="magenta")
fig.update_layout(
    title="UMAP Improvement for each dataset (best UMAP over no UMAP, per dataset).<br>Error bars comes from different estimators",
    xaxis_title="Dataset",
    yaxis_title="Accuracy Improvement over no UMAP",
    width=800,
    height=600
)
fig.show()

fig = chart_bar_with_side_by_side(df, improvement_col="umap improvement", groupby="estimator", inner_group="train_datasets")
fig.add_hline(y=1.0, line_dash="dash", line_color="magenta")
fig.update_layout(
    title="UMAP Improvement for each dataset (best UMAP over no UMAP, per dataset)",
    xaxis_title="Dataset",
    yaxis_title="Accuracy Improvement over no UMAP",
    legend_title="Estimator",
    width=800,
    height=600
)
fig.show()

### b) Qual é o impacto da dimensionalidade alvo do UMAP no desempenho dos modelos de ML?

b .1) UMAP vs No-UMAP

In [84]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_intra(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
df = get_umap_improvement_over_no_umap(df)
df = df[df["umap components"] > 0]

dset_markers = {
    name: markers[i]
    for i, name in enumerate(df["train_datasets"].unique())
}

estimator_colors = {
    name: colors[i]
    for i, name in enumerate(df["estimator"].unique())
}

fig = go.Figure()
for (estimator, dataset), subdf in df.groupby(["estimator", "train_datasets"]):
    subdf = subdf.sort_values(by="umap components")
    xs, ys = [], []
    for n_components in sorted(subdf["umap components"].unique()):
        xs.append(n_components)
        ys.append(subdf[subdf["umap components"] <= n_components]["umap improvement"].max())
    fig.add_trace(
        go.Scatter( 
            x=xs,
            y=ys,
            name=f"{estimator} {dataset}",
            mode="lines+markers",
            legendgroup=estimator,
            # legendgrouptitle="Estimator",
            marker=dict(
                color=estimator_colors[estimator],
                symbol=dset_markers[dataset],
                size=8,
            ),
        )
    )
    # subdf["best umap improvement so far"] = df["umap improvement"].expanding().max()

fig.update_layout(
    title="UMAP Improvement for each dataset (best UMAP over no UMAP, per dataset)",
    xaxis_title="Maximum UMAP Components",
    yaxis_title="Accuracy improvement over no UMAP",
    legend_title="Estimator",
    width=1200,
    height=600,
    xaxis=dict(tickmode="linear")
)
fig.show()

b.2) UMAP vs best UMAP

In [75]:
def get_umap_improvement_over_best_umap(df, new_column_name: str = "umap improvement"):
    df = df[df["umap components"] != 0]
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reducer_datasets", "umap components"]]

    best_umap = {}
    for key, subdf in df.groupby(experiment_uniq):
        best_umap[key] = subdf["accuracy (mean)"].max()

    for key, subdf in df.groupby(experiment_uniq):
        df.loc[subdf.index, new_column_name] = subdf["accuracy (mean)"] / best_umap[key]

    return df

In [85]:
df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_view_standardized_intra(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
df = get_umap_improvement_over_best_umap(df)

dset_markers = {
    name: markers[i]
    for i, name in enumerate(df["train_datasets"].unique())
}

estimator_colors = {
    name: colors[i]
    for i, name in enumerate(df["estimator"].unique())
}

fig = go.Figure()
for (estimator, dataset), subdf in df.groupby(["estimator", "train_datasets"]):
    subdf = subdf.sort_values(by="umap components")
    xs, ys = [], []
    for n_components in sorted(subdf["umap components"].unique()):
        xs.append(n_components)
        ys.append(subdf[subdf["umap components"] <= n_components]["umap improvement"].max())
    fig.add_trace(
        go.Scatter( 
            x=xs,
            y=ys,
            name=f"{estimator} {dataset}",
            mode="lines+markers",
            legendgroup=estimator,
            # legendgrouptitle="Estimator",
            marker=dict(
                color=estimator_colors[estimator],
                symbol=dset_markers[dataset],
                size=7,
            ),
        )
    )
    # subdf["best umap improvement so far"] = df["umap improvement"].expanding().max()

fig.update_layout(
    title="UMAP Improvement for each dataset (UMAP over best UMAP, per dataset)",
    xaxis_title="Maximum UMAP Components",
    yaxis_title="Accuracy improvement best UMAP",
    legend_title="Estimator",
    width=1200,
    height=600,
    xaxis=dict(tickmode="linear")
)
fig.show()

### Pergunta 1.c

In [137]:
experiment_uniq = [c for c in unique_exp_columns if c not in ["train_datasets", "view"]]

df = results.copy()
df = filter_domain_fft_only(df)
df = filter_features_all(df)
df = filter_reducer_over_all(df)
df = filter_scaler_none(df)
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)

fig = go.Figure()


for i, (dset_name, train_df) in enumerate(df.groupby(["train_datasets"])):
    xs, ys = [], []
    for j, (view_name, subdf) in enumerate(train_df.groupby("view")):
        umap_0_max = subdf[subdf["umap components"] == 0]["accuracy (mean)"].max()
        xs.append(f"{dset_name} {view_name.split('_')[0]} (no umap)")
        ys.append(umap_0_max)
    
    fig.add_trace(
        go.Bar(
            x=xs,
            y=ys,
            name=f"{dset_name} no umap",
            marker=dict(color=colors[0]),
            legendgroup="no umap",
        )
    )
    xs, ys = [], []
    for j, (view_name, subdf) in enumerate(train_df.groupby("view")):
        umap_best_max = subdf[subdf["umap components"] > 0]["accuracy (mean)"].max()
        xs.append(f"{dset_name} {view_name.split('_')[0]} (best umap)")
        ys.append(umap_best_max)
    
    fig.add_trace(
        go.Bar(
            x=xs,
            y=ys,
            name=f"{dset_name} best umap",
            marker=dict(color=colors[1]),
            legendgroup="best umap",
        )
    )

fig.update_layout(
    title="Impact of standardization process on UMAP",
    xaxis_title="Dataset view",
    yaxis_title="Accuracy of best UMAP and best classifier",
    legend_title="Estimator",
    width=1200,
    height=800,
    xaxis=dict(tickangle=90)
)

# fig.update_xaxes(tickangle=75)
fig.show()





## Pergunta 2

## Pergunta 3

## Pergunta 4

## Pergunta 5

In [None]:
# Miss data

def get_reduce_on_improvement(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["reduce_on", "reducer", "umap components"]]
    df = df[df["umap components"] > 0]

    all_values = {}
    for _, subdf in df[df["reduce_on"] == "all"].iterrows():
        all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    df = df.dropna(subset=["umap improvement"], inplace=False)
    df = df[df["reduce_on"] != "all"]
    lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    return df

# results = add_umap_improvement(results)
#df = only_standardized_view(no_scaler(using_all_features(only_fft(only_reduce_on_all(only_reducer_equals_train(df))))))

In [86]:
df = filter_view_standardized_intra(filter_scaler_none(filter_features_all(filter_domain_fft_only(results))))
df = get_reduce_on_improvement(df)
df 

# fig = bar_improvement_1(df, improvement_col="umap improvement", groupby=["train_datasets", "reduce_on"])
# fig.update_layout(
#     title="UMAP Improvement",
#     xaxis_title="Dataset",
#     yaxis_title="UMAP Improvement",
#     legend_title="UMAP Components",
#     width=800,
#     height=600
# )
# fig.show()


fig = chart_bar_with_side_by_side(df, improvement_col="umap improvement", groupby="estimator", inner_group=["train_datasets", "reduce_on"])
fig.update_layout(
    title="Improvement using multiple UMAPs to train per sensor and per axis over using all features",
    xaxis_title="reduce_on / dataset",
    yaxis_title="Improvement over using all features",
    legend_title="Estimator",
    width=1200,
    height=600
)
fig.show()


NameError: name 'get_reduce_on_improvement' is not defined

## Pergunta 6

In [None]:
# def get_estimator_improvement(df):
#     experiment_uniq = [c for c in unique_exp_columns if c not in ["estimator"]]
#     df = df[df["umap components"] > 0]

#     all_values = {}
#     for _, subdf in df[df["reduce_on"] == "all"].iterrows():
#         all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

#     df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
#     df = df.dropna(subset=["umap improvement"], inplace=False)
#     df = df[df["reduce_on"] != "all"]
#     lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
#     df = pd.DataFrame(lines)
#     return df

# # results = add_umap_improvement(results)
# #df = only_standardized_view(no_scaler(using_all_features(only_fft(only_reduce_on_all(only_reducer_equals_train(df))))))

## Pergunta 7

In [None]:
def get_transform_improvement_umap_0(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["transforms", "umap components"]]

    df = df[df["umap components"] == 0]
    lines = [subdf[subdf["accuracy (mean)"] == subdf["accuracy (mean)"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    

    # df = df[df["umap components"] > 0]

    # all_values = {}
    # for _, subdf in df[df["reduce_on"] == "all"].iterrows():
    #     all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    # df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    # df = df.dropna(subset=["umap improvement"], inplace=False)
    # df = df[df["reduce_on"] != "all"]
    # lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    # df = pd.DataFrame(lines)
    return df


# results = add_umap_improvement(results)
#df = only_standardized_view(no_scaler(using_all_features(only_fft(only_reduce_on_all(only_reducer_equals_train(df))))))

In [None]:
df["transforms"].value_counts()

In [None]:
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(filter_scaler_none(filter_reducer_over_all(filter_features_all(filter_view_standardized_intra(results)))))
df = get_transform_improvement_umap_0(df)
# print(df)
df 

fig = chart_bar_with_side_by_side(df, improvement_col="accuracy (mean)", groupby="estimator", inner_group=["transforms", "train_datasets"])
fig.update_layout(
    title="Impact of domain (FFT, time)",
    xaxis_title="transform / dataset",
    yaxis_title="Accuracy (mean)",
    legend_title="Estimator",
    width=1200,
    height=600
)
fig.show()


## Pergunta 8

## Pergunta 9

In [87]:
def get_scaler_improvement(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["scaler", "scale_on", "umap components"]]
    df = df[df["umap components"] > 0]

    all_values = {}
    for _, subdf in df[df["scaler"] == ""].iterrows():
        all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    df = df.dropna(subset=["umap improvement"], inplace=False)
    df = df[df["scaler"] != ""]
    lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    return df

In [88]:
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(filter_reducer_over_all(filter_features_all(filter_view_standardized_intra(filter_domain_fft_only(results)))))
df = get_scaler_improvement(df)

fig = chart_bar_with_side_by_side(df, improvement_col="accuracy (mean)", groupby="estimator", inner_group=["train_datasets", "scaler"])
fig.update_layout(
    title="Improvement using different scalers (MinMax, StandardScaler), in relation of using no scaler",
    xaxis_title="scaler / dataset",
    yaxis_title="Improvement over using no scaler",
    legend_title="Estimator",
    width=1200,
    height=600
)
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



KeyError: 'estimator'

## Pergunta 10

In [None]:
def get_in_use_features_improvement(df):
    experiment_uniq = [c for c in unique_exp_columns if c not in ["in_use_features", "umap components"]]
    df = df[df["umap components"] > 0]

    all_values = {}
    for _, subdf in filter_features_all(df).iterrows():
        all_values[tuple(subdf[experiment_uniq].values.tolist())] = subdf["accuracy (mean)"]

    df["umap improvement"] = df.apply(lambda x: x["accuracy (mean)"] / all_values.get(tuple(x[experiment_uniq].values.tolist()), np.nan), axis=1)
    df = df.dropna(subset=["umap improvement"], inplace=False)
    df = df.loc[
        ~(df["in_use_features"].str.contains("accel-y")
        & df["in_use_features"].str.contains("accel-z")
        & df["in_use_features"].str.contains("gyro-x")
        & df["in_use_features"].str.contains("gyro-y")
        & df["in_use_features"].str.contains("gyro-z"))
    ]
    # df = df[df["scaler"] != ""]
    lines = [subdf[subdf["umap improvement"] == subdf["umap improvement"].max()].iloc[0] for _, subdf in df.groupby(experiment_uniq)]
    df = pd.DataFrame(lines)
    return df

In [None]:
df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(filter_reducer_over_all(filter_scaler_none(filter_view_standardized_intra(filter_domain_fft_only(results)))))
df = get_in_use_features_improvement(df)

fig = chart_bar_with_side_by_side(df, improvement_col="accuracy (mean)", groupby="estimator", inner_group=["train_datasets", "in_use_features"])
fig.update_layout(
    title="Improvement of using only accelerometer or using only gyro, in relation to using all features",
    xaxis_title="in_use_features / dataset",
    yaxis_title="Improvement over using all features",
    legend_title="Estimator",
    width=1200,
    height=600
)
fig.show()


## Pergunta 1: Qual é o impacto do UMAP na capacidade de discriminação dos modelos de ML na tarefa de HAR?

1. O desempenho dos 3 modelos de ML com o experimento realizado com e sem o UMAP
2. (11) O impacto da normatização no resultado

**OBS**: 
- Adicionar dimensões alvo (n/2) e n
- Colocar o resultado do 0 para o final
- Grafico com a média de todos os graficos (juntar diferentes datasets)
- Adicionar resultado sem UMAP para gráfico de comparação
- Adicionar grafos com o best UMAP


In [None]:
experiment_uniq = [c for c in unique_exp_columns if c not in ["umap components", "reducer"]]
results.groupby(experiment_uniq).apply(lambda x: len(x)).value_counts()

In [None]:
from typing import Tuple


def best_umap_improvement(df, group_by: str | Tuple[str] = ()):
    best = df[df["umap components"] == 0].iloc[0]
    best_umap = df[df["umap components"] != 0].iloc[0]
    return best_umap["accuracy"] - best["accuracy"]

In [None]:
def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]


def improvement_over_baseline(df):
    df = filter_reducer_over_all(df)
    df = filter_features_all(df)
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
    df = only_standardized_view(df)

    for (train, test), subdf in df.groupby(
        [
            "train_datasets",
            "test_datasets",
        ]
    ):
        fig = go.Figure()
        for (estimator, transform), subsubdf in subdf.groupby(
            ["estimator", "transforms"]
        ):
            # subsubdf = pd.concat([subsubdf.sort_values("umap components").iloc[1:], subsubdf.sort_values("umap components").iloc[0]])
            subsubdf = subsubdf.sort_values("umap components") 

            transform = transform if transform else 'no transform'
            fig.add_trace(
                go.Scatter(
                    x=subsubdf["umap components"],
                    y=subsubdf["accuracy (mean)"]*100,
                    error_y=dict(array=subsubdf["accuracy (std)"]*100),
                    mode="lines+markers",
                    name=f"{estimator} {transform if transform else 'no transform'}",
                    legendgroup=transform
                )
            )


        fig.update_layout(
            title=f"{train}",
            xaxis_title="UMAP components",
            yaxis_title="Accuracy (mean 10 runs)",
            yaxis_range=[0, 100],
            xaxis = dict(
                tickmode = 'linear',
            ),
            autosize=True,
            width=1200,
            height=600
        )
        fig.show()
        # break

    return df


_ = improvement_over_baseline(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


In [None]:
def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]


def improvement_over_baseline(df):
    df = filter_reducer_over_all(df)
    df = filter_features_all(df)
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
    df = only_standardized_view(df)
    df = df[df["umap components"] > 1]

    df["best umap accuracy"] = df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
    df["umap best improvement"] = df["accuracy (mean)"] / df["best umap accuracy"]

    # display(HTML(df.to_html()))

    for (train, test), subdf in df.groupby(
        [
            "train_datasets",
            "test_datasets",
        ]
    ):
        fig = go.Figure()
        for (estimator, transform), subsubdf in subdf.groupby(
            ["estimator", "transforms"]
        ):
            # subsubdf = pd.concat([subsubdf.sort_values("umap components").iloc[1:], subsubdf.sort_values("umap components").iloc[0]])
            subsubdf = subsubdf.sort_values("umap components") 

            transform = transform if transform else 'no transform'
            fig.add_trace(
                go.Scatter(
                    x=subsubdf["umap components"],
                    y=subsubdf["umap best improvement"],
                    # error_y=dict(array=subsubdf["accuracy (std)"]*100),
                    mode="lines+markers",
                    name=f"{estimator} {transform if transform else 'no transform'}",
                    # legendgroup=transform
                )
            )


        fig.update_layout(
            title=f"{train}",
            xaxis_title="UMAP components",
            yaxis_title="Accuracy (mean 10 runs)",
            # yaxis_range=[0, 1],
            xaxis = dict(
                tickmode = 'linear',
            ),
            autosize=True,
            width=1200,
            height=600
        )
        fig.show()
        # break

    return df


_ = improvement_over_baseline(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


In [None]:
from collections import defaultdict

def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]


def improvement_over_baseline(df):
    df = filter_reducer_over_all(df)
    df = filter_features_all(df)
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
    df = only_standardized_view(df)


    df["best umap accuracy"] = df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
    df["umap best improvement"] = df["accuracy (mean)"] / df["best umap accuracy"]

    bests_so_far = defaultdict(list)

    for index, subdf  in df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"]):
        for i in range(2, 11, 1):
            xdf = subdf[(subdf["umap components"] <= i) & (subdf["umap components"] > 1)]
            # bests_so_far[i] = xdf["accuracy (mean)"].max()
            bests_so_far[(index[0], index[2], index[3])].append(xdf["accuracy (mean)"].max())

        # xdf = df[df["umap components"] <= i]
        # df[f"best umap accuracy {i}"] = xdf.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
        # # df[f"umap best improvement {i}"] = xdf["accuracy (mean)"] / xdf["best umap accuracy"]
        # bests_so_far[i] = xdf["accuracy (mean)"].max()
        # df[f"umap best improvement {i}"] = xdf["accuracy (mean)"] / xdf["best umap accuracy"]

    # print(bests_so_far)

    fig = go.Figure()

    for name, acc in bests_so_far.items():
        if "FFT" not in name[2]:
            continue
        # print(name, acc)
        fig.add_trace(
            go.Scatter(
                x=list(range(2, 11, 1)),
                y=[a/acc[-1] for a in acc],
                mode="lines+markers",
                name=f"{name}",
                legendgroup=f"({name[0]}, {name[2]})"
                # showlegend=False
            )
        )

    fig.update_layout(
        xaxis_title="Maximum UMAP components",
        yaxis_title="Accuracy improvement (relation to best)",
        # yaxis_range=[0, 1],
        xaxis = dict(
            tickmode = 'linear',
        ),
        autosize=True,
        width=1200,
        height=600
    )
    # fig.show()

    fig.show()
    return df



_ = improvement_over_baseline(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


In [None]:
from collections import defaultdict

def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]


def improvement_over_baseline(df):
    df = filter_reducer_over_all(df)
    df = filter_features_all(df)
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
    df = only_standardized_view(df)


    df["best umap accuracy"] = df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
    df["umap best improvement"] = df["accuracy (mean)"] / df["best umap accuracy"]

    bests_so_far = defaultdict(list)

    for index, subdf  in df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"]):
        for i in range(2, 11, 1):
            xdf = subdf[(subdf["umap components"] <= i) & (subdf["umap components"] > 1)]
            xdf2 = subdf[(subdf["umap components"] == 0)]
            # bests_so_far[i] = xdf["accuracy (mean)"].max()
            bests_so_far[(index[0], index[2], index[3])].append(xdf["accuracy (mean)"].max() / xdf2["accuracy (mean)"].max())

        # xdf = df[df["umap components"] <= i]
        # df[f"best umap accuracy {i}"] = xdf.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
        # # df[f"umap best improvement {i}"] = xdf["accuracy (mean)"] / xdf["best umap accuracy"]
        # bests_so_far[i] = xdf["accuracy (mean)"].max()
        # df[f"umap best improvement {i}"] = xdf["accuracy (mean)"] / xdf["best umap accuracy"]

    # print(bests_so_far)

    fig = go.Figure()
    # for name, acc in bests_so_far.items():
    #     if "FFT" not in name[2]:
    #         continue
    #     # print(name, acc)
    #     xs = [11]
    #     ys = [acc[-1]]

    #     fig.add_trace(
    #         go.Scatter(
    #             x=xs,
    #             y=[acc[-1] for a in acc],
    #             mode="lines+markers",
    #             name=f"{name}",
    #             legendgroup=f"({name[0]}, {name[2]})"
    #             # showlegend=False
    #         )
    #     )

    fig.add_trace(
        go.Histogram(
            x=[acc[-1] for name, acc in bests_so_far.items() if "FFT" not in name[2]], 
            xbins=dict(size=0.05)
        )
    )

    fig.update_layout(
        xaxis_title="Maximum UMAP components",
        yaxis_title="Accuracy improvement (relation to no-UMAP)",
        # yaxis_range=[0, 1],
        xaxis = dict(
            tickmode = 'linear',
        ),
        autosize=True,
        width=1200,
        height=600
    )
    # fig.show()

    fig.show()
    return df


    # display(HTML(df.to_html()))

    for (train, test), subdf in df.groupby(
        [
            "train_datasets",
            "test_datasets",
        ]
    ):
        fig = go.Figure()
        for (estimator, transform), subsubdf in subdf.groupby(
            ["estimator", "transforms"]
        ):
            # subsubdf = pd.concat([subsubdf.sort_values("umap components").iloc[1:], subsubdf.sort_values("umap components").iloc[0]])
            subsubdf = subsubdf.sort_values("umap components") 

            transform = transform if transform else 'no transform'
            for i in range(1, 10, 1):
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf[f"best umap accuracy {i}"],
                        # error_y=dict(array=subsubdf["accuracy (std)"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=transform
                    )
                )
            break


        fig.update_layout(
            title=f"{train}",
            xaxis_title="UMAP components",
            yaxis_title="Accuracy (mean 10 runs)",
            # yaxis_range=[0, 1],
            xaxis = dict(
                tickmode = 'linear',
            ),
            autosize=True,
            width=1200,
            height=600
        )
        fig.show()
        # break

    return df


_ = improvement_over_baseline(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


In [None]:
from collections import defaultdict

def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]


def improvement_over_baseline(df):
    df = filter_reducer_over_all(df)
    df = filter_features_all(df)
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
    df = only_standardized_view(df)


    df["best umap accuracy"] = df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
    df["umap best improvement"] = df["accuracy (mean)"] / df["best umap accuracy"]

    bests_so_far = defaultdict(list)

    for index, subdf  in df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"]):
        for i in range(2, 11, 1):
            xdf = subdf[(subdf["umap components"] <= i) & (subdf["umap components"] > 1)]
            xdf2 = subdf[(subdf["umap components"] == 0)]
            # bests_so_far[i] = xdf["accuracy (mean)"].max()
            bests_so_far[(index[0], index[2], index[3])].append(xdf["accuracy (mean)"].max() / xdf2["accuracy (mean)"].max())

        # xdf = df[df["umap components"] <= i]
        # df[f"best umap accuracy {i}"] = xdf.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)
        # # df[f"umap best improvement {i}"] = xdf["accuracy (mean)"] / xdf["best umap accuracy"]
        # bests_so_far[i] = xdf["accuracy (mean)"].max()
        # df[f"umap best improvement {i}"] = xdf["accuracy (mean)"] / xdf["best umap accuracy"]

    # print(bests_so_far)

    fig = go.Figure()
    for name, acc in bests_so_far.items():
        if "FFT" not in name[2]:
            continue
        # print(name, acc)
        # fig.add_trace(
        #     go.Scatter(
        #         x=list(range(2, 11, 1)),
        #         # y=[acc[-1] for a in acc],
        #         y=acc,
        #         mode="lines+markers",
        #         name=f"{name}",
        #         legendgroup=f"({name[0]}, {name[2]})"
        #         # showlegend=False
        #     )
        # )

        fig.add_trace(
            go.Bar(
                x=[10],
                y=[acc[-1] for a in acc],
                # y=acc,
                name=f"{name}",
                legendgroup=f"({name[0]}, {name[2]})"
                # showlegend=False
            )
        )

    # plot a bar plot using plotly




    fig.update_layout(
        xaxis_title="Maximum UMAP components",
        yaxis_title="Accuracy improvement (relation to no-UMAP)",
        # yaxis_range=[0, 1],
        xaxis = dict(
            tickmode = 'linear',
        ),
        autosize=True,
        width=1200,
        height=600
    )
    # fig.show()

    fig.show()
    return df

_ = improvement_over_baseline(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


## Pergunta 2: Como a escolha do dataset no treinamento do UMAP afeta os resultados da tarefa?

In [None]:
def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]

def improvement_dset_choice_1(df):
    # df = only_reduce_on_all(df)
    # df = using_all_features(df)
    # df = only_fft(df)
    # df = no_scaler(df)
    df = only_standardized_view(df)
    df = df[df["umap components"] > 4]

    colors_map = {
        "plotly-blue": px.colors.qualitative.Plotly[0],
        "plotly-red": px.colors.qualitative.Plotly[1],
        "plotly-green": px.colors.qualitative.Plotly[2],
        "plotly-purple": px.colors.qualitative.Plotly[3],
        "plotly-orange": px.colors.qualitative.Plotly[4],
        "plotly-cyan": px.colors.qualitative.Plotly[5],
        "plotly-pink": px.colors.qualitative.Plotly[6],
        "plotly-lightgreen": px.colors.qualitative.Plotly[7],
        "plotly-lightpink": px.colors.qualitative.Plotly[8],
        "plotly-yellow": px.colors.qualitative.Plotly[9],
    }


    colors = list(colors_map.values())

    for i, ((estimator), subdf) in enumerate(df.groupby(["estimator"])):
        fig = go.Figure()

        for i, (n_components, subsubdf) in enumerate(subdf.groupby("umap components")):
            subsubdf = subsubdf.sort_values("accuracy (mean)")
            fig.add_trace(
                go.Scatter(
                    y=np.arange(len(subsubdf)),
                    x=subsubdf["accuracy (mean)"]*100,
                    mode="lines",
                    name=f"UMAP components: {n_components}",
                    line=dict(color=colors[i], width=1),
                )
            )
            fig["layout"]["xaxis"].update(title_text="Accuracy (%)")
            fig["layout"]["yaxis"].update(title_text="Number of experiments")
            

        fig.update_layout(
            autosize=True,
            width=2400,
            height=600,
            title=f"Estimator: {estimator}"
        )
        fig.show()

    return df
        
_ = improvement_dset_choice_1(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


In [None]:
def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]


def only_raw_view(df):
    return df[df["view"] == "raw_balanced"]

def improvement_dset_choice_2(df):
    # df = only_reduce_on_all(df)
    # df = using_all_features(df)
    # df = only_fft(df)
    # df = no_scaler(df)
    df = only_standardized_view(df)
    df_0 = filter_only_reducer_dataset_equals_train_dataset_or_no_reducer(df)
    df_0 = df[df["umap components"] == 0]
    df = df[df["umap components"] > 4]

    colors_map = {
        "plotly-blue": px.colors.qualitative.Plotly[0],
        "plotly-red": px.colors.qualitative.Plotly[1],
        "plotly-green": px.colors.qualitative.Plotly[2],
        "plotly-purple": px.colors.qualitative.Plotly[3],
        "plotly-orange": px.colors.qualitative.Plotly[4],
        "plotly-cyan": px.colors.qualitative.Plotly[5],
        "plotly-pink": px.colors.qualitative.Plotly[6],
        "plotly-lightgreen": px.colors.qualitative.Plotly[7],
        "plotly-lightpink": px.colors.qualitative.Plotly[8],
        "plotly-yellow": px.colors.qualitative.Plotly[9],
    }


    merge_feat = [
        "scale_on",
        "reduce_on",
        "transforms",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

    rows = []
    for idx, row in df.iterrows():
        r = df_0.loc[
            (df_0["estimator"] == row["estimator"]) &
            (df_0["scale_on"] == row["scale_on"]) &
            (df_0["reduce_on"] == row["reduce_on"]) &
            (df_0["transforms"] == row["transforms"]) &
            (df_0["train_datasets"] == row["train_datasets"]) &
            (df_0["test_datasets"] == row["test_datasets"]) &
            (df_0["estimator"] == row["estimator"]) 
        ].iloc[0]
        row["accuracy improvement (mean)"] = row["accuracy (mean)"]/r["accuracy (mean)"]
        rows.append(row)

    df = pd.DataFrame(rows)

    colors = list(colors_map.values())

    for i, ((estimator), subdf) in enumerate(df.groupby(["estimator"])):
        fig = go.Figure()

        for i, (n_components, subsubdf) in enumerate(subdf.groupby("umap components")):
            subsubdf = subsubdf.sort_values("accuracy improvement (mean)")
            fig.add_trace(
                go.Scatter(
                    x=np.arange(len(subsubdf)),
                    y=subsubdf["accuracy improvement (mean)"],
                    mode="lines",
                    name=f"UMAP components: {n_components}",
                    line=dict(color=colors[i], width=1),
                )
            )
            fig["layout"]["yaxis"].update(title_text="Accuracy improvement (%)")
            fig["layout"]["xaxis"].update(title_text="Number of experiments")
            

        fig.update_layout(
            autosize=True,
            width=2400,
            height=600,
            title=f"Estimator: {estimator}"
        )
        fig.show()

    return df
        
_ = improvement_dset_choice_2(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


## Pergunta 3: É possível usar Manifold Learning para reduzir o domain shift?

## Pergunta 4: Qual a importância da normatização nos experimentos inter-dataset

## Pergunta 5: É mais vantajoso reduzir a dimensionalidade dos dados dos sensores de forma individual (aplicando o UMAP a um sensor, ou eixo, por vez) ou de forma agregada (como estamos fazendo)

Qual abordagem é a mais interessante para a generalização de um dataset fonte para um dataset alvo? 
1. Espaço latente unificado partindo dos dados de sensores concatenados;
2. União de espaços latentes gerados separadamente por sensor;
3. União dos espaços latentes gerados por eixo e por sensor (ou seja, 6 projeções UMAP independentes).

**OBS**:
- Ajustar a diferença para comparar com o numero de features final

In [None]:
def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]

def improvement_over_umap_all(df):
    df = filter_features_all(df)
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset(df)
    df = only_standardized_view(df)
    df = filter_domain_fft_only(df)
    umap_all = filter_reducer_over_all(df.copy())
    umap_sensor = filter_reducer_over_sensor(df.copy())
    umap_axis = filter_reducer_over_axis(df.copy())

    merge_feat = [
            "in_use_features",
            "scale_on",
            "transforms",
            "scaler",
            "reducer",
            "umap components",
            "reducer_datasets",
            "train_datasets",
            "test_datasets",
            "estimator",
        ]
        
    merged_df_sensor = umap_all.merge(umap_sensor, on=merge_feat, suffixes=("_all", "_other")).dropna()
    merged_df_sensor["diff"] = merged_df_sensor["accuracy (mean)_other"] - merged_df_sensor["accuracy (mean)_all"]
    merged_df_axis = umap_all.merge(umap_axis, on=merge_feat, suffixes=("_all", "_other")).dropna()
    merged_df_axis["diff"] = merged_df_axis["accuracy (mean)_other"] - merged_df_axis["accuracy (mean)_all"]

    colors_map = {
        "plotly-blue": px.colors.qualitative.Plotly[0],
        "plotly-red": px.colors.qualitative.Plotly[1],
        "plotly-green": px.colors.qualitative.Plotly[2],
        "plotly-purple": px.colors.qualitative.Plotly[3],
        "plotly-orange": px.colors.qualitative.Plotly[4],
        "plotly-cyan": px.colors.qualitative.Plotly[5],
        "plotly-pink": px.colors.qualitative.Plotly[6],
        "plotly-lightgreen": px.colors.qualitative.Plotly[7],
        "plotly-lightpink": px.colors.qualitative.Plotly[8],
        "plotly-yellow": px.colors.qualitative.Plotly[9],
    }

    colors = list(colors_map.values())
    
    for the_df, name in [(merged_df_sensor, "sensor"), (merged_df_axis, "axis")]:
        for (train, test), subdf in the_df.groupby(
            [
                "train_datasets",
                "test_datasets",
            ]
        ):
            fig = make_subplots(rows=1, cols=3, subplot_titles=("all", name, f"difference (y = {name} - all)"))
            for i, (estimator, subsubdf) in enumerate(subdf.groupby(
                "estimator"
            )):
                subsubdf = subsubdf.sort_values("umap components")
                transform = "FFT"
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["accuracy (mean)_all"]*100,
                        error_y=dict(array=subsubdf["accuracy (std)_all"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=1
                )
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["accuracy (mean)_other"]*100,
                        error_y=dict(array=subsubdf["accuracy (std)_other"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=2
                )
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["diff"]*100,
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=3
                )

            fig["layout"]["xaxis1"].update(title_text="UMAP components", tickmode="linear")
            fig["layout"]["xaxis2"].update(title_text="UMAP components", tickmode="linear")
            fig["layout"]["xaxis3"].update(title_text="UMAP components", tickmode="linear")

            fig["layout"]["yaxis1"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis2"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis3"].update(title_text=f"Difference")

            fig.update_layout(
                autosize=True,
                width=2400,
                height=600,
                title=f"Dataset: {train}"
            )


            fig.show()


    return merged_df_sensor, merged_df_axis


_= improvement_over_umap_all(results.copy())


## Pergunta 6: Há uma mudança significativa entre os diferentes classificadores




In [None]:
results.groupby("estimator").mean()["accuracy (mean)"].sort_values(ascending=False)

In [None]:

merge_feat = [
    "scale_on",
    "reduce_on",
    "transforms",
    "reducer",
    "umap components",
    "reducer_datasets",
    "train_datasets",
    "test_datasets",
]

df = results.copy()
df["best umap accuracy"] = df.groupby(["train_datasets", "test_datasets", "estimator", "transforms"])["accuracy (mean)"].transform(max)

for estimator, xc in df.groupby("estimator"):
    for (key), y in filter_domain_fft_only(only_standardized_view(xc)).groupby(merge_feat):    
        # subsubdf.loc[subsubdf["estimator"] == estimator, "best umap accuracy"].mean()
        print(f"{estimator} {key} {y.loc[y['estimator'] == estimator, 'accuracy (mean)'].mean()/ df['best umap accuracy'].max()}") 

## Pergunta 7:  Melhor no domínio da frequência ou tempo (condicionada à abordagem - manifold learning com UMAP)? (ou wavelet?)

In [None]:
x = results.copy()
x.loc[x["transforms"] == "", "transforms"] = "no transform"
x.groupby("transforms").mean()["accuracy (mean)"].sort_values(ascending=False)

## Pergunta 8: Qual dataset (ou subconjunto de datasets) é mais propício para generalizar o reconhecimento de HAR?

## Pergunta 9: Normalização traz mais benefícios?

**OBS**:
- Reportar média e std dev dos resultados antes e depois da normalização

In [None]:
def only_minmax_scaler(df):
    return df[df["scaler"] == "MinMaxScaler"]

def only_std_scaler(df):
    return df[df["scaler"] == "StandardScaler"]

def improvement_over_umap_scaler(df):
    df = filter_features_all(df)
    df = filter_only_reducer_dataset_equals_train_dataset(df)
    df = only_standardized_view(df)
    df = filter_domain_fft_only(df)
    df = filter_reducer_over_all(df)
    umap_no_scaler = filter_scaler_none(df.copy())
    umap_minmax = only_minmax_scaler(df.copy())
    umap_std = only_std_scaler(df.copy())

    merge_feat = [
        "scale_on",
        "reduce_on",
        "transforms",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

        
    merged_df_minmax = umap_no_scaler.merge(umap_minmax, on=merge_feat, suffixes=("_all", "_other")).dropna()
    merged_df_minmax["diff"] = merged_df_minmax["accuracy (mean)_other"] - merged_df_minmax["accuracy (mean)_all"]
    merged_df_std = umap_no_scaler.merge(umap_std, on=merge_feat, suffixes=("_all", "_other")).dropna()
    merged_df_std["diff"] = merged_df_std["accuracy (mean)_other"] - merged_df_std["accuracy (mean)_all"]

    colors_map = {
        "plotly-blue": px.colors.qualitative.Plotly[0],
        "plotly-red": px.colors.qualitative.Plotly[1],
        "plotly-green": px.colors.qualitative.Plotly[2],
        "plotly-purple": px.colors.qualitative.Plotly[3],
        "plotly-orange": px.colors.qualitative.Plotly[4],
        "plotly-cyan": px.colors.qualitative.Plotly[5],
        "plotly-pink": px.colors.qualitative.Plotly[6],
        "plotly-lightgreen": px.colors.qualitative.Plotly[7],
        "plotly-lightpink": px.colors.qualitative.Plotly[8],
        "plotly-yellow": px.colors.qualitative.Plotly[9],
    }

    colors = list(colors_map.values())
    
    for the_df, name in [(merged_df_minmax, "minmax"), (merged_df_std, "z-norm")]:
        for (train, test), subdf in the_df.groupby(
            [
                "train_datasets",
                "test_datasets",
            ]
        ):
            fig = make_subplots(rows=1, cols=3, subplot_titles=("all", name, f"difference (y = {name} - no scaler)"))
            for i, (estimator, subsubdf) in enumerate(subdf.groupby(
                "estimator"
            )):

                subsubdf = subsubdf.sort_values("umap components")
                transform = "FFT"
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["accuracy (mean)_all"]*100,
                        error_y=dict(array=subsubdf["accuracy (std)_all"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=1
                )
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["accuracy (mean)_other"]*100,
                         error_y=dict(array=subsubdf["accuracy (std)_other"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=2
                )
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["diff"]*100,
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=3
                )

            fig["layout"]["xaxis1"].update(title_text="UMAP components", tickmode="linear")
            fig["layout"]["xaxis2"].update(title_text="UMAP components", tickmode="linear")
            fig["layout"]["xaxis3"].update(title_text="UMAP components", tickmode="linear")

            fig["layout"]["yaxis1"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis2"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis3"].update(title_text=f"Difference")

            fig.update_layout(
                autosize=True,
                width=2400,
                height=600,
                title=f"Dataset: {train}"
            )


            fig.show()


    return merged_df_minmax, merged_df_std


_ = improvement_over_umap_scaler(results.copy())


## Pergunta 10: Quais os sensores que ajudam mais: acelerômetro, giroscópio, ambos?

In [None]:
def only_standardized_view(df):
    return df[df["view"] == "standartized_intra_balanced"]

def improvement_over_umap_all_features(df):
    df = filter_scaler_none(df)
    df = filter_only_reducer_dataset_equals_train_dataset(df)
    df = only_standardized_view(df)
    df = filter_domain_fft_only(df)
    df = filter_reducer_over_all(df)
    umap_all = filter_features_all(df.copy())
    umap_accel = filter_features_accelerometer_only(df.copy())
    umap_gyro = filter_features_gyroscope_only(df.copy())

    merge_feat = [
        "scale_on",
        "reduce_on",
        "transforms",
        "scaler",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

        
    merged_df_accel = umap_all.merge(umap_accel, on=merge_feat, suffixes=("_all", "_other")).dropna()
    merged_df_accel["diff"] = merged_df_accel["accuracy (mean)_other"] - merged_df_accel["accuracy (mean)_all"]
    merged_df_gyro = umap_all.merge(umap_gyro, on=merge_feat, suffixes=("_all", "_other")).dropna()
    merged_df_gyro["diff"] = merged_df_gyro["accuracy (mean)_other"] - merged_df_gyro["accuracy (mean)_all"]

    colors_map = {
        "plotly-blue": px.colors.qualitative.Plotly[0],
        "plotly-red": px.colors.qualitative.Plotly[1],
        "plotly-green": px.colors.qualitative.Plotly[2],
        "plotly-purple": px.colors.qualitative.Plotly[3],
        "plotly-orange": px.colors.qualitative.Plotly[4],
        "plotly-cyan": px.colors.qualitative.Plotly[5],
        "plotly-pink": px.colors.qualitative.Plotly[6],
        "plotly-lightgreen": px.colors.qualitative.Plotly[7],
        "plotly-lightpink": px.colors.qualitative.Plotly[8],
        "plotly-yellow": px.colors.qualitative.Plotly[9],
    }

    colors = list(colors_map.values())
    
    for the_df, name in [(merged_df_accel, "accel"), (merged_df_gyro, "gyro")]:
        for (train, test), subdf in the_df.groupby(
            [
                "train_datasets",
                "test_datasets",
            ]
        ):
            fig = make_subplots(rows=1, cols=3, subplot_titles=("all", name, f"difference (y = {name} - all)"))
            for i, (estimator, subsubdf) in enumerate(subdf.groupby(
                "estimator"
            )):
                subsubdf = subsubdf.sort_values("umap components")
                transform = "FFT"
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["accuracy (mean)_all"]*100,
                        error_y=dict(array=subsubdf["accuracy (std)_all"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=1
                )
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["accuracy (mean)_other"]*100,
                        error_y=dict(array=subsubdf["accuracy (std)_other"]*100),
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=2
                )
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf["diff"]*100,
                        mode="lines+markers",
                        name=f"{estimator} {transform if transform else 'no transform'}",
                        legendgroup=estimator,
                        line=dict(color=colors[i], width=1),
                    ),
                    row=1, col=3
                )

            fig["layout"]["xaxis1"].update(title_text="UMAP components", tickmode="linear")
            fig["layout"]["xaxis2"].update(title_text="UMAP components", tickmode="linear")
            fig["layout"]["xaxis3"].update(title_text="UMAP components", tickmode="linear")

            fig["layout"]["yaxis1"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis2"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis3"].update(title_text=f"Difference")

            fig.update_layout(
                autosize=True,
                width=2400,
                height=600,
                title=f"Dataset: {train}"
            )


            fig.show()


    return merged_df_accel, merged_df_gyro


_ = improvement_over_umap_all_features(results.copy())
