In [1]:
from pathlib import Path
from typing import List, Sequence

import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Notebook-wide pandas display limits: truncate long frames, allow wide ones.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 1000)

In [3]:
# Let's define some constants

# Short names of the datasets referenced by the results table.
datasets = ["kuhar", "motionsense", "uci", "wisdm", "realworld"]

# Activity code -> human readable label (codes are the list positions).
labels_activity = dict(
    enumerate(
        [
            "sit",
            "stand",
            "walk",
            "stair up",
            "stair down",
            "run",
            "stair up and down",
        ]
    )
)

In [4]:
# Let's define some filters
def only_fft(df):
    """Keep the rows whose ``transforms`` value contains the substring "FFT"."""
    has_fft = df["transforms"].str.contains("FFT")
    return df[has_fft]


def only_time(df):
    """Keep the rows with an empty ``transforms`` value (no transform applied)."""
    untransformed = df["transforms"] == ""
    return df.loc[untransformed]


def using_all_features(df):
    """Keep the rows whose ``in_use_features`` mentions all six sensor axes."""
    features = df["in_use_features"]
    mask = features.str.contains("accel-x")
    # Fold the remaining axes in the same left-to-right order.
    for axis in ("accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"):
        mask = mask & features.str.contains(axis)
    return df[mask]


def using_only_accel(df):
    """Keep the rows that list every accelerometer axis and no gyroscope axis."""
    features = df["in_use_features"]
    mask = features.str.contains("accel-x")
    for axis in ("accel-y", "accel-z"):
        mask = mask & features.str.contains(axis)
    # Exclude any row mentioning a gyroscope axis.
    for axis in ("gyro-x", "gyro-y", "gyro-z"):
        mask = mask & ~features.str.contains(axis)
    return df[mask]


def using_only_gyro(df):
    """Keep the rows that list every gyroscope axis and no accelerometer axis."""
    features = df["in_use_features"]
    mask = ~features.str.contains("accel-x")
    # Exclude any row mentioning an accelerometer axis.
    for axis in ("accel-y", "accel-z"):
        mask = mask & ~features.str.contains(axis)
    for axis in ("gyro-x", "gyro-y", "gyro-z"):
        mask = mask & features.str.contains(axis)
    return df[mask]


def only_reduce_on_all(df):
    """Keep the rows whose ``reduce_on`` value is exactly "all"."""
    selected = df["reduce_on"] == "all"
    return df[selected]


def only_reduce_on_sensor(df):
    """Keep the rows whose ``reduce_on`` value is exactly "sensor"."""
    selected = df["reduce_on"] == "sensor"
    return df[selected]


def only_reduce_on_axis(df):
    """Keep the rows whose ``reduce_on`` value is exactly "axis"."""
    selected = df["reduce_on"] == "axis"
    return df[selected]


def only_rf(df):
    """Keep the rows whose estimator name contains "randomforest" (case-insensitive)."""
    estimator_name = df["estimator"].str.lower()
    return df[estimator_name.str.contains("randomforest")]


def only_svm(df):
    """Keep the rows whose estimator name contains "svm" (case-insensitive)."""
    estimator_name = df["estimator"].str.lower()
    return df[estimator_name.str.contains("svm")]


def only_knn(df):
    """Keep the rows whose estimator name contains "knn" (case-insensitive)."""
    estimator_name = df["estimator"].str.lower()
    return df[estimator_name.str.contains("knn")]


def no_scaler(df):
    """Keep the rows with an empty ``scaler`` value (no scaler configured)."""
    unscaled = df["scaler"] == ""
    return df[unscaled]

def min_max_scaler(df):
    """Keep the rows whose scaler name contains "minmaxscaler" (case-insensitive)."""
    scaler_name = df["scaler"].str.lower()
    return df[scaler_name.str.contains("minmaxscaler")]

def standard_scaler(df):
    """Keep the rows whose scaler name contains "standardscaler" (case-insensitive)."""
    scaler_name = df["scaler"].str.lower()
    return df[scaler_name.str.contains("standardscaler")]

def only_reducer_equals_train(df):
    """Keep the rows where the reducer was fitted on the same datasets used for training."""
    same_datasets = df["reducer_datasets"] == df["train_datasets"]
    return df[same_datasets]

def only_reducer_equals_train_or_no_reduce(df):
    """Keep the rows where the reducer datasets match the train datasets or are empty."""
    same_datasets = df["reducer_datasets"] == df["train_datasets"]
    no_reducer = df["reducer_datasets"] == ""
    return df[same_datasets | no_reducer]


def rename_datasets(
    df, columns: Sequence[str] = ("reducer_datasets", "train_datasets", "test_datasets")
):
    """Collapse dataset descriptors into sorted, de-duplicated short names.

    Each cell of the given columns is a comma separated list such as
    ``"kuhar.raw_balanced[train], kuhar.raw_balanced[validation]"``. Every
    item is stripped, cut at its first ``"."`` and the distinct prefixes are
    re-joined with ``", "`` in sorted order (``"kuhar"`` in the example).
    An empty string stays an empty string.

    Args:
        df: Frame holding the columns to rewrite; it is not modified.
        columns: Names of the string columns to rewrite.

    Returns:
        A copy of ``df`` with the selected columns rewritten.
    """

    def short_names(value: str) -> str:
        prefixes = {item.strip().split(".")[0] for item in value.split(",")}
        return ", ".join(sorted(prefixes))

    # Work column-wise on a copy: avoids a Python-level pass over every row
    # of the whole frame and leaves the other columns (and dtypes) untouched.
    df = df.copy()
    for col in columns:
        df[col] = df[col].map(short_names)
    return df


def add_view_name(df, new_column_name: str = "view"):
    """Add a column holding ``config_id`` without its last ``_``-separated token.

    The column is written onto ``df`` itself, which is also returned.
    """

    def drop_last_token(config_id):
        tokens = config_id.split("_")
        return "_".join(tokens[:-1])

    df[new_column_name] = df["config_id"].apply(drop_last_token)
    return df


def match_configs(df, new_column_name: str = "config_group"):
    """Tag rows that share a full experiment configuration with exactly one other row.

    Rows are grouped by every configuration column listed below. Each group
    holding exactly two rows receives a consecutive id (0, 1, ...) assigned
    in sorted group-key order; rows of any other group get NaN.

    The column is always (re)created, so it exists even when no pair is
    found and a re-run never keeps stale ids.

    Args:
        df: Results frame; modified in place.
        new_column_name: Name of the column receiving the group id.

    Returns:
        The same ``df`` object with ``new_column_name`` set (float dtype).
    """
    keys = [
        "in_use_features",
        "scale_on",
        "reduce_on",
        "transforms",
        "scaler",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

    df[new_column_name] = np.nan
    if df.empty:
        return df

    grouped = df.groupby(keys)
    group_size = grouped[keys[0]].transform("size")
    is_pair = (group_size == 2).to_numpy()
    if is_pair.any():
        # ngroup() numbers groups in iteration (sorted key) order; factorize
        # then renumbers only the paired groups consecutively from zero.
        pair_group_ids = grouped.ngroup().to_numpy()[is_pair]
        codes, _ = pd.factorize(pair_group_ids, sort=True)
        df.loc[is_pair, new_column_name] = codes.astype(float)
    return df


In [5]:
# Load the experiment results. Missing values become empty strings, which is
# what the filter helpers compare against (e.g. `df["scaler"] == ""`).
results_file = Path("results.csv")
results = pd.read_csv(results_file)
results = results.fillna("")
results

Unnamed: 0,experiment_name,run_name,config_id,reduce_size,train_size,test_size,in_use_features,scale_on,reduce_on,transforms,scaler,reducer,umap components,reducer_datasets,train_datasets,test_datasets,estimator,accuracy (mean),accuracy (std),f1-score macro (mean),f1-score macro (std),f1-score weighted (mean),f1-score weighted (std)
0,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,"wisdm.standartized_intra_balanced[train], wisd...",wisdm.standartized_intra_balanced[test],randomforest-100,0.670833,2.770128e-02,0.667083,3.039176e-02,0.667083,3.039176e-02
1,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,"wisdm.standartized_intra_balanced[train], wisd...",wisdm.standartized_intra_balanced[test],KNN-5,0.541667,1.110223e-16,0.540794,0.000000e+00,0.540794,0.000000e+00
2,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,"wisdm.standartized_intra_balanced[train], wisd...",wisdm.standartized_intra_balanced[test],SVM-rbf-C1.0,0.583333,1.110223e-16,0.521068,0.000000e+00,0.521068,0.000000e+00
3,reducer_comb_1,execution1,standartized_intra_balanced_00845,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,MinMaxScaler,umap-2,2,,"kuhar.standartized_intra_balanced[train], kuha...",kuhar.standartized_intra_balanced[test],randomforest-100,0.800694,1.812221e-02,0.795044,1.921549e-02,0.795044,1.921549e-02
4,reducer_comb_1,execution1,standartized_intra_balanced_00845,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,MinMaxScaler,umap-2,2,,"kuhar.standartized_intra_balanced[train], kuha...",kuhar.standartized_intra_balanced[test],KNN-5,0.493056,5.551115e-17,0.482479,0.000000e+00,0.482479,5.551115e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63172,reducer_comb_1,execution1,raw_balanced_02926,3978,3978,1062,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,FFT-centered,MinMaxScaler,umap-20,20,"motionsense.raw_balanced[train], motionsense.r...","motionsense.raw_balanced[train], motionsense.r...",motionsense.raw_balanced[test],KNN-5,0.883239,1.110223e-16,0.883064,1.110223e-16,0.883064,0.000000e+00
63173,reducer_comb_1,execution1,raw_balanced_02926,3978,3978,1062,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,FFT-centered,MinMaxScaler,umap-20,20,"motionsense.raw_balanced[train], motionsense.r...","motionsense.raw_balanced[train], motionsense.r...",motionsense.raw_balanced[test],SVM-rbf-C1.0,0.870998,1.110223e-16,0.871440,1.110223e-16,0.871440,1.110223e-16
63174,reducer_comb_1,execution1,standartized_intra_balanced_05531,0,1794,144,"accel-x, accel-y, accel-z",train,sensor,,StandardScaler,umap-5,5,,motionsense.standartized_intra_balanced[train]...,motionsense.standartized_intra_balanced[test],randomforest-100,0.798611,1.889093e-02,0.796915,1.948198e-02,0.796915,1.948198e-02
63175,reducer_comb_1,execution1,standartized_intra_balanced_05531,0,1794,144,"accel-x, accel-y, accel-z",train,sensor,,StandardScaler,umap-5,5,,motionsense.standartized_intra_balanced[train]...,motionsense.standartized_intra_balanced[test],KNN-5,0.562500,0.000000e+00,0.546329,0.000000e+00,0.546329,0.000000e+00


In [6]:
# Shorten dataset names, derive the view name and tag matching configurations.
results = (
    results.pipe(rename_datasets)
    .pipe(add_view_name)
    .pipe(match_configs)
)
results

Unnamed: 0,experiment_name,run_name,config_id,reduce_size,train_size,test_size,in_use_features,scale_on,reduce_on,transforms,scaler,reducer,umap components,reducer_datasets,train_datasets,test_datasets,estimator,accuracy (mean),accuracy (std),f1-score macro (mean),f1-score macro (std),f1-score weighted (mean),f1-score weighted (std),view,config_group
0,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,wisdm,wisdm,randomforest-100,0.670833,2.770128e-02,0.667083,3.039176e-02,0.667083,3.039176e-02,standartized_intra_balanced,
1,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,wisdm,wisdm,KNN-5,0.541667,1.110223e-16,0.540794,0.000000e+00,0.540794,0.000000e+00,standartized_intra_balanced,
2,reducer_comb_1,execution1,standartized_intra_balanced_08609,0,1495,120,"gyro-x, gyro-y, gyro-z",train,sensor,,,umap-7,7,,wisdm,wisdm,SVM-rbf-C1.0,0.583333,1.110223e-16,0.521068,0.000000e+00,0.521068,0.000000e+00,standartized_intra_balanced,
3,reducer_comb_1,execution1,standartized_intra_balanced_00845,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,MinMaxScaler,umap-2,2,,kuhar,kuhar,randomforest-100,0.800694,1.812221e-02,0.795044,1.921549e-02,0.795044,1.921549e-02,standartized_intra_balanced,
4,reducer_comb_1,execution1,standartized_intra_balanced_00845,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,sensor,,MinMaxScaler,umap-2,2,,kuhar,kuhar,KNN-5,0.493056,5.551115e-17,0.482479,0.000000e+00,0.482479,5.551115e-17,standartized_intra_balanced,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63172,reducer_comb_1,execution1,raw_balanced_02926,3978,3978,1062,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,FFT-centered,MinMaxScaler,umap-20,20,motionsense,motionsense,motionsense,KNN-5,0.883239,1.110223e-16,0.883064,1.110223e-16,0.883064,0.000000e+00,raw_balanced,3678.0
63173,reducer_comb_1,execution1,raw_balanced_02926,3978,3978,1062,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,FFT-centered,MinMaxScaler,umap-20,20,motionsense,motionsense,motionsense,SVM-rbf-C1.0,0.870998,1.110223e-16,0.871440,1.110223e-16,0.871440,1.110223e-16,raw_balanced,3679.0
63174,reducer_comb_1,execution1,standartized_intra_balanced_05531,0,1794,144,"accel-x, accel-y, accel-z",train,sensor,,StandardScaler,umap-5,5,,motionsense,motionsense,randomforest-100,0.798611,1.889093e-02,0.796915,1.948198e-02,0.796915,1.948198e-02,standartized_intra_balanced,
63175,reducer_comb_1,execution1,standartized_intra_balanced_05531,0,1794,144,"accel-x, accel-y, accel-z",train,sensor,,StandardScaler,umap-5,5,,motionsense,motionsense,KNN-5,0.562500,0.000000e+00,0.546329,0.000000e+00,0.546329,0.000000e+00,standartized_intra_balanced,


In [7]:
# Inspect which columns are available in `results`.
results.columns

Index(['experiment_name', 'run_name', 'config_id', 'reduce_size', 'train_size', 'test_size', 'in_use_features', 'scale_on', 'reduce_on', 'transforms', 'scaler', 'reducer', 'umap components', 'reducer_datasets', 'train_datasets', 'test_datasets', 'estimator', 'accuracy (mean)', 'accuracy (std)', 'f1-score macro (mean)', 'f1-score macro (std)', 'f1-score weighted (mean)', 'f1-score weighted (std)', 'view', 'config_group'], dtype='object')

## Pergunta 1: Qual é o impacto do UMAP na capacidade de discriminação dos modelos de ML na tarefa de HAR?

1. O desempenho dos 3 modelos de ML com o experimento realizado com e sem o UMAP
2. O impacto da normatização no resultado

**OBS**: 
- Adicionar dimensões alvo (n/2) e n
- Colocar o resultado do 0 para o final
- Gráfico com a média de todos os gráficos (juntar diferentes datasets)
- Adicionar resultado sem UMAP para gráfico de comparação


In [8]:
from IPython.display import display, HTML
import plotly.graph_objects as go


def only_standardized_view(df):
    """Keep the rows belonging to the "standartized_intra_balanced" view."""
    in_view = df["view"] == "standartized_intra_balanced"
    return df[in_view]


def only_raw_view(df):
    """Keep the rows belonging to the "raw_balanced" view."""
    in_view = df["view"] == "raw_balanced"
    return df[in_view]


def improvement_over_baseline(df):
    """Plot accuracy against the number of UMAP components, one figure per dataset pair.

    The frame is narrowed to: reduce_on == "all", all six features, no
    scaler, reducer datasets equal to the train datasets (or no reducer) and
    the standardized view. For each (train, test) pair a figure is shown with
    one line per (estimator, transform) combination.

    Returns:
        The filtered frame that was plotted.
    """
    selected = (
        df.pipe(only_reduce_on_all)
        .pipe(using_all_features)
        .pipe(no_scaler)
        .pipe(only_reducer_equals_train_or_no_reduce)
        .pipe(only_standardized_view)
    )

    for (train, _test), per_dataset in selected.groupby(
        ["train_datasets", "test_datasets"]
    ):
        fig = go.Figure()
        for (estimator, transform), curve in per_dataset.groupby(
            ["estimator", "transforms"]
        ):
            # Keep rows that used a reducer, plus rows with 0 components.
            has_reducer = curve["reducer_datasets"] != ""
            zero_components = curve["umap components"] == 0
            curve = curve[has_reducer | zero_components].sort_values(
                "umap components"
            )
            transform = transform or "no transform"
            fig.add_trace(
                go.Scatter(
                    x=curve["umap components"],
                    y=curve["accuracy (mean)"] * 100,
                    mode="lines+markers",
                    name=f"{estimator} {transform}",
                    legendgroup=transform,
                )
            )

        fig.update_layout(
            title=f"{train}",
            xaxis_title="UMAP components",
            yaxis_title="Accuracy (mean 10 runs)",
            yaxis_range=[0, 100],
            xaxis=dict(tickmode="linear"),
            autosize=True,
            width=1200,
            height=600,
        )
        fig.show()

    return selected


# Show the figures; the returned filtered frame is rendered as the cell output.
improvement_over_baseline(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


Unnamed: 0,experiment_name,run_name,config_id,reduce_size,train_size,test_size,in_use_features,scale_on,reduce_on,transforms,scaler,reducer,umap components,reducer_datasets,train_datasets,test_datasets,estimator,accuracy (mean),accuracy (std),f1-score macro (mean),f1-score macro (std),f1-score weighted (mean),f1-score weighted (std),view,config_group
639,reducer_comb_1,execution1,standartized_intra_balanced_16477,1794,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-20,20,motionsense,motionsense,motionsense,randomforest-100,0.629167,1.250000e-02,0.620857,1.214242e-02,0.620857,1.214242e-02,standartized_intra_balanced,3020.0
640,reducer_comb_1,execution1,standartized_intra_balanced_16477,1794,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-20,20,motionsense,motionsense,motionsense,KNN-5,0.652778,0.000000e+00,0.643501,0.000000e+00,0.643501,0.000000e+00,standartized_intra_balanced,3018.0
641,reducer_comb_1,execution1,standartized_intra_balanced_16477,1794,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-20,20,motionsense,motionsense,motionsense,SVM-rbf-C1.0,0.652778,0.000000e+00,0.617080,0.000000e+00,0.617080,0.000000e+00,standartized_intra_balanced,3019.0
810,reducer_comb_1,execution1,standartized_intra_balanced_08387,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-7,7,,realworld,realworld,randomforest-100,0.555556,3.011039e-02,0.560679,2.978170e-02,0.560679,2.978170e-02,standartized_intra_balanced,
811,reducer_comb_1,execution1,standartized_intra_balanced_08387,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-7,7,,realworld,realworld,KNN-5,0.250000,0.000000e+00,0.201620,2.775558e-17,0.201620,2.775558e-17,standartized_intra_balanced,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63043,reducer_comb_1,execution1,standartized_intra_balanced_11624,1794,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-9,9,realworld,realworld,realworld,KNN-5,0.340278,0.000000e+00,0.323675,0.000000e+00,0.323675,0.000000e+00,standartized_intra_balanced,3126.0
63044,reducer_comb_1,execution1,standartized_intra_balanced_11624,1794,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-9,9,realworld,realworld,realworld,SVM-rbf-C1.0,0.291667,5.551115e-17,0.267821,5.551115e-17,0.267821,5.551115e-17,standartized_intra_balanced,3127.0
63105,reducer_comb_1,execution1,standartized_intra_balanced_03515,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-4,4,,kuhar,kuhar,randomforest-100,0.797917,2.116632e-02,0.792412,2.307571e-02,0.792412,2.307571e-02,standartized_intra_balanced,
63106,reducer_comb_1,execution1,standartized_intra_balanced_03515,0,1794,144,"accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z",train,all,,,umap-4,4,,kuhar,kuhar,KNN-5,0.493056,5.551115e-17,0.469367,5.551115e-17,0.469367,0.000000e+00,standartized_intra_balanced,


In [9]:
from IPython.display import display, HTML
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# NOTE: same definition as the `only_standardized_view` in the earlier cell.
def only_standardized_view(df):
    """Keep the rows belonging to the "standartized_intra_balanced" view."""
    in_view = df["view"] == "standartized_intra_balanced"
    return df[in_view]


# NOTE: same definition as the `only_raw_view` in the earlier cell.
def only_raw_view(df):
    """Keep the rows belonging to the "raw_balanced" view."""
    in_view = df["view"] == "raw_balanced"
    return df[in_view]


def improvement_raw_standardized(df):
    """Compare the raw and standardized views of matching configurations.

    The frame is narrowed to: reduce_on == "all", all six features, no
    scaler, and reducer datasets equal to the train datasets (or no
    reducer). Raw and standardized rows are then joined on ``config_group``
    and, for every (train, test) pair, a three panel figure is shown: raw
    accuracy, standardized accuracy and their difference, with one line per
    (estimator, transform) combination.

    Returns:
        The merged frame. Raw-view columns keep their names, standardized
        columns carry the ``_standardized`` suffix, and ``diff`` is
        standardized minus raw mean accuracy.
    """
    df = only_reduce_on_all(df)
    df = using_all_features(df)
    df = no_scaler(df)
    df = only_reducer_equals_train_or_no_reduce(df)
    # pandas matches NaN join keys with each other, so rows without a
    # config_group would otherwise be cross-joined into bogus pairs.
    df = df.dropna(subset=["config_group"])

    raw_view = only_raw_view(df)
    standard_df = only_standardized_view(df)
    merged_df = raw_view.merge(
        standard_df, on=["config_group"], suffixes=("", "_standardized")
    )
    merged_df["diff"] = (
        merged_df["accuracy (mean)_standardized"] - merged_df["accuracy (mean)"]
    )

    colors = list(px.colors.qualitative.Plotly)
    # (column to plot, subplot column) for the three panels.
    panels = (
        ("accuracy (mean)", 1),
        ("accuracy (mean)_standardized", 2),
        ("diff", 3),
    )

    for (train, _test), subdf in merged_df.groupby(
        ["train_datasets", "test_datasets"]
    ):
        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=(
                "raw",
                "standartized",
                "difference (y = standartized - raw)",
            ),
        )
        for i, ((estimator, transform), subsubdf) in enumerate(
            subdf.groupby(["estimator", "transforms"])
        ):
            # Keep rows that used a reducer, plus rows with 0 components.
            subsubdf = subsubdf[
                (subsubdf["reducer_datasets"] != "")
                | (subsubdf["umap components"] == 0)
            ].sort_values("umap components")
            label = f"{estimator} {transform or 'no transform'}"
            # Wrap around instead of failing when there are more lines than colors.
            color = colors[i % len(colors)]
            for column, col_no in panels:
                fig.add_trace(
                    go.Scatter(
                        x=subsubdf["umap components"],
                        y=subsubdf[column] * 100,
                        mode="lines+markers",
                        name=label,
                        legendgroup=label,
                        # One legend entry per line; the group toggles all panels.
                        showlegend=(col_no == 1),
                        line=dict(color=color, width=1),
                    ),
                    row=1,
                    col=col_no,
                )

        # Layout is independent of the lines, so it is set once per figure.
        for axis_no in (1, 2, 3):
            fig["layout"][f"xaxis{axis_no}"].update(
                title_text="UMAP components", tickmode="linear"
            )
        fig["layout"]["yaxis1"].update(title_text="Accuracy (%)", range=[0, 100])
        fig["layout"]["yaxis2"].update(title_text="Accuracy (%)", range=[0, 100])
        fig["layout"]["yaxis3"].update(title_text="Difference")

        fig.update_layout(
            autosize=True,
            width=2400,
            height=600,
            title=f"Dataset: {train}",
        )
        fig.show()

    return merged_df


# Show the figures; the merged frame is bound to `_` so it is not rendered.
_ = improvement_raw_standardized(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


## Pergunta 2: Como a escolha do dataset no treinamento do UMAP afeta os resultados da tarefa?

In [10]:
# def only_with_reduce(df):
#     return df[(df["reducer_datasets"] != "") & (df["umap components"] != 0)]

# only_with_reduce(results.copy())

## Pergunta 3: É possível usar Manifold Learning para reduzir o domain shift?

## Pergunta 4: Qual a importância da normatização nos experimentos inter-dataset

## Pergunta 5: É mais vantajoso reduzir a dimensionalidade dos dados dos sensores de forma individual (aplicando o UMAP a um sensor, ou eixo, por vez) ou de forma agregada (como estamos fazendo)?

Qual abordagem é a mais interessante para a generalização de um dataset fonte para um dataset alvo? 
1. Espaço latente unificado partindo dos dados de sensores concatenados;
2. União de espaços latentes gerados separadamente por sensor;
3. União dos espaços latentes gerados por eixo e por sensor (ou seja, 6 projeções UMAP independentes).

**OBS**:
- Ajustar a diferença para comparar com o número final de features

In [11]:
from IPython.display import display, HTML
import plotly.graph_objects as go
# subplots plotly
from plotly.subplots import make_subplots 


# NOTE: same definition as the `only_standardized_view` in the earlier cells.
def only_standardized_view(df):
    """Keep the rows belonging to the "standartized_intra_balanced" view."""
    in_view = df["view"] == "standartized_intra_balanced"
    return df[in_view]



def improvement_over_umap_all(df):
    """Compare reducing on all data with reducing per sensor and per axis.

    The frame is narrowed to: all six features, no scaler, reducer datasets
    equal to the train datasets, the standardized view and FFT transforms.
    The reduce_on == "all" rows are joined with the "sensor" rows and with
    the "axis" rows on the remaining configuration columns. For each
    comparison and each (train, test) pair a three panel figure is shown:
    "all" accuracy, the other accuracy and their difference, with one line
    per estimator.

    Returns:
        ``(merged_df_sensor, merged_df_axis)``. Columns from the "all" side
        carry the ``_all`` suffix, the other side ``_other``; ``diff`` is
        other minus all.
    """
    base = (
        df.pipe(using_all_features)
        .pipe(no_scaler)
        .pipe(only_reducer_equals_train)
        .pipe(only_standardized_view)
        .pipe(only_fft)
    )
    umap_all = only_reduce_on_all(base.copy())

    # `reduce_on` is deliberately not a join key: it differs between sides.
    merge_feat = [
        "in_use_features",
        "scale_on",
        "transforms",
        "scaler",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

    def compare_with(other):
        # NOTE(review): dropna() removes every row holding a NaN in any
        # column (not just the accuracies) -- confirm this is intended.
        merged = umap_all.merge(
            other, on=merge_feat, suffixes=("_all", "_other")
        ).dropna()
        merged["diff"] = (
            merged["accuracy (mean)_other"] - merged["accuracy (mean)_all"]
        )
        return merged

    merged_df_sensor = compare_with(only_reduce_on_sensor(base.copy()))
    merged_df_axis = compare_with(only_reduce_on_axis(base.copy()))

    # First ten colors of the default Plotly qualitative palette.
    colors = list(px.colors.qualitative.Plotly[:10])
    panels = (
        ("accuracy (mean)_all", 1),
        ("accuracy (mean)_other", 2),
        ("diff", 3),
    )

    for merged, name in ((merged_df_sensor, "sensor"), (merged_df_axis, "axis")):
        for (train, _test), per_dataset in merged.groupby(
            ["train_datasets", "test_datasets"]
        ):
            fig = make_subplots(
                rows=1,
                cols=3,
                subplot_titles=("all", name, f"difference (y = {name} - all)"),
            )
            for i, (estimator, curve) in enumerate(per_dataset.groupby("estimator")):
                keep = (curve["reducer_datasets"] != "") | (
                    curve["umap components"] == 0
                )
                curve = curve[keep].sort_values("umap components")
                for column, col_no in panels:
                    fig.add_trace(
                        go.Scatter(
                            x=curve["umap components"],
                            y=curve[column] * 100,
                            mode="lines+markers",
                            name=f"{estimator} FFT",
                            legendgroup=estimator,
                            line=dict(color=colors[i], width=1),
                        ),
                        row=1,
                        col=col_no,
                    )

            for axis_no in (1, 2, 3):
                fig["layout"][f"xaxis{axis_no}"].update(
                    title_text="UMAP components", tickmode="linear"
                )
            fig["layout"]["yaxis1"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis2"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis3"].update(title_text="Difference")

            fig.update_layout(
                autosize=True,
                width=2400,
                height=600,
                title=f"Dataset: {train}",
            )
            fig.show()

    return merged_df_sensor, merged_df_axis


# Show the figures; the two merged frames are bound to `_` so they are not rendered.
_= improvement_over_umap_all(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


## Pergunta 6: Há uma mudança significativa entre os diferentes classificadores?




In [12]:
# Mean accuracy per estimator, best first. The column is selected before
# aggregating so the non-numeric columns are never averaged.
results.groupby("estimator")["accuracy (mean)"].mean().sort_values(ascending=False)


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



estimator
randomforest-100    0.681847
SVM-rbf-C1.0        0.650984
KNN-5               0.645637
Name: accuracy (mean), dtype: float64

## Pergunta 7: Melhor no domínio da frequência ou do tempo (condicionada à abordagem - manifold learning com UMAP)? (ou wavelet?)

In [13]:
# Mean accuracy per transform, best first; an empty transform is labelled
# "no transform". The column is selected before aggregating so the
# non-numeric columns are never averaged.
x = results.copy()
x.loc[x["transforms"] == "", "transforms"] = "no transform"
x.groupby("transforms")["accuracy (mean)"].mean().sort_values(ascending=False)


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



transforms
FFT-centered    0.724967
no transform    0.594006
Name: accuracy (mean), dtype: float64

## Pergunta 8: Qual dataset (ou subconjunto de datasets) é mais propício para generalizar o reconhecimento de HAR?

## Pergunta 9: Normalização traz mais benefícios?

**OBS**:
- Reportar média e std dev dos resultados antes e depois da normalização

In [14]:
from IPython.display import display, HTML
import plotly.graph_objects as go
# subplots plotly
from plotly.subplots import make_subplots 


def only_minmax_scaler(df):
    """Keep the rows whose ``scaler`` value is exactly "MinMaxScaler"."""
    selected = df["scaler"] == "MinMaxScaler"
    return df[selected]

def only_std_scaler(df):
    """Keep the rows whose ``scaler`` value is exactly "StandardScaler"."""
    selected = df["scaler"] == "StandardScaler"
    return df[selected]

def improvement_over_umap_scaler(df):
    """Compare running without a scaler against MinMaxScaler and StandardScaler.

    The frame is narrowed to: all six features, reducer datasets equal to
    the train datasets, the standardized view, FFT transforms and
    reduce_on == "all". The rows without a scaler are joined with the
    MinMaxScaler rows and with the StandardScaler rows on the remaining
    configuration columns. For each comparison and each (train, test) pair a
    three panel figure is shown: accuracy without scaler, accuracy with the
    scaler and their difference, with one line per estimator.

    Returns:
        ``(merged_df_minmax, merged_df_std)``. Columns from the no-scaler
        side carry the ``_all`` suffix (name kept for compatibility), the
        scaled side ``_other``; ``diff`` is scaled minus unscaled.
    """
    df = using_all_features(df)
    df = only_reducer_equals_train(df)
    df = only_standardized_view(df)
    df = only_fft(df)
    df = only_reduce_on_all(df)
    umap_no_scaler = no_scaler(df)

    # `scaler` is deliberately not a join key: it differs between sides.
    merge_feat = [
        "scale_on",
        "reduce_on",
        "transforms",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

    def compare_with(scaled):
        # NOTE(review): dropna() removes every row holding a NaN in any
        # column (not just the accuracies) -- confirm this is intended.
        merged = umap_no_scaler.merge(
            scaled, on=merge_feat, suffixes=("_all", "_other")
        ).dropna()
        merged["diff"] = (
            merged["accuracy (mean)_other"] - merged["accuracy (mean)_all"]
        )
        return merged

    merged_df_minmax = compare_with(only_minmax_scaler(df))
    merged_df_std = compare_with(only_std_scaler(df))

    # First ten colors of the default Plotly qualitative palette.
    colors = list(px.colors.qualitative.Plotly[:10])
    panels = (
        ("accuracy (mean)_all", 1),
        ("accuracy (mean)_other", 2),
        ("diff", 3),
    )

    for the_df, name in [(merged_df_minmax, "minmax"), (merged_df_std, "z-norm")]:
        for (train, _test), subdf in the_df.groupby(
            ["train_datasets", "test_datasets"]
        ):
            fig = make_subplots(
                rows=1,
                cols=3,
                # The first panel is the unscaled baseline, so label it as such
                # (it used to read "all", left over from another comparison).
                subplot_titles=(
                    "no scaler",
                    name,
                    f"difference (y = {name} - no scaler)",
                ),
            )
            for i, (estimator, subsubdf) in enumerate(subdf.groupby("estimator")):
                subsubdf = subsubdf[
                    (subsubdf["reducer_datasets"] != "")
                    | (subsubdf["umap components"] == 0)
                ].sort_values("umap components")
                for column, col_no in panels:
                    fig.add_trace(
                        go.Scatter(
                            x=subsubdf["umap components"],
                            y=subsubdf[column] * 100,
                            mode="lines+markers",
                            name=f"{estimator} FFT",
                            legendgroup=estimator,
                            line=dict(color=colors[i], width=1),
                        ),
                        row=1,
                        col=col_no,
                    )

            for axis_no in (1, 2, 3):
                fig["layout"][f"xaxis{axis_no}"].update(
                    title_text="UMAP components", tickmode="linear"
                )
            fig["layout"]["yaxis1"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis2"].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis3"].update(title_text="Difference")

            fig.update_layout(
                autosize=True,
                width=2400,
                height=600,
                title=f"Dataset: {train}",
            )
            fig.show()

    return merged_df_minmax, merged_df_std


# Show the figures; the two merged frames are bound to `_` so they are not rendered.
_ = improvement_over_umap_scaler(results.copy())


# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df


## Pergunta 10: Quais os sensores que ajudam mais: acelerômetro, giroscópio, ambos?

In [15]:
from IPython.display import display, HTML
import plotly.graph_objects as go
# subplots plotly
from plotly.subplots import make_subplots 


def only_standardized_view(df):
    """Return the rows of ``df`` whose ``view`` is ``"standartized_intra_balanced"``.

    The spelling "standartized" is the literal value compared against, so it
    has to match the data exactly.
    """
    is_standardized_view = df["view"] == "standartized_intra_balanced"
    return df.loc[is_standardized_view]



def improvement_over_umap_all_features(df):
    """Compare runs that use all sensor features with accel-only and gyro-only runs.

    The input is narrowed with ``no_scaler``, ``only_reducer_equals_train``,
    ``only_standardized_view``, ``only_fft`` and ``only_reduce_on_all``. The
    rows selected by ``using_all_features`` are then inner-joined, on the shared
    experiment-configuration columns, with the rows selected by
    ``using_only_accel`` and, separately, by ``using_only_gyro``.

    For every (train_datasets, test_datasets) group of each joined frame a
    three-panel Plotly figure is shown: accuracy with all features, accuracy
    with the single sensor, and their difference, each plotted against
    "umap components" with one line per estimator.

    Args:
        df: Results frame. It must contain the columns read by the filters
            listed above, every column in ``merge_feat`` and
            ``"accuracy (mean)"``.

    Returns:
        Tuple ``(merged_df_accel, merged_df_gyro)``. Non-key columns carry the
        suffix ``_all`` (all features) or ``_other`` (single sensor), and
        ``diff`` is ``accuracy (mean)_other - accuracy (mean)_all``.
    """
    df = no_scaler(df)
    df = only_reducer_equals_train(df)
    df = only_standardized_view(df)
    df = only_fft(df)
    df = only_reduce_on_all(df)

    # The feature filters select rows by boolean mask and do not modify their
    # input, so no defensive copies are needed here.
    umap_all = using_all_features(df)
    umap_accel = using_only_accel(df)
    umap_gyro = using_only_gyro(df)

    # Columns that identify one experiment configuration; used as join keys.
    merge_feat = [
        "scale_on",
        "reduce_on",
        "transforms",
        "scaler",
        "reducer",
        "umap components",
        "reducer_datasets",
        "train_datasets",
        "test_datasets",
        "estimator",
    ]

    def _merge_with_all(single_sensor_df):
        # Pair each all-features row with the matching single-sensor row and
        # compute the accuracy change (single sensor minus all features).
        merged = umap_all.merge(
            single_sensor_df, on=merge_feat, suffixes=("_all", "_other")
        ).dropna()
        merged["diff"] = merged["accuracy (mean)_other"] - merged["accuracy (mean)_all"]
        return merged

    merged_df_accel = _merge_with_all(umap_accel)
    merged_df_gyro = _merge_with_all(umap_gyro)

    # Default qualitative Plotly palette (10 colours).
    colors = list(px.colors.qualitative.Plotly)

    # (column to plot, subplot column) for the three panels of each figure.
    panels = [
        ("accuracy (mean)_all", 1),
        ("accuracy (mean)_other", 2),
        ("diff", 3),
    ]

    for the_df, name in [(merged_df_accel, "accel"), (merged_df_gyro, "gyro")]:
        # NOTE(review): figures are grouped per (train, test) pair but the title
        # only shows the train dataset — confirm that this is intended.
        for (train, _test), subdf in the_df.groupby(["train_datasets", "test_datasets"]):
            fig = make_subplots(
                rows=1,
                cols=3,
                subplot_titles=("all", name, f"difference (y = {name} - all)"),
            )
            for i, (estimator, subsubdf) in enumerate(subdf.groupby("estimator")):
                # Keep rows that either have a reducer dataset or use 0 UMAP
                # components.
                subsubdf = subsubdf[
                    ~(subsubdf["reducer_datasets"] == "")
                    | (subsubdf["umap components"] == 0)
                ]
                subsubdf = subsubdf.sort_values("umap components")

                # Cycle through the palette so more estimators than colours
                # does not raise IndexError.
                color = colors[i % len(colors)]
                # Only FFT rows reach this point (see only_fft above).
                trace_name = f"{estimator} FFT"

                for column, col in panels:
                    fig.add_trace(
                        go.Scatter(
                            x=subsubdf["umap components"],
                            y=subsubdf[column] * 100,
                            mode="lines+markers",
                            name=trace_name,
                            legendgroup=estimator,
                            # One legend entry per estimator; the legend group
                            # still toggles all three panels together.
                            showlegend=(col == 1),
                            line=dict(color=color, width=1),
                        ),
                        row=1,
                        col=col,
                    )

            for axis in ("xaxis1", "xaxis2", "xaxis3"):
                fig["layout"][axis].update(title_text="UMAP components", tickmode="linear")

            for axis in ("yaxis1", "yaxis2"):
                fig["layout"][axis].update(title_text="Accuracy (%)", range=[0, 100])
            fig["layout"]["yaxis3"].update(title_text="Difference")

            fig.update_layout(
                autosize=True,
                width=2400,
                height=600,
                title=f"Dataset: {train}",
            )

            fig.show()

    return merged_df_accel, merged_df_gyro


# Run the sensor comparison on a copy of `results`. The function shows its
# figures itself; the returned pair of frames is bound to `_` and not used.
_ = improvement_over_umap_all_features(results.copy())


# NOTE(review): the three commented lines below look like leftover scratch
# code (a `return` outside any function); they are never executed.
# df = df.copy()
# df["improvement"] = df["accuracy"] - df["baseline_accuracy"]
# return df
