In [1]:
import numpy as np
import pandas as pd
import pickle
import plotly.express as px
import plotly.graph_objects as go

# Análise das características dos datasets

## Características dos datasets

- KuHar:
 - Treino: 3114
 - Teste: 246
 - Classes: 
     - Sit
     - Stand
     - Walk
     - Stair up
     - Stair down
     - Run
 
- MotionSense:
 - Treino: 4092
 - Teste: 936
 - Classes: 
     - Sit
     - Stand
     - Walk
     - Stair up
     - Stair down
     - Run
 
- UCI:
 - Treino: 2169
 - Teste: 671
 - Classes: 
     - Sit
     - Stand
     - Walk
     - Stair up
     - Stair down
     
- WISDM:
 - Treino: 12305
 - Teste: 2891
 - Classes: 
     - Sit
     - Stand
     - Walk
     - Stair up and down

## Comentários

Analisando pelos conjuntos de treinos podemos ver que o WISDM é o dataset com maior número de amostras, sendo que, em termos de quantidade, os datasets ocupam $x \%$ do WISDM.

- KuHar: 25.3\%
- MotionSense: 33.25\%
- UCI: 17.63\%
- WISDM: 100\%

Essa última análise em termos de $\%$ nos revela que o conjunto de treino do WISDM tem mais do que o dobro de amostras de cada um dos outros datasets e, mais ainda, que ele tem mais amostras do que o KuHar, MotionSense e UCI juntos. Este fator nos leva a levantar algumas possíveis perguntas de pesquisa para analisar os resultados intra/inter dataset:

1. O desbalanceamento da quantidade de amostras de um dataset introduz um fator de viés por dataset nos experimentos?

2. O que melhora os resultados é a presença de um novo dataset ou o aumento de amostras para o treinamento do modelo?

3. O UMAP gera uma boa representação? Se sim, existe um conjunto mínimo de datasets que deixa essa representação boa?

4. A melhor maneira de generalizar é no domínio do tempo, frequência ou outro (eg. features handcrafted)?

# Carregando resultados

In [2]:
# Load the pickled experiment results and wrap them in a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code -- only open result files you trust.
# (The original cell also kept a commented-out alternative path: 'backup/df_results.pkl'.)
with open('df_results_fixed.pkl', 'rb') as results_file:
    df_dict = pickle.load(results_file)

# Cast the three descriptive columns to plain strings in one pass; later cells compare
# them against string literals and run substring tests on them.
df = pd.DataFrame(df_dict).astype({'Test': str, 'Train': str, 'Umap - 10': str})
df

FileNotFoundError: [Errno 2] No such file or directory: 'df_results_fixed.pkl'

## Substituindo os valores NaN por zero

In [None]:
# Replace every NaN in the results with 0.
# NOTE(review): presumably NaN marks a metric that could not be computed (e.g. a class that is
# missing from the test set); after this step such entries cannot be told apart from a real
# score of 0 in the statistics and boxplots below -- confirm this is intended.
df = df.fillna(0)

## Removendo os resultados que possuem o ExtraSensory no conjunto de treino

In [None]:
# Keep only RandomForest results whose training combination does not include ExtraSensory.
data = 'ExtraSensory'
# 'Train' was cast to str when the results were loaded, so a substring test is enough here.
train = [comb for comb in df['Train'].unique() if data not in comb]
# .copy() detaches the filtered frame from the original one, so that adding a column to `df`
# in a later cell does not raise pandas' SettingWithCopyWarning.
df = df.loc[(df['Train'].isin(train)) & (df['Classifier'] == 'RandomForest')].copy()
df['Train'].unique()

In [None]:
# Per-class f1-score columns that get averaged in the next cell; the trailing 'Test' column is
# carried along so each row's test dataset is available next to its scores.
_class_suffixes = ('sit', 'stand', 'walk', 'run')
y = [f'f1-score - mean - {suffix}' for suffix in _class_suffixes] + ['Test']

In [None]:
# Mean f1-score over the metric columns listed in `y` (all entries except the trailing 'Test').
# Rows whose test dataset is 'UCI' leave the last metric column (run) out of the mean.
# NOTE(review): the dataset summary at the top of the notebook lists no Run class for WISDM
# either -- confirm whether WISDM rows should skip the run column as well.
metric_cols = y[:-1]
metric_values = df[metric_cols].astype(float)
# skipna=False keeps NaN propagation identical to a plain np.mean over each row.
mean_all_metrics = metric_values.mean(axis=1, skipna=False)
mean_without_last = metric_values[metric_cols[:-1]].mean(axis=1, skipna=False)
# Vectorised replacement for the former row-by-row Python loop.
new_column = np.where(df['Test'] == 'UCI', mean_without_last, mean_all_metrics)
df['f1-score - mean - without stair up/down'] = new_column

In [None]:
# Display the results frame, which now includes the 'f1-score - mean - without stair up/down' column.
df

In [None]:
# Distinct values of the 'Test' column; the plotting cells below iterate over them.
datasets = df['Test'].unique()
datasets

In [None]:
def boxplot(df, title, y):
    """Show one box trace per metric column on a single figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``y``.
    title : str
        Title of the figure.
    y : iterable of str
        Column names; each one becomes a box labelled with the column name.

    The y axis is fixed to the [0, 1] interval and the figure is displayed with
    ``fig.show()``; nothing is returned.
    """
    traces = [go.Box(y=df[column], name=column) for column in y]
    fig = go.Figure(data=traces)
    fig.update_layout(title=title)
    fig.update_yaxes(range=[0., 1.])
    fig.show()

In [None]:
# Metric columns drawn by the boxplot cells: two overall scores followed by the per-class
# mean f1-scores (the last one is the derived "without stair up/down" mean).
_f1_suffixes = (
    'sit',
    'stand',
    'walk',
    'stair up',
    'stair down',
    'run',
    'stair up and down',
    'without stair up/down',
)
y = ['accuracy - mean', 'f1 score (weighted) - mean']
y.extend(f'f1-score - mean - {suffix}' for suffix in _f1_suffixes)


## Obs: 

- The charts below show the distributions of the mean f1-score (overall and per class) with/without each test dataset in the training data. Note that a metric can only be computed for classes that appear in the test data.

In [None]:
x = 'Test'  # not read anywhere in this cell
train_combinations = df['Train'].unique()
for data in datasets:
    # Training combinations that contain the dataset being tested.
    train = [combo for combo in train_combinations if data in combo]
    tested_on_data = df['Test'] == data
    new_df = df.loc[tested_on_data & df['Train'].isin(train)]
    title = f'Boxplot from {data} test data with {data} train data - Classifier: Random Forest'
    boxplot(new_df, title, y)

In [None]:
# Inspect the last metric in `y` for results tested on UCI without UCI in the training data,
# then list the rows that fall outside the interquartile range.
column = y[-1]
data = 'UCI'

x = 'Test'  # not read anywhere in this cell
# Training combinations that do NOT contain the test dataset.
train = [elem for elem in df['Train'].unique() if data not in elem]
new_df = df.loc[(df['Test'] == data) & (df['Train'].isin(train))]
# Fixed: the title used to say "with {data} train data" although the filter above excludes it.
title = f'Boxplot from {data} test data without {data} train data - Classifier: Random Forest'
boxplot(new_df, title, [column])

# Quartiles of the selected metric. Series.quantile replaces np.percentile(..., interpolation=...),
# whose `interpolation` keyword is deprecated in NumPy in favour of `method`.
Q1 = new_df[column].quantile(0.25, interpolation='midpoint')
Q3 = new_df[column].quantile(0.75, interpolation='midpoint')

# Keep only the rows at or below Q1 or at or above Q3, sorted by the same metric.
new_df = new_df.loc[(new_df[column] >= Q3) | (new_df[column] <= Q1)]
new_df.sort_values(by=column)

In [None]:
# Build `df_filtered`: for every test dataset keep only the results whose training
# combination does not contain that dataset.
# The per-dataset pieces are collected first and concatenated once; concatenating onto an
# empty seed frame inside the loop copies the data repeatedly and relies on pandas'
# deprecated handling of empty frames when inferring the result dtypes.
frames = []
for data in datasets:
    train = [comb for comb in df['Train'].unique() if data not in comb]
    frames.append(df.loc[(df['Test'] == data) & (df['Train'].isin(train))])
# With no datasets at all, fall back to an empty frame that keeps df's columns.
df_filtered = pd.concat(frames) if frames else df.iloc[0:0].copy()
df_filtered

In [None]:
x = 'Test'  # not read anywhere in this cell
train_combinations = df['Train'].unique()
for data in datasets:
    # Training combinations that leave out the dataset being tested.
    train = [combo for combo in train_combinations if data not in combo]
    tested_on_data = df['Test'] == data
    new_df = df.loc[tested_on_data & df['Train'].isin(train)]
    title = f'Boxplot from {data} test data without {data} train data - Classifier: Random Forest'
    boxplot(new_df, title, y)

In [None]:
x = 'Test'  # not read anywhere in this cell
filtered_combinations = df_filtered['Train'].unique()
for data in datasets:
    # All rows of df_filtered whose training combination contains `data`, whatever the test set.
    train = [combo for combo in filtered_combinations if data in combo]
    trained_with_data = df_filtered['Train'].isin(train)
    new_df = df_filtered.loc[trained_with_data]
    title = f'Boxplot from {data} in train data'
    boxplot(new_df, title, y)

In [None]:
x = 'Test'  # not read anywhere in this cell

# One figure per ordered pair (test dataset, different dataset present in the training data).
for data_test in datasets:
    tested_on = df_filtered['Test'] == data_test
    for data in datasets:
        if data == data_test:
            continue
        train = [combo for combo in df_filtered['Train'].unique() if data in combo]
        new_df = df_filtered.loc[df_filtered['Train'].isin(train) & tested_on]
        title = f'Boxplot from {data} in train data and {data_test} in test data'
        boxplot(new_df, title, y)

In [None]:
# Rows of df_filtered tested on MotionSense whose training combination contains UCI,
# ordered by the "without stair up/down" mean f1-score.
data = 'UCI'
data_test = 'MotionSense'
train = [combo for combo in df_filtered['Train'].unique() if data in combo]
trained_with_data = df_filtered['Train'].isin(train)
tested_on = df_filtered['Test'] == data_test
new_df = df_filtered.loc[trained_with_data & tested_on]
new_df.sort_values(by='f1-score - mean - without stair up/down')

In [None]:
# new_df.sort_values(by='f1-score - mean - without stair up/down')

## Conclusions

1. When the test dataset does not appear in the training data, the score goes down.
2. The stair up/down classes sometimes have a terrible score (probably when WISDM appears in the training data).
3. The best score when the dataset appears in both the training and test data was with MotionSense - 88.42\%.
4. The worst score when the dataset appears in both the training and test data was with WISDM - 69.80\%.
5. The best score when the dataset appears only in the test data was with UCI-HAR and WISDM - 61\%.
6. The worst score when the dataset appears only in the test data was with KuHar and UCI-HAR - 28\%.
7. Why is the run score zero in some tests on MotionSense? It probably happens when the run class is missing from the training data (i.e. the training data contains only UCI, or UCI and ExtraSensory).

## Verifying point 7 above

In [None]:
# RandomForest results whose run f1-score is exactly 0 and whose 'Umap - 10' entry is not
# the string "['-']".
zero_run = df['f1-score - mean - run'] == 0
is_random_forest = df['Classifier'] == 'RandomForest'
umap_not_dash = df['Umap - 10'] != "['-']"
new_df = df.loc[zero_run & is_random_forest & umap_not_dash]
# Show only the first 15 columns.
new_df.iloc[:, :15]

In [None]:
 # Distinct training combinations among the rows currently held in `new_df`.
 # NOTE(review): this cell starts with a stray leading space; IPython tolerates it, plain Python does not.
 new_df['Train'].unique()

## Questions

- Does it make sense to use f1-score or accuracy for cross-dataset evaluation? 
- Is the mean f1-score a good metric to evaluate the experiment? 
  - A: No, we need to look at the f1-score per class.

In [None]:
# RandomForest results tested on UCI with a sit f1-score below 0.5 and a 'Umap - 10' entry
# other than "['-']", restricted to the training combinations listed in `train`.
# NOTE(review): `train` is not assigned in this cell -- it is whatever an earlier cell left
# behind. When the notebook runs top to bottom, the last assignment is the list of
# df_filtered training combinations that contain 'UCI'. Define it here explicitly once the
# intended selection is confirmed.
new_df = df.loc[(df['Test'] == 'UCI') & (df['Classifier'] == 'RandomForest') & (df['Train'].isin(train)) & (df['Umap - 10'] != "['-']") & (df['f1-score - mean - sit'] < 0.5)]
new_df

In [None]:
# Distinct training combinations among the rows currently held in `new_df`.
new_df['Train'].unique()

In [None]:
# Minimum mean f1-score each class must reach for a result to be kept in df2.
min_f1_by_column = {
    'f1-score - mean - sit': 0.5,
    'f1-score - mean - stand': 0.27,
    'f1-score - mean - walk': 0.52,
    'f1-score - mean - stair up': 0.0,
    'f1-score - mean - stair down': 0.0,
    'f1-score - mean - run': 0.82,
}
# Start from the RandomForest results whose 'Umap - 10' entry is not the string "['-']",
# then apply every per-class threshold in turn.
keep = (df['Classifier'] == 'RandomForest') & (df['Umap - 10'] != "['-']")
for metric, floor in min_f1_by_column.items():
    keep = keep & (df[metric] >= floor)
df2 = df.loc[keep]
df2['Train'].unique()

In [None]:
x = 'Test'  # not read anywhere in this cell
df2_combinations = df2['Train'].unique()
for data in datasets:
    # Training combinations in df2 that leave out the dataset being tested.
    train = [combo for combo in df2_combinations if data not in combo]
    new_df = df2.loc[(df2['Test'] == data) & (df2['Train'].isin(train))]
    title = f'Boxplot from {data} test data and without {data} train data - Classifier: Random Forest'
    # Nothing to draw when no result passed the df2 thresholds for this dataset.
    if new_df.shape[0] == 0:
        print('Empty')
    else:
        boxplot(new_df, title, y)

In [None]:
# Same per-class thresholds as used for df2, additionally restricted to results tested on
# KuHar or MotionSense.
conditions = [
    df['Classifier'] == 'RandomForest',
    df['Umap - 10'] != "['-']",
    df['f1-score - mean - sit'] >= 0.5,
    df['f1-score - mean - stand'] >= 0.27,
    df['f1-score - mean - walk'] >= 0.52,
    df['f1-score - mean - stair up'] >= 0.0,
    df['f1-score - mean - stair down'] >= 0.0,
    df['f1-score - mean - run'] >= 0.82,
    df['Test'].isin(['KuHar', 'MotionSense']),
]
# AND all conditions together, in order.
selected = conditions[0]
for condition in conditions[1:]:
    selected = selected & condition
df3 = df.loc[selected]
df3['Train'].unique()

In [None]:
# Display the rows selected into df3.
df3

## New analysis

Now, let's remove ExtraSensory and WISDM from the training data and analyze the results without the test dataset in the training data.

In [None]:
x = 'Test'  # not read anywhere in this cell
for data in datasets:
    # Training combinations that contain neither the test dataset, nor ExtraSensory, nor WISDM.
    train = [elem for elem in df['Train'].unique() if (data not in elem) and ('ExtraSensory' not in elem) and ('WISDM' not in elem)]
    new_df = df.loc[(df['Test'] == data) & (df['Classifier'] == 'RandomForest') & (df['Train'].isin(train)) & (df['Umap - 10'] != "['-']")]
    # Fixed: the title also claimed that UCI was excluded, but the filter above never removes UCI.
    # NOTE(review): if UCI was meant to be excluded as well, extend the filter rather than the title.
    title = f'Boxplot from {data} test data and without {data}, ExtraSensory and WISDM on train data - Classifier: Random Forest'
    boxplot(new_df, title, y)