In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import signal
from pathlib import Path

from dataset_processor import (
    CalcTimeDiffMean,
    PlotDiffMean,
    ResamplerPoly,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    FilterByCommonRows,
    Pipeline
)

In [2]:
def read_kuhar(kuhar_dir_path: str) -> pd.DataFrame:
    """Read the raw KuHar dataset into a single DataFrame.

    Every ``*.csv`` file found recursively under ``kuhar_dir_path`` is read as
    a header-less file with eight columns, in this order: ``accel-start-time``,
    ``accel-x``, ``accel-y``, ``accel-z``, ``gyro-start-time``, ``gyro-x``,
    ``gyro-y``, ``gyro-z``. A file containing any NaN value is skipped
    entirely. Files are processed in sorted path order.

    The returned DataFrame has the following columns:

    - accel-x, accel-y, accel-z: accelerometer readings on each axis
    - gyro-x, gyro-y, gyro-z: gyroscope readings on each axis
    - accel-start-time, gyro-start-time: timestamps read from the CSV
    - accel-end-time, gyro-end-time: copies of the matching start-time column
      (each row is a single instant, so start and end coincide)
    - activity code: integer prefix of the parent folder name (e.g. ``5`` for
      a folder called ``5.Lay``)
    - index: row number of the sample inside its CSV file
    - user: integer taken from the first ``_``-separated field of the file name
    - serial: the file-name fields from the third one onwards, joined by ``_``
    - csv: ``"<parent folder>/<file name>"`` of the originating file

    The row index of the result is NOT reset: it restarts at 0 for every file.

    Parameters
    ----------
    kuhar_dir_path : str
        Path to the KuHar dataset directory (a ``pathlib.Path`` is accepted
        as well, since the value is wrapped in ``Path``).

    Returns
    -------
    pd.DataFrame
        All usable CSV files concatenated row-wise.

    Raises
    ------
    ValueError
        If no CSV file without missing values is found under
        ``kuhar_dir_path``, or if a folder / file name does not start with the
        expected integer prefix.
    """
    kuhar_dir_path = Path(kuhar_dir_path)

    # dtype of every column as stored in the CSV files; the key order is also
    # the column order used to parse the header-less files.
    feature_dtypes = {
        "accel-start-time": np.float32,
        "accel-x": np.float32,
        "accel-y": np.float32,
        "accel-z": np.float32,
        "gyro-start-time": np.float32,
        "gyro-x": np.float32,
        "gyro-y": np.float32,
        "gyro-z": np.float32,
    }

    # Output column order: sensor axes first, timestamps afterwards.
    column_order = [
        "accel-x",
        "accel-y",
        "accel-z",
        "gyro-x",
        "gyro-y",
        "gyro-z",
        "accel-start-time",
        "gyro-start-time",
    ]

    dfs = []
    for f in sorted(kuhar_dir_path.rglob("*.csv")):
        # The parent folder is named "<activity number>.<activity name>"
        # (e.g. "5.Lay"). Only the numeric prefix is needed, so split once and
        # tolerate activity names that themselves contain a dot.
        activity_no = int(f.parent.name.split(".", 1)[0])

        # The file stem is "<user>_<type>_<serial...>" (e.g. "1052_F_1").
        csv_splitted = f.stem.split("_")
        user = int(csv_splitted[0])
        serial = "_".join(csv_splitted[2:])

        df = pd.read_csv(f, names=list(feature_dtypes.keys()), dtype=feature_dtypes)

        # Skip the whole file as soon as it contains a missing value.
        if df.isnull().values.any():
            continue

        # Pure reordering: no column is dropped here.
        df = df[column_order]

        # ----- Auxiliary columns and metadata -----
        # A row is a single instant (no duration), so end time == start time.
        df["accel-end-time"] = df["accel-start-time"]
        df["gyro-end-time"] = df["gyro-start-time"]
        df["activity code"] = activity_no
        # Position of the sample inside its own CSV file.
        df["index"] = range(len(df))
        df["user"] = user
        df["serial"] = serial
        # Relative identifier "<folder>/<file>" of the source CSV.
        df["csv"] = "/".join(f.parts[-2:])
        dfs.append(df)

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # report the actual problem (wrong path or only unusable files) instead.
    if not dfs:
        raise ValueError(
            f"No usable CSV files (without missing values) found under {kuhar_dir_path}"
        )
    return pd.concat(dfs)

In [3]:
# Location of the raw KuHar data
kuhar_path = Path("../data/original/KuHar/1.Raw_time_domian_data")

# Columns used as features (three accelerometer axes, three gyroscope axes)
feature_columns = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]

# Column whose values delimit the groups that are processed independently
column_group = "csv"

# KuHar activity code -> standard activity code.
# All 18 KuHar codes (0..17) default to -1; only the codes listed explicitly
# below receive a standard code. Rows mapped to -1 are excluded before the
# train/test split later in this notebook.
standard_activity_code_map = {
    **dict.fromkeys(range(18), -1),
    0: 1,
    1: 0,
    11: 2,
    14: 5,
    15: 3,
    16: 4,
}


## Bruto

In [4]:
# Load every KuHar CSV into one DataFrame.
dataframe = read_kuhar(kuhar_path)

# Stage 1 - differences of "accel-start-time", taken within each CSV group and
# written to the new "timestamp diff" column.
# NOTE(review): no filter_predicate is passed. A predicate keeping only groups
# whose differences all lie strictly inside (-1, 1) was drafted here earlier
# and left disabled, as was a PlotDiffMean(column_to_plot="timestamp diff")
# plotting step.
differ = CalcTimeDiffMean(
    new_column_name="timestamp diff",
    column_to_diff="accel-start-time",
    groupby_column=column_group,
)

# Stage 2 - non-overlapping windows of 300 samples, built per CSV group from
# the feature columns.
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=300,
    samples_per_overlap=0,
)

# Stage 3 - add the standard activity code derived from the KuHar code.
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Chain the stages in order: time diff -> windows -> standard label.
pipeline = Pipeline([differ, windowizer, standard_label_adder])

# Run the pipeline and display the windowed frame.
new_df = pipeline(dataframe)
new_df

Executing CalcTimeDiffMean
Executing Windowize


Creating windows: 100%|██████████| 1944/1944 [01:43<00:00, 18.75it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,activity code,accel-end-time,gyro-start-time,gyro-end-time,index,serial,user,timestamp diff,window,standard activity code
0,-0.00028,-0.013218,-0.006041,0.019178,0.056363,0.05167,0.066037,0.084075,0.053171,0.031872,...,0,0.013,0.014,0.014,1,1,1001,0.01,0,1
1,0.065882,0.070706,0.079351,0.045129,-0.00106,-0.01858,-0.026813,-0.01128,-0.00422,-0.006679,...,0,3.022,3.023,3.023,301,1,1001,0.009,1,1
2,0.06039,0.05483,0.026566,-0.007787,-0.023151,-0.030435,0.002795,0.008467,0.009453,0.005837,...,0,6.022,6.023,6.023,601,1,1001,0.01,2,1
3,0.106681,0.109868,0.097632,0.062969,0.021348,-0.008206,-0.038906,-0.051849,-0.044724,0.006483,...,0,9.021,9.023,9.023,901,1,1001,0.01,3,1
4,-0.010397,0.005637,0.043228,0.04267,0.045788,0.046685,0.027399,0.022593,0.009374,0.004614,...,0,12.022,12.033,12.033,1201,1,1001,0.01,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19990,0.252262,0.501262,0.517139,0.919139,1.149139,0.927343,1.157343,1.348343,1.147385,1.549385,...,9,4.774,4.775,4.775,601,9,1101,0.001,2,-1
19991,-3.281132,-2.879132,-2.122985,-1.739985,-1.644985,-1.57197,-1.76397,-2.03197,-2.274742,-2.370742,...,9,7.149,7.147,7.147,901,9,1101,0.002,3,-1
19992,-0.797301,-0.261301,-1.367357,-1.424357,-1.175357,-0.973893,-0.839893,-0.839893,-1.026151,-1.294151,...,9,9.53,9.525,9.525,1201,9,1101,0.006,4,-1
19993,-5.619906,-5.275906,-4.109704,-3.496704,-2.998704,-2.067345,-1.799345,-1.857345,-2.296082,-4.000082,...,9,11.904,11.904,11.904,1501,9,1101,0.0,5,-1


## Normatizado com reamostrador ResamplerPoly

In [5]:
# Load every KuHar CSV into one DataFrame (fresh read for this pipeline).
dataframe = read_kuhar(kuhar_path)

# Stage 1 - differences of "accel-start-time", taken within each CSV group and
# written to the new "timestamp diff" column.
# NOTE(review): no filter_predicate is passed. A predicate keeping only groups
# whose differences all lie strictly inside (-1, 1) was drafted here earlier
# and left disabled.
differ = CalcTimeDiffMean(
    new_column_name="timestamp diff",
    column_to_diff="accel-start-time",
    groupby_column=column_group,
)

# Stage 2 - resample the feature columns of each CSV group.
# `up` and `down` are resampling FACTORS, not frequencies: the sample rate is
# scaled by up / down = 2 / 10 = 1 / 5. Assuming the original recordings are a
# constant 100 Hz, this yields 20 Hz.
# NOTE(review): presumably a wrapper around scipy.signal.resample_poly -
# confirm in dataset_processor.
resampler = ResamplerPoly(
    groupby_column=column_group,
    features_to_select=feature_columns,
    up=2,
    down=10,
)

# Stage 3 - non-overlapping windows of 60 samples, built per CSV group from
# the feature columns.
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=60,
    samples_per_overlap=0,
)

# Stage 4 - add the standard activity code derived from the KuHar code.
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Chain the stages in order: time diff -> resample -> windows -> standard label.
pipeline = Pipeline([differ, resampler, windowizer, standard_label_adder])

# Run the pipeline and display the resampled, windowed frame.
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing CalcTimeDiffMean
Executing ResamplerPoly


Resampling: 100%|██████████| 1944/1944 [00:06<00:00, 279.82it/s]


Executing Windowize


Creating windows: 100%|██████████| 1944/1944 [01:27<00:00, 22.19it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,accel-end-time,gyro-start-time,gyro-end-time,index,serial,user,timestamp diff,level_0,window,standard activity code
0,0.002471,0.050886,0.026327,-0.007957,0.047477,0.015004,0.010874,0.00485,0.023128,0.028366,...,0.013,0.014,0.014,1,1,1001,0.01,0,0,1
1,0.067428,-0.010694,0.017021,-0.008678,0.005025,0.050246,-0.014686,0.033231,-0.005797,0.00363,...,0.612,0.614,0.614,61,1,1001,0.008,60,1,1
2,0.037013,-0.018253,0.050906,0.000349,0.001087,0.047377,-0.022432,0.036873,-0.002844,0.031009,...,1.212,1.214,1.214,121,1,1001,0.009,120,2,1
3,0.098688,-0.010114,0.02515,0.051607,0.000195,0.080629,-0.033775,0.059053,0.026227,0.015119,...,1.812,1.813,1.813,181,1,1001,0.009,180,3,1
4,-0.013984,0.052472,-0.018912,0.047351,-0.003415,0.0093,0.035681,-0.000772,0.042241,-0.03637,...,2.412,2.424,2.424,241,1,1001,0.008,240,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20035,0.770076,0.902702,1.733595,-0.255494,-1.389281,-1.503035,-1.091561,-1.310683,-1.201414,-1.386815,...,0.966,0.966,0.966,121,9,1101,0.0,6295421,2,-1
20036,-3.034536,-1.737755,-2.05198,-0.29118,-0.023347,0.266129,0.91152,1.098911,1.325172,1.473018,...,1.442,1.441,1.441,181,9,1101,0.001,6295481,3,-1
20037,-0.35839,-1.138243,-1.262342,-1.078931,-1.196354,-1.047739,-0.07328,-0.557009,-0.526015,-1.305268,...,1.917,1.917,1.917,241,9,1101,0.0,6295541,4,-1
20038,-5.295989,-2.370364,-3.613883,-4.250051,-2.250864,-2.90727,-0.876931,-0.183237,-0.5119,-0.201191,...,2.393,2.392,2.392,301,9,1101,0.001,6295601,5,-1


## Filtra por elementos iguais

In [6]:
# Match rows of the raw and resampled frames on these identifying columns.
# NOTE(review): presumably only rows present in both frames are kept - confirm
# in dataset_processor.FilterByCommonRows.
filter_common = FilterByCommonRows(
    match_columns=[
        "user",
        "serial",
        "window",
        "activity code",
    ]
)
# Both frames are replaced by their filtered versions.
new_df, new_df_standartized = filter_common(
    new_df,
    new_df_standartized,
)

## Balanceia e salva os dados brutos

In [7]:
# Output roots for the three dataset flavours
output_path_balanced = Path("../data/raw_balanced")
output_path_balanced_standartized = Path("../data/standartized_balanced")
output_path_unbalanced = Path("../data/unbalanced")

# Persist the full (unbalanced) raw windows, creating the folder if needed
ouptut_dir = output_path_unbalanced / "KuHar"
ouptut_dir.mkdir(parents=True, exist_ok=True)
new_df.to_csv(ouptut_dir / "raw_unbalanced.csv", index=False)

# Split by user, dropping the windows whose standard activity code is -1:
# first 80% train / 20% test, then the train part again into 90% train / 10%
# validation.
labelled_df = new_df[new_df["standard activity code"] != -1]
train_test_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42,
)
train_df, test_df = train_test_splitter(labelled_df)

train_val_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42,
)
train_df, val_df = train_val_splitter(train_df)

# Balance each split independently on the standard activity code
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = (
    balancer(split) for split in (train_df, val_df, test_df)
)

# Write the three balanced splits
ouptut_dir = output_path_balanced / "KuHar"
ouptut_dir.mkdir(parents=True, exist_ok=True)

for file_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_df.to_csv(ouptut_dir / file_name, index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/raw_balanced/KuHar


## Balanceia e salva os dados processados

In [8]:
# Persist the full (unbalanced) resampled windows. The unbalanced/KuHar folder
# is created by the raw-data cell above.
new_df_standartized.to_csv(
    output_path_unbalanced / "KuHar" / "standartized_unbalanced.csv", index=False
)

# Split by user, dropping the windows whose standard activity code is -1:
# first 80% train / 20% test, then the train part again into 90% train / 10%
# validation.
labelled_standartized_df = new_df_standartized[
    new_df_standartized["standard activity code"] != -1
]
train_test_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42,
)
train_df, test_df = train_test_splitter(labelled_standartized_df)

train_val_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42,
)
train_df, val_df = train_val_splitter(train_df)

# Balance each split independently on the standard activity code
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = (
    balancer(split) for split in (train_df, val_df, test_df)
)

# Write the three balanced splits
ouptut_dir = output_path_balanced_standartized / "KuHar"
ouptut_dir.mkdir(parents=True, exist_ok=True)

for file_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_df.to_csv(ouptut_dir / file_name, index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/standartized_balanced/KuHar
