In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import signal
from pathlib import Path

from dataset_processor import (
    CalcTimeDiffMean,
    PlotDiffMean,
    Resampler,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    Pipeline
)

In [2]:
def read_kuhar(kuhar_dir_path: str) -> pd.DataFrame:
    """Read every CSV file of the KuHar dataset into a single DataFrame.

    The directory is searched recursively for ``*.csv`` files, which are
    processed in sorted path order. Each file is read without a header row and
    its eight columns are interpreted, in order, as ``accel-start-time``,
    ``accel-x``, ``accel-y``, ``accel-z``, ``gyro-start-time``, ``gyro-x``,
    ``gyro-y`` and ``gyro-z`` (all parsed as ``float32``).

    Metadata is derived from the file location:

    - the parent folder name is ``<activity number>.<activity name>``
      (e.g. ``5.Lay``); only the number before the first dot is used;
    - the file stem is ``<user>_<type>_<serial>`` (e.g. ``1052_F_1``); the
      first field is the user and everything after the second field is the
      serial.

    The returned DataFrame contains the following columns, in this order:

    - accel-x, accel-y, accel-z: accelerometer readings
    - gyro-x, gyro-y, gyro-z: gyroscope readings
    - accel-start-time, gyro-start-time: timestamps read from the CSV
    - accel-end-time, gyro-end-time: copies of the start times (each row is a
      single instant, so start and end coincide)
    - activity code: integer activity number taken from the folder name
    - index: position of the row inside its own CSV file (starting at 0)
    - user: integer user code taken from the file name
    - serial: serial of the recording taken from the file name (string)
    - csv: ``<folder>/<file name>`` of the originating CSV

    The row index of the result is not reset, so its labels restart at 0 for
    every file.

    Parameters
    ----------
    kuhar_dir_path : str
        Path to the root directory of the KuHar dataset.

    Returns
    -------
    pd.DataFrame
        All samples from all CSV files, concatenated.

    Raises
    ------
    ValueError
        If no CSV file is found under ``kuhar_dir_path``, or if a folder or
        file name does not start with an integer as described above.
    """
    kuhar_dir_path = Path(kuhar_dir_path)

    # Column names (in CSV order) and the dtype used to parse each of them
    feature_dtypes = {
        "accel-start-time": np.float32,
        "accel-x": np.float32,
        "accel-y": np.float32,
        "accel-z": np.float32,
        "gyro-start-time": np.float32,
        "gyro-x": np.float32,
        "gyro-y": np.float32,
        "gyro-z": np.float32,
    }

    # Output order of the columns read from the CSV (reordering only, nothing dropped)
    column_order = [
        "accel-x",
        "accel-y",
        "accel-z",
        "gyro-x",
        "gyro-y",
        "gyro-z",
        "accel-start-time",
        "gyro-start-time",
    ]

    frames = []
    for csv_path in sorted(kuhar_dir_path.rglob("*.csv")):
        # Activity number is the text before the first dot of the folder name
        # (e.g. "5.Lay" -> 5); splitting once tolerates dots in the activity name.
        activity_no = int(csv_path.parent.name.split(".", 1)[0])

        # File stem is <user>_<type>_<serial> (e.g. "1055_G_1")
        stem_fields = csv_path.stem.split("_")
        user = int(stem_fields[0])
        serial = "_".join(stem_fields[2:])

        samples = pd.read_csv(csv_path, names=list(feature_dtypes), dtype=feature_dtypes)
        samples = samples[column_order]

        # Auxiliary and metadata columns, appended in this exact order
        samples = samples.assign(
            **{
                # Each row is a single instant, so end time equals start time
                "accel-end-time": samples["accel-start-time"],
                "gyro-end-time": samples["gyro-start-time"],
                "activity code": activity_no,
                # Row number of the sample inside this CSV
                "index": range(len(samples)),
                "user": user,
                "serial": serial,
                # Folder and file name identify the recording
                "csv": "/".join(csv_path.parts[-2:]),
            }
        )
        frames.append(samples)

    if not frames:
        # pd.concat would fail with an unhelpful "No objects to concatenate"
        raise ValueError(f"No CSV files found under '{kuhar_dir_path}'")

    return pd.concat(frames)

In [3]:
# Location of the raw KuHar dataset
kuhar_path = Path("data/raw/KuHar/1.Raw_time_domian_data")

# Destination for the processed files; created (including parents) when missing
output_path = Path("data/processed/KuHar/")
output_path.mkdir(parents=True, exist_ok=True)

# Columns treated as features
feature_columns = [
    "accel-x", "accel-y", "accel-z",
    "gyro-x", "gyro-y", "gyro-z",
]

# Column used to group samples when building windows
column_group = "csv"

# Maps "activity code" -> "standard activity code".
# All 18 codes (0..17) default to -1; only the codes listed explicitly below
# receive a standard code. The split cells later drop rows mapped to -1.
standard_activity_code_map = {
    **dict.fromkeys(range(18), -1),
    0: 1,
    1: 0,
    11: 2,
    14: 5,
    15: 3,
    16: 4,
}


## Bruto

In [4]:
# Load all KuHar samples
dataframe = read_kuhar(kuhar_path)

# Step 1 - timestamp differences, computed per CSV group on "accel-start-time"
# and stored in "timestamp diff". The predicate is true only when every
# difference of a group lies strictly between -1 and 1 (presumably seconds;
# how failing groups are handled is decided inside CalcTimeDiffMean).
differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="accel-start-time",
    new_column_name="timestamp diff",
    filter_predicate=lambda group: (
        (group["timestamp diff"] > -1.0) & (group["timestamp diff"] < 1)
    ).all(),
)

# Optional: plot the timestamp differences (disabled)
# plotter = PlotDiffMean(column_to_plot="timestamp diff")

# Step 2 - windows of 300 samples with no overlap, built per CSV group from
# the feature columns
windowizer = Windowize(
    features_to_select=feature_columns,
    samples_per_window=300,
    samples_per_overlap=0,
    groupby_column=column_group,
)

# Step 3 - add the standard activity code using the mapping from the config cell
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Chain the three steps in order: diff -> windows -> standard label
pipeline = Pipeline([differ, windowizer, standard_label_adder])

# Run the pipeline, persist the (unbalanced) result and display it
new_df = pipeline(dataframe)
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)
new_df

Executing CalcTimeDiffMean
Executing Windowize


Creating windows: 100%|██████████| 1939/1939 [03:46<00:00,  8.55it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,activity code,user,serial,gyro-start-time,index,gyro-end-time,accel-start-time,accel-end-time,timestamp diff,standard activity code
0,-0.00028,-0.013218,-0.006041,0.019178,0.056363,0.05167,0.066037,0.084075,0.053171,0.031872,...,0,1001,1,0.014,1,0.014,0.013,0.013,0.01,1
1,0.065882,0.070706,0.079351,0.045129,-0.00106,-0.01858,-0.026813,-0.01128,-0.00422,-0.006679,...,0,1001,1,3.023,301,3.023,3.022,3.022,0.009,1
2,0.06039,0.05483,0.026566,-0.007787,-0.023151,-0.030435,0.002795,0.008467,0.009453,0.005837,...,0,1001,1,6.023,601,6.023,6.022,6.022,0.01,1
3,0.106681,0.109868,0.097632,0.062969,0.021348,-0.008206,-0.038906,-0.051849,-0.044724,0.006483,...,0,1001,1,9.023,901,9.023,9.021,9.021,0.01,1
4,-0.010397,0.005637,0.043228,0.04267,0.045788,0.046685,0.027399,0.022593,0.009374,0.004614,...,0,1001,1,12.033,1201,12.033,12.022,12.022,0.01,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19931,0.252262,0.501262,0.517139,0.919139,1.149139,0.927343,1.157343,1.348343,1.147385,1.549385,...,9,1101,9,4.775,601,4.775,4.774,4.774,0.001,-1
19932,-3.281132,-2.879132,-2.122985,-1.739985,-1.644985,-1.57197,-1.76397,-2.03197,-2.274742,-2.370742,...,9,1101,9,7.147,901,7.147,7.149,7.149,0.002,-1
19933,-0.797301,-0.261301,-1.367357,-1.424357,-1.175357,-0.973893,-0.839893,-0.839893,-1.026151,-1.294151,...,9,1101,9,9.525,1201,9.525,9.53,9.53,0.006,-1
19934,-5.619906,-5.275906,-4.109704,-3.496704,-2.998704,-2.067345,-1.799345,-1.857345,-2.296082,-4.000082,...,9,1101,9,11.904,1501,11.904,11.904,11.904,0.0,-1


In [5]:
# Split the raw windows into train/test, ignoring rows whose standard activity
# code is -1. The split is keyed on the "user" column with an 80% train share.
labelled_df = new_df[new_df["standard activity code"] != -1]
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42,
)(labelled_df)

# Carve a validation set out of the training portion (90% train / 10% val)
train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42,
)(train_df)

# Balance each split independently on the standard activity code
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = (
    balancer(split) for split in (train_df, val_df, test_df)
)

# Write one CSV per split under <output_path>/raw_balanced
output_dir = output_path / "raw_balanced"
output_dir.mkdir(parents=True, exist_ok=True)

for file_name, split in (
    ("train.csv", train_df),
    ("val.csv", val_df),
    ("test.csv", test_df),
):
    split.to_csv(output_dir / file_name, index=False)

## Normalizado

In [6]:
# Load all KuHar samples again
dataframe = read_kuhar(kuhar_path)

# Step 1 - timestamp differences, computed per CSV group on "accel-start-time"
# and stored in "timestamp diff". The predicate is true only when every
# difference of a group lies strictly between -1 and 1 (presumably seconds;
# how failing groups are handled is decided inside CalcTimeDiffMean).
differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="accel-start-time",
    new_column_name="timestamp diff",
    filter_predicate=lambda group: (
        (group["timestamp diff"] > -1.0) & (group["timestamp diff"] < 1)
    ).all(),
)

# Step 2 - resample the feature columns of each CSV group to 20 Hz.
# This assumes the original recordings are sampled at a constant 100 Hz.
resampler = Resampler(
    groupby_column=column_group,
    features_to_select=feature_columns,
    original_fs=100,
    target_fs=20,
)

# Step 3 - windows of 60 samples with no overlap, built per CSV group from
# the feature columns
windowizer = Windowize(
    features_to_select=feature_columns,
    samples_per_window=60,
    samples_per_overlap=0,
    groupby_column=column_group,
)

# Step 4 - add the standard activity code using the mapping from the config cell
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Chain the four steps in order: diff -> resample -> windows -> standard label
pipeline = Pipeline([differ, resampler, windowizer, standard_label_adder])

# Run the pipeline, persist the (unbalanced) result and display it
new_df_normalized = pipeline(dataframe)
new_df_normalized.to_csv(output_path / "standartized_unbalanced.csv", index=False)
new_df_normalized

Executing CalcTimeDiffMean
Executing Resampler


Resampling: 100%|██████████| 1939/1939 [00:14<00:00, 134.95it/s]


Executing Windowize


Creating windows: 100%|██████████| 1938/1938 [03:48<00:00,  8.50it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,user,serial,gyro-start-time,index,gyro-end-time,accel-start-time,level_0,accel-end-time,timestamp diff,standard activity code
0,0.043499,0.035264,0.036768,-0.016397,0.055488,0.007305,0.017134,-0.000681,0.02912,0.022812,...,1001,1,0.014,1,0.014,0.013,0,0.013,0.01,1
1,0.074329,-0.032561,0.03793,-0.035918,0.046273,0.016107,0.00319,0.025824,-0.013873,0.029314,...,1001,1,0.614,61,0.614,0.612,60,0.612,0.008,1
2,0.004498,0.009625,0.048604,-0.034638,0.060433,-0.013917,0.019234,0.016081,0.005937,0.055254,...,1001,1,1.214,121,1.214,1.212,120,1.212,0.009,1
3,0.006804,0.00905,0.062793,-0.007664,0.084181,-0.029758,0.050535,0.033477,0.011875,0.017603,...,1001,1,1.813,181,1.813,1.812,180,1.812,0.009,1
4,0.048082,-0.022197,0.058967,-0.024187,0.034185,0.01402,0.018829,0.021408,-0.02354,0.070291,...,1001,1,2.424,241,2.424,2.412,240,2.412,0.008,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19900,0.653998,1.8166,0.111613,-1.310521,-1.533916,-1.13089,-1.271529,-1.222093,-1.379565,-0.722524,...,1101,9,0.966,121,0.966,0.966,6276699,0.966,0.0,-1
19901,-1.83416,-1.787055,0.01614,-0.119154,0.525078,0.923293,1.201898,1.34578,1.408879,0.549591,...,1101,9,1.441,181,1.441,1.442,6276759,1.442,0.001,-1
19902,-1.240092,-1.193046,-1.043288,-1.315441,-0.249598,-0.375564,-0.54641,-0.968087,-1.860489,-3.03385,...,1101,9,1.917,241,1.917,1.917,6276819,1.917,0.0,-1
19903,-3.752496,-4.120651,-2.253497,-2.870489,-0.719731,-0.227929,-0.500369,-0.182271,-0.020641,0.646653,...,1101,9,2.392,301,2.392,2.393,6276879,2.393,0.001,-1


In [7]:
# Split the resampled windows into train/test, ignoring rows whose standard
# activity code is -1. The split is keyed on the "user" column with an 80%
# train share.
# NOTE: the input must be `new_df_normalized` (output of the resampling
# pipeline). Using `new_df` here would silently write the raw windows into the
# "standartized_balanced" directory.
labelled_normalized_df = new_df_normalized[
    new_df_normalized["standard activity code"] != -1
]
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42,
)(labelled_normalized_df)

# Carve a validation set out of the training portion (90% train / 10% val)
train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42,
)(train_df)

# Balance each split independently on the standard activity code
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

# Write one CSV per split under <output_path>/standartized_balanced
output_dir = output_path / "standartized_balanced"
output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(output_dir / "train.csv", index=False)
val_df.to_csv(output_dir / "val.csv", index=False)
test_df.to_csv(output_dir / "test.csv", index=False)