In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import signal
from pathlib import Path

from dataset_processor import (
    AddGravityColumn,
    Convert_G_to_Ms2,
    ButterworthFilter,
    Resampler,
    Windowize,
    AddStandardActivityCode,
    RenameColumns,
    Pipeline
)

In [2]:
def read_uci(uci_path):
    """Le o dataset do motionsense e retorna um dataframe com os dados (vindos de todos os arquivos CSV)
    O dataframe retornado possui as seguintes colunas:
    - attitude.roll: Rotação em torno do eixo x
    - attitude.pitch: Rotação em torno do eixo y
    - attitude.yaw: Rotação em torno do eixo z
    - gravity.x: Gravidade em torno do eixo x
    - gravity.y: Gravidade em torno do eixo y
    - gravity.z: Gravidade em torno do eixo z
    - rotationRate.x: Velocidade angular em torno do eixo x
    - rotationRate.y: Velocidade angular em torno do eixo y
    - rotationRate.z: Velocidade angular em torno do eixo z
    - userAcceleration.x: Aceleração no eixo x
    - userAcceleration.y: Aceleração no eixo y
    - userAcceleration.z: Aceleração no eixo z
    - activity code: Código da atividade
    - index: Índice da amostra vindo do txt
    - user: Usuário que realizou a atividade
    - serial: Número de série da atividade
    - txt: Caminho do txt que contém a atividade

    Parameters
    ----------
    uci_path : str
        Caminho para o dataset MotionSense

    Returns
    -------
    pd.DataFrame
        Dataframe com os dados do dataset UCI-HAR
    """
    activity_names = {
        1: "WALKING", 
        2: "WALKING_UPSTAIRS", 
        3: "WALKING_DOWNSTAIRS", 
        4: "SITTING", 
        5: "STANDING", 
        6: "LAYING",
        7: "STAND_TO_SIT",
        8: "SIT_TO_STAND",
        9: "SIT_TO_LIE",
        10: "LIE_TO_SIT",
        11: "STAND_TO_LIE",
        12: "LIE_TO_STAND"
    }
    activity_codes = {v: k for k, v in activity_names.items()}
    
    feature_columns = [
        "accel-x",
        "accel-y",
        "accel-z",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]
    
#     df_labels = pd.read_csv("data/RawData/labels.txt", header=None, sep=" ")
    df_labels = pd.read_csv(uci_path+"/labels.txt", header=None, sep=" ")
    df_labels.columns=["serial", "user", "activity code", "start", "end"]
    
    uci_path = Path(uci_path)
    
    dfs = []
    data_path = list(uci_path.glob("*.txt"))
    new_data_path = [elem.name.split("_")+[elem] for elem in sorted(data_path)]
    df = pd.DataFrame(new_data_path, columns=["sensor", "serial", "user", "file"])
    for key, df2 in df.groupby(["serial", "user"]):
        acc, gyr = [], []
        for row_index, row in df2.iterrows():
            data = pd.read_csv(row["file"], header=None, sep=" ")
            if row["sensor"] == "acc":
                acc.append(data)
            else:
                gyr.append(data)
        new_df = pd.concat([acc[0], gyr[0]], axis=1)
        new_df.columns = feature_columns
        
        user = int(key[1].split(".")[0][4:])
        serial = int(key[0][3:])
        
        new_df['txt'] = row["file"]
        
        new_df["user"] = user
        new_df["serial"] = serial
#         new_df["activity code"] = -1
        
        for row_index, row in df_labels.loc[(df_labels["serial"] == serial) & (df_labels["user"] == user)].iterrows():
            start = row['start']
            end = row["end"]+1
            activity = row["activity code"]
            resumed_df = new_df.loc[start:end].copy()
            resumed_df["index"] = [i for i in range(start, end+1)]
            resumed_df["activity code"] = activity
            
            dfs.append(resumed_df)
    
    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)
    return df

In [3]:
# Caminho para o dataset UCI-HAR
uci_path = "data/RawData"
# Caminho para salvar o dataset pré-processado
output_path = Path("data/processed/UCI")
# Cria o caminho de saída se ele não existir
output_path.mkdir(parents=True, exist_ok=True)

# Lista com as colunas que são features
feature_columns = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z",
]

# Nome das colunas que serão usada para agrupar as janelas
column_group = ["txt"]

# activity code: standard activity code
standard_activity_code_map = {
    1: 2, # walk
    2: 3, # stair up
    3: 4, # stair down
    4: 0, # sit
    5: 1, # stand
    6: -1, # Laying
    7: -1, # stand to sit
    8: -1, # sit to stand
    9: -1, # sit to lie
    10: -1, # lie to sit
    11: -1, # stand to lie
    12: -1 # lie to stand
}

## Bruto

In [4]:
# Lê o dataset
dataframe = read_uci(uci_path)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,     # Nome das colunas que serão usadas como features
    samples_per_window=150,                 # Numero de amostras por janela
    samples_per_overlap=0,                  # Numero de amostras que se sobrepõem
    groupby_column=column_group,            # Agrupa pela coluna do txt. As janelas são criadas para cada grupo da coluna txt
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Cria as janelas
# 2. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df = pipeline(dataframe)
# Salva os dados
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)
new_df

Executing Windowize


  for obj in iterable:
Creating windows: 100%|█████████████████████████████████████████████████████████████████████████████████████| 61/61 [00:11<00:00,  5.12it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-146,gyro-z-147,gyro-z-148,gyro-z-149,serial,index,txt,user,activity code,standard activity code
0,1.020833,1.025,1.020833,1.016667,1.018056,1.018056,1.019445,1.016667,1.020833,1.019445,...,-0.002749,-0.005192,-0.008247,-0.013439,1.0,250.0,data/RawData/gyro_exp01_user01.txt,1,5,1
1,1.022222,1.016667,1.018056,1.023611,1.022222,1.015278,1.019445,1.019445,1.016667,1.022222,...,-0.001833,0.005498,0.002749,0.008552,1.0,400.0,data/RawData/gyro_exp01_user01.txt,1,5,1
2,1.020833,1.022222,1.018056,1.019445,1.016667,1.022222,1.019445,1.019445,1.022222,1.022222,...,0.013134,0.015272,0.007636,0.003971,1.0,550.0,data/RawData/gyro_exp01_user01.txt,1,5,1
3,1.020833,1.020833,1.015278,1.016667,1.016667,1.023611,1.022222,1.022222,1.018056,1.018056,...,-0.00672,-0.007636,-0.005192,-0.004887,1.0,700.0,data/RawData/gyro_exp01_user01.txt,1,5,1
4,1.019445,1.022222,1.026389,1.020833,1.022222,1.019445,1.022222,1.025,1.019445,1.016667,...,0.006109,0.013134,-0.00336,-0.00733,1.0,850.0,data/RawData/gyro_exp01_user01.txt,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5412,0.883333,0.85,0.797222,0.768056,0.733333,0.894445,1.291667,1.738889,1.870833,1.656945,...,0.129809,0.095295,0.009163,-0.063225,61.0,17060.0,data/RawData/gyro_exp61_user30.txt,30,3,4
5413,0.831944,0.822222,0.825,0.829167,0.829167,0.854167,0.8875,0.8875,0.956944,1.077778,...,0.113316,0.089797,0.023213,-0.066279,61.0,17210.0,data/RawData/gyro_exp61_user30.txt,30,3,4
5414,0.918056,0.968056,1.094444,1.361111,1.708333,1.765278,1.511111,1.079167,0.863889,0.815278,...,0.068722,-0.113926,-0.307876,-0.274889,61.0,17502.0,data/RawData/gyro_exp61_user30.txt,30,2,3
5415,1.061111,1.113889,1.072222,1.025,0.923611,0.822222,0.725,0.665278,0.715278,0.801389,...,0.336892,0.04612,-0.277944,-0.421497,61.0,17652.0,data/RawData/gyro_exp61_user30.txt,30,2,3


## Normatizado

In [5]:
# Lê o dataset
dataframe = read_uci(uci_path)

# Instancia o objeto que converte a aceleração para m/s²
conversor = Convert_G_to_Ms2(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro que serão convertidas
)

# Instancia o objeto que aplica o filtro Butterworth
butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro em que o filtro será aplicado
    fs=50                                                       # Frequência de amostragem original
)

# Instacia o objeto que reamostra os dados para 20Hz (supondo que o dataset original é 50Hz, constante)
resampler = Resampler(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    original_fs=50,                                             # Frequência de amostragem original
    target_fs=20,                                               # Frequência de amostragem desejada
    groupby_column=column_group,                                # Agrupa pela coluna do txt. A reamostragem é feita para cada grupo da coluna txt
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pela coluna do txt. As janelas são criadas para cada grupo da coluna CSV
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Converte a aceleração para m/s²
# 2. Aplica o filtro Butterworth
# 3. Reamostra os dados para 20Hz
# 4. Cria as janelas
# 5. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        conversor,
        butterworth,
        resampler,
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df_normalized = pipeline(dataframe)
# Salva os dados
new_df_normalized.to_csv(output_path / "standartized_unbalanced.csv", index=False)
new_df_normalized

Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing Resampler


  for obj in iterable:
Resampling: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 102.90it/s]


Executing Windowize


  for obj in iterable:
Creating windows: 100%|█████████████████████████████████████████████████████████████████████████████████████| 61/61 [00:12<00:00,  4.95it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,serial,index,txt,user,level_0,activity code,standard activity code
0,-0.634264,0.171281,-0.135635,0.065722,-0.062719,0.04229,-0.056072,0.046209,-0.037106,0.032838,...,0.009626,0.00599,-0.008162,1.0,250.0,data/RawData/gyro_exp01_user01.txt,1,0,5,1
1,-0.024101,0.025095,-0.016936,0.003781,-0.016674,0.033976,0.025013,0.005063,-0.007327,0.00374,...,0.004192,0.003253,0.00452,1.0,310.0,data/RawData/gyro_exp01_user01.txt,1,60,5,1
2,-0.004556,-0.003936,-0.015275,0.014967,0.011994,-0.002212,-0.006818,0.020497,-0.022413,0.025218,...,0.010187,0.023377,0.008979,1.0,370.0,data/RawData/gyro_exp01_user01.txt,1,120,5,1
3,0.003283,-0.039336,0.001004,0.003558,-0.011062,0.054197,-0.037099,-0.005067,-0.025795,0.020433,...,0.006614,-0.006954,-0.004478,1.0,430.0,data/RawData/gyro_exp01_user01.txt,1,180,5,1
4,0.001897,0.02529,0.005331,0.00762,-0.034475,0.033311,-0.0273,0.003002,-0.044608,-0.002604,...,0.005042,0.00753,0.004058,1.0,490.0,data/RawData/gyro_exp01_user01.txt,1,240,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5412,-1.861092,-1.601401,-2.944655,-3.064325,-3.133002,6.213208,7.928147,4.147631,-0.575786,-3.111681,...,0.504241,0.433026,0.103295,61.0,6026.0,data/RawData/gyro_exp61_user30.txt,30,807741,4,0
5413,-3.139898,-2.585915,-2.20053,-2.153485,-0.681391,-0.872905,7.922086,4.616422,1.734116,-1.226429,...,0.418928,0.350135,0.256165,61.0,6086.0,data/RawData/gyro_exp61_user30.txt,30,807801,4,0
5414,2.562544,7.625131,-0.136823,-1.211708,1.372407,-0.220543,-0.697606,-1.052568,2.733753,5.638137,...,0.036509,-0.349809,-0.246186,61.0,6146.0,data/RawData/gyro_exp61_user30.txt,30,807861,4,0
5415,-1.219406,-1.284346,-1.049944,-2.477299,-1.036662,-0.390498,-1.033367,-1.309903,-2.154416,-2.675347,...,0.053965,-0.175797,-0.638961,61.0,6206.0,data/RawData/gyro_exp61_user30.txt,30,807921,4,0
