In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import signal
from pathlib import Path

from dataset_processor import (
    AddGravityColumn,
    Convert_G_to_Ms2,
    ButterworthFilter,
    Resampler,
    Windowize,
    AddStandardActivityCode,
    RenameColumns,
    Pipeline
)

In [2]:
def read_uci(uci_path):
    """Le o dataset do motionsense e retorna um dataframe com os dados (vindos de todos os arquivos CSV)
    O dataframe retornado possui as seguintes colunas:
    - attitude.roll: Rotação em torno do eixo x
    - attitude.pitch: Rotação em torno do eixo y
    - attitude.yaw: Rotação em torno do eixo z
    - gravity.x: Gravidade em torno do eixo x
    - gravity.y: Gravidade em torno do eixo y
    - gravity.z: Gravidade em torno do eixo z
    - rotationRate.x: Velocidade angular em torno do eixo x
    - rotationRate.y: Velocidade angular em torno do eixo y
    - rotationRate.z: Velocidade angular em torno do eixo z
    - userAcceleration.x: Aceleração no eixo x
    - userAcceleration.y: Aceleração no eixo y
    - userAcceleration.z: Aceleração no eixo z
    - activity code: Código da atividade
    - index: Índice da amostra vindo do txt
    - user: Usuário que realizou a atividade
    - serial: Número de série da atividade
    - txt: Caminho do txt que contém a atividade

    Parameters
    ----------
    uci_path : str
        Caminho para o dataset MotionSense

    Returns
    -------
    pd.DataFrame
        Dataframe com os dados do dataset UCI-HAR
    """
    activity_names = {
        1: "WALKING", 
        2: "WALKING_UPSTAIRS", 
        3: "WALKING_DOWNSTAIRS", 
        4: "SITTING", 
        5: "STANDING", 
        6: "LAYING",
        7: "STAND_TO_SIT",
        8: "SIT_TO_STAND",
        9: "SIT_TO_LIE",
        10: "LIE_TO_SIT",
        11: "STAND_TO_LIE",
        12: "LIE_TO_STAND"
    }
    activity_codes = {v: k for k, v in activity_names.items()}
    
    feature_columns = [
        "accel-x",
        "accel-y",
        "accel-z",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]
    
#     df_labels = pd.read_csv("data/RawData/labels.txt", header=None, sep=" ")
    df_labels = pd.read_csv(uci_path+"/labels.txt", header=None, sep=" ")
    df_labels.columns=["serial", "user", "activity code", "start", "end"]
    
    uci_path = Path(uci_path)
    
    dfs = []
    data_path = list(uci_path.glob("*.txt"))
    new_data_path = [elem.name.split("_")+[elem] for elem in sorted(data_path)]
    df = pd.DataFrame(new_data_path, columns=["sensor", "serial", "user", "file"])
    for key, df2 in df.groupby(["serial", "user"]):
        acc, gyr = [], []
        for row_index, row in df2.iterrows():
            data = pd.read_csv(row["file"], header=None, sep=" ")
            if row["sensor"] == "acc":
                acc.append(data)
            else:
                gyr.append(data)
        new_df = pd.concat([acc[0], gyr[0]], axis=1)
        new_df.columns = feature_columns
        
        user = int(key[1].split(".")[0][4:])
        serial = int(key[0][3:])
        
        new_df['txt'] = row["file"]
        
        new_df["user"] = user
        new_df["serial"] = serial
#         new_df["activity code"] = -1
        
        for row_index, row in df_labels.loc[(df_labels["serial"] == serial) & (df_labels["user"] == user)].iterrows():
            start = row['start']
            end = row["end"]+1
            activity = row["activity code"]
            resumed_df = new_df.loc[start:end].copy()
            resumed_df["index"] = [i for i in range(start, end+1)]
            resumed_df["activity code"] = activity
            
            dfs.append(resumed_df)
    
    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)
    return df

In [3]:
# Caminho para o dataset UCI-HAR
uci_path = "data/RawData"
# Caminho para salvar o dataset pré-processado
output_path = Path("data/processed/UCI")
# Cria o caminho de saída se ele não existir
output_path.mkdir(parents=True, exist_ok=True)

# Lista com as colunas que são features
feature_columns = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z",
]

# Nome das colunas que serão usada para agrupar as janelas
column_group = ["user", "activity code", "serial"]

# activity code: standard activity code
standard_activity_code_map = {
    1: 2, # walk
    2: 3, # stair up
    3: 4, # stair down
    4: 0, # sit
    5: 1, # stand
    6: -1, # Laying
    7: -1, # stand to sit
    8: -1, # sit to stand
    9: -1, # sit to lie
    10: -1, # lie to sit
    11: -1, # stand to lie
    12: -1 # lie to stand
}

## Bruto

In [4]:
# Lê o dataset
dataframe = read_uci(uci_path)


# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,     # Nome das colunas que serão usadas como features
    samples_per_window=150,                 # Numero de amostras por janela
    samples_per_overlap=0,                  # Numero de amostras que se sobrepõem
    groupby_column=column_group,            # Agrupa pela coluna do txt. As janelas são criadas para cada grupo da coluna txt
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Cria as janelas
# 2. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df = pipeline(dataframe)
# Salva os dados
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)
new_df

Executing Windowize


Creating windows: 100%|███████████████████████████████████████████████████████████████████████████████████| 714/714 [00:12<00:00, 55.33it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-146,gyro-z-147,gyro-z-148,gyro-z-149,activity code,serial,txt,index,user,standard activity code
0,1.002778,0.683333,0.733333,0.956944,1.05,1.013889,0.95,0.95,0.952778,0.913889,...,-0.559247,-0.533285,-0.651488,-0.625526,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7496,1,2
1,1.323611,1.470833,1.470833,1.270833,1.015278,1.015278,0.888889,0.694444,0.694444,0.676389,...,0.163406,0.180816,0.135001,0.154549,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7646,1,2
2,0.890278,0.858333,0.858333,0.840278,0.925,1.086111,1.222222,1.223611,1.193056,1.208333,...,0.718683,0.93157,0.271224,-0.105069,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7796,1,2
3,0.686111,0.719444,1.097222,1.097222,1.011111,0.848611,0.848611,0.808333,0.754167,0.759722,...,-0.161879,-0.171042,-0.127671,-0.087965,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7946,1,2
4,0.851389,0.897222,0.943056,0.938889,0.894445,0.868056,0.880556,0.9375,0.986111,0.981945,...,0.377209,0.076053,-0.118202,-0.180511,1.0,1.0,data/RawData/gyro_exp01_user01.txt,8372,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5108,0.045833,0.047222,0.072222,0.1,0.134722,0.191667,0.206944,0.115278,0.098611,0.054167,...,-0.59987,-0.612392,-0.596815,-0.619417,10.0,60.0,data/RawData/gyro_exp60_user30.txt,8558,30,-1
5109,0.073611,0.072222,0.075,0.0875,0.106944,0.126389,0.122222,0.122222,0.076389,0.009722,...,-0.258701,-0.26481,-0.280998,-0.313374,10.0,61.0,data/RawData/gyro_exp61_user30.txt,5658,30,-1
5110,1.065278,1.036111,0.916667,0.809722,0.816667,0.933333,1.090278,1.161111,1.172222,1.090278,...,0.028711,0.046426,-0.022297,-0.024435,11.0,60.0,data/RawData/gyro_exp60_user30.txt,7132,30,-1
5111,0.997222,1.005556,0.988889,0.988889,1.005556,0.976389,0.961111,0.968056,1.041667,1.177778,...,-0.463036,-0.535423,-0.37782,-0.336281,11.0,61.0,data/RawData/gyro_exp61_user30.txt,4198,30,-1


## Normatizado

In [6]:
# Lê o dataset
dataframe = read_uci(uci_path)

# Instancia o objeto que converte a aceleração para m/s²
conversor = Convert_G_to_Ms2(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro que serão convertidas
)

# Instancia o objeto que aplica o filtro Butterworth
butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro em que o filtro será aplicado
    fs=50                                                       # Frequência de amostragem original
)

# Instacia o objeto que reamostra os dados para 20Hz (supondo que o dataset original é 50Hz, constante)
resampler = Resampler(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    original_fs=50,                                             # Frequência de amostragem original
    target_fs=20,                                               # Frequência de amostragem desejada
    groupby_column=column_group,                                # Agrupa pela coluna do txt. A reamostragem é feita para cada grupo da coluna txt
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pela coluna do txt. As janelas são criadas para cada grupo da coluna CSV
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Converte a aceleração para m/s²
# 2. Aplica o filtro Butterworth
# 3. Reamostra os dados para 20Hz
# 4. Cria as janelas
# 5. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        conversor,
        butterworth,
        resampler,
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df_normalized = pipeline(dataframe)
# Salva os dados
new_df_normalized.to_csv(output_path / "standartized_unbalanced.csv", index=False)
new_df_normalized

Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing Resampler


Resampling: 100%|████████████████████████████████████████████████████████████████████████████████████████| 714/714 [00:01<00:00, 516.33it/s]


Executing Windowize


Creating windows: 100%|███████████████████████████████████████████████████████████████████████████████████| 714/714 [00:12<00:00, 57.36it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,activity code,serial,txt,index,level_0,user,standard activity code
0,-2.833257,-1.206962,-0.222144,-0.762373,-1.152283,-0.405408,-1.031523,0.163501,-0.054556,0.063171,...,-0.496033,-0.425188,-0.630119,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7496,6740,1,2
1,3.234248,3.319959,-1.304929,-2.615548,-3.46639,-2.351809,-1.932922,2.151541,2.061283,4.162129,...,-0.32155,0.090746,0.155261,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7556,6800,1,2
2,-0.931924,-1.648398,1.898844,1.800302,2.746011,-1.317004,4.000048,2.971553,-2.321007,-2.870096,...,-0.082779,0.609134,0.255944,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7616,6860,1,2
3,-2.149846,0.894538,-2.212124,-1.962087,-0.771756,-0.632305,-0.10154,0.199842,0.575738,2.370602,...,0.185679,-0.182803,-0.080921,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7676,6920,1,2
4,-0.789999,-0.953819,-0.928713,-0.201753,-0.429709,0.960063,1.756872,2.14327,-0.603573,0.44661,...,0.427588,0.131001,-0.144445,1.0,1.0,data/RawData/gyro_exp01_user01.txt,7736,6980,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5108,-0.436421,-0.123993,0.672125,-0.617835,-1.199618,-2.236434,-3.342621,-2.214512,-1.812965,-1.319833,...,-0.137071,0.054419,0.138349,10.0,60.0,data/RawData/gyro_exp60_user30.txt,8558,791529,30,-1
5109,0.100062,0.382779,0.592919,-0.319994,-0.281877,-0.082347,-0.320796,-0.565574,-0.481863,0.46418,...,-0.19606,-0.065447,-0.006044,10.0,61.0,data/RawData/gyro_exp61_user30.txt,5658,807372,30,-1
5110,0.54159,-1.556119,-0.346829,1.927838,-0.515841,-3.571364,1.282897,2.458398,0.428781,1.0902,...,-0.025263,0.049848,0.010395,11.0,60.0,data/RawData/gyro_exp60_user30.txt,7132,790101,30,-1
5111,-0.671581,-0.533349,-1.103813,-0.367796,1.291531,1.338266,-1.245868,-1.630863,-2.034239,-0.659126,...,-0.633716,-0.410474,-0.488779,11.0,61.0,data/RawData/gyro_exp61_user30.txt,4198,805910,30,-1
