In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import signal
from pathlib import Path

from dataset_processor import (
    AddGravityColumn,
    Convert_G_to_Ms2,
    ButterworthFilter,
    ResamplerPoly,
    Windowize,
    AddStandardActivityCode,
    RenameColumns,
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    FilterByCommonRows,
    Pipeline
)

In [2]:
def read_motionsense(motionsense_path: str) -> pd.DataFrame:
    """Read every MotionSense CSV found under a directory into one dataframe.

    CSV files are discovered recursively and processed in sorted path order.
    Each file must sit in a folder named ``<activity>_<serial>`` (for example
    ``dws_1``) and be named ``<prefix>_<user>.csv`` (for example ``sub_1.csv``).
    The header row of each file is discarded and the columns are named from
    the fixed feature list below. Files containing any missing value are
    skipped and a warning naming the file is emitted.

    The returned dataframe has the following columns:
    - attitude.roll: Rotation around the x axis
    - attitude.pitch: Rotation around the y axis
    - attitude.yaw: Rotation around the z axis
    - gravity.x: Gravity along the x axis
    - gravity.y: Gravity along the y axis
    - gravity.z: Gravity along the z axis
    - rotationRate.x: Angular velocity around the x axis
    - rotationRate.y: Angular velocity around the y axis
    - rotationRate.z: Angular velocity around the z axis
    - userAcceleration.x: Acceleration along the x axis
    - userAcceleration.y: Acceleration along the y axis
    - userAcceleration.z: Acceleration along the z axis
    - activity code: Integer activity code
      (0=dws, 1=ups, 2=sit, 3=std, 4=wlk, 5=jog)
    - index: 0-based position of the sample inside its CSV file
    - user: User id parsed from the file name
    - serial: Serial parsed from the folder name (kept as a string)
    - csv: ``<folder>/<file>`` identifying the source CSV

    The per-file dataframes are concatenated without resetting the index, so
    index labels repeat across files.

    Parameters
    ----------
    motionsense_path : str
        Path to the MotionSense dataset folder (a ``pathlib.Path`` also works).

    Returns
    -------
    pd.DataFrame
        Dataframe with the samples of every usable CSV file.

    Raises
    ------
    ValueError
        If no usable CSV file is found under ``motionsense_path``, or if a
        folder name does not split into exactly ``<activity>_<serial>``.
    KeyError
        If the activity prefix of a folder name is not a known activity.
    """
    # Local import keeps this block self-contained.
    import warnings

    motionsense_path = Path(motionsense_path)
    activity_names = {0: "dws", 1: "ups", 2: "sit", 3: "std", 4: "wlk", 5: "jog"}
    activity_codes = {v: k for k, v in activity_names.items()}

    feature_names = [
        "attitude.roll",
        "attitude.pitch",
        "attitude.yaw",
        "gravity.x",
        "gravity.y",
        "gravity.z",
        "rotationRate.x",
        "rotationRate.y",
        "rotationRate.z",
        "userAcceleration.x",
        "userAcceleration.y",
        "userAcceleration.z",
    ]
    feature_dtypes = {name: np.float32 for name in feature_names}

    dfs = []
    for f in sorted(motionsense_path.rglob("*.csv")):
        # The parent folder name encodes "<activity>_<serial>"
        activity_name, serial = f.parents[0].name.split("_")
        activity_code = activity_codes[activity_name]

        # The file stem encodes "<prefix>_<user>"
        user = int(f.stem.split("_")[1])
        df = pd.read_csv(f, names=feature_names, dtype=feature_dtypes, skiprows=1)

        if df.isnull().values.any():
            # Incomplete recordings are dropped entirely; say so instead of
            # hiding it, since it changes how many files end up in the result.
            warnings.warn(
                f"Skipping {f}: file contains missing values", stacklevel=2
            )
            continue

        # ----- Auxiliary columns and metadata ------
        df["activity code"] = activity_code
        df["index"] = range(len(df))
        df["user"] = user
        df["serial"] = serial
        df["csv"] = "/".join(f.parts[-2:])
        # -------------------------------------------
        dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        raise ValueError(
            f"No usable MotionSense CSV files found under {motionsense_path}"
        )

    return pd.concat(dfs)


In [3]:
# Folder that holds the MotionSense device-motion CSV files
motionsense_path = Path(
    "../data/original/MotionSense/A_DeviceMotion_data/A_DeviceMotion_data"
)

# Original sensor column name -> short name used by the pipelines
columns_to_rename = {
    f"{sensor}.{axis}": f"{short_name}-{axis}"
    for sensor, short_name in (("userAcceleration", "accel"), ("rotationRate", "gyro"))
    for axis in "xyz"
}

# Columns treated as features: the renamed accelerometer/gyroscope axes first,
# followed by the attitude and gravity columns (which keep their names)
feature_columns = [
    *columns_to_rename.values(),
    "attitude.roll",
    "attitude.pitch",
    "attitude.yaw",
    "gravity.x",
    "gravity.y",
    "gravity.z",
]

# Column used to group samples when building windows
column_group = "csv"

# MotionSense activity code -> standard activity code
standard_activity_code_map = dict(
    [
        (0, 4),
        (1, 3),
        (2, 0),
        (3, 1),
        (4, 2),
        (5, 5),
    ]
)

## Bruto

In [4]:
# Load the MotionSense recordings
dataframe = read_motionsense(motionsense_path)

# Step 1: rename the sensor columns using the `columns_to_rename` mapping
renamer = RenameColumns(columns_map=columns_to_rename)

# Step 2: build windows of 150 samples with no overlapping samples.
# Windows are created separately for each group of the CSV column.
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=150,
    samples_per_overlap=0,
)

# Step 3: add the column holding the standard activity code
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Chain the steps in execution order: rename -> windowize -> label
pipeline = Pipeline([renamer, windowizer, standard_label_adder])

# Run the pipeline and display the result
new_df = pipeline(dataframe)
new_df

Executing RenameColumns
Executing Windowize


Creating windows: 100%|██████████| 360/360 [00:31<00:00, 11.49it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gravity.z-147,gravity.z-148,gravity.z-149,serial,activity code,csv,user,index,window,standard activity code
0,0.294894,0.219405,0.010714,-0.008389,0.199441,0.168241,0.079382,0.06936,0.072889,0.098347,...,-0.072701,-0.062489,-0.057112,1,0,dws_1/sub_1.csv,1,0,0,4
1,0.422041,0.489289,0.405519,0.330101,0.278058,0.26667,-0.038128,-0.113882,0.040205,0.70531,...,-0.438791,-0.427743,-0.398845,1,0,dws_1/sub_1.csv,1,150,1,4
2,0.050002,0.279396,0.397511,0.489769,0.376962,0.086257,0.297043,0.189549,-0.11724,-0.219285,...,-0.035123,-0.006991,0.021725,1,0,dws_1/sub_1.csv,1,300,2,4
3,0.453086,0.428134,0.145774,0.035071,-0.073498,-0.076478,-0.071926,-0.070026,-0.09827,-0.088027,...,0.025593,0.009984,-0.012116,1,0,dws_1/sub_1.csv,1,450,3,4
4,-0.12326,-0.197224,0.104588,0.464974,0.476676,-0.173624,-0.311906,-0.396358,-0.354741,-0.343695,...,-0.017304,-0.065986,-0.116997,1,0,dws_1/sub_1.csv,1,600,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9237,-0.219509,0.039405,0.177356,0.163627,0.080003,-0.014922,-0.057913,0.004133,0.091218,0.165302,...,-0.563828,-0.51997,-0.519647,8,4,wlk_8/sub_9.csv,9,3900,26,2
9238,0.41574,0.001829,-0.467739,-0.385918,0.186728,0.043864,-0.726447,-0.768545,-0.734806,-0.569898,...,-0.392729,-0.444477,-0.492672,8,4,wlk_8/sub_9.csv,9,4050,27,2
9239,-0.128104,0.136487,0.138946,-0.044175,-0.09986,-0.104913,-0.211261,-0.335558,-0.412587,-0.413217,...,-0.255729,-0.223467,-0.191943,8,4,wlk_8/sub_9.csv,9,4200,28,2
9240,-0.198929,-0.270276,-0.092972,0.201214,0.290746,0.130444,-0.023268,0.024589,0.179717,0.089722,...,-0.595136,-0.550171,-0.534283,8,4,wlk_8/sub_9.csv,9,4350,29,2


## Normatizado com reamostrador ResamplerPoly

In [5]:
# Load the MotionSense recordings
dataframe = read_motionsense(motionsense_path)

# Step 1: rename the sensor columns using the `columns_to_rename` mapping
renamer = RenameColumns(columns_map=columns_to_rename)

# Step 2: add gravity to the accelerometer axes
add_gravity = AddGravityColumn(
    gravity_columns=["gravity.x", "gravity.y", "gravity.z"],
    axis_columns=["accel-x", "accel-y", "accel-z"],
)

# Step 3: convert the accelerometer axes from g to m/s²
conversor = Convert_G_to_Ms2(axis_columns=["accel-x", "accel-y", "accel-z"])

# Step 4: apply the Butterworth filter to the accelerometer axes
# (fs is the original sampling frequency, 50 Hz)
butterworth = ButterworthFilter(fs=50, axis_columns=["accel-x", "accel-y", "accel-z"])

# Step 5: resample to 20 Hz, assuming the original data is a constant 50 Hz.
# `up` and `down` are the resampling ratio, not frequencies: 50 Hz * 2 / 5 = 20 Hz.
# Resampling is done separately for each group of the CSV column.
resampler = ResamplerPoly(
    groupby_column=column_group,
    features_to_select=feature_columns,
    up=2,
    down=5,
)

# Step 6: build windows of 60 samples with no overlapping samples.
# Windows are created separately for each group of the CSV column.
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=60,
    samples_per_overlap=0,
)

# Step 7: add the column holding the standard activity code
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Chain the steps in execution order
pipeline = Pipeline(
    [
        renamer,               # 1. rename columns
        add_gravity,           # 2. add gravity
        conversor,             # 3. convert g -> m/s²
        butterworth,           # 4. Butterworth filter
        resampler,             # 5. resample to 20 Hz
        windowizer,            # 6. create windows
        standard_label_adder,  # 7. add standard activity code
    ]
)

# Run the pipeline and display the result
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing RenameColumns
Executing AddGravityColumn
Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 360/360 [00:02<00:00, 129.00it/s]


Executing Windowize


Creating windows: 100%|██████████| 360/360 [00:30<00:00, 11.70it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gravity.z-58,gravity.z-59,serial,activity code,csv,user,index,level_0,window,standard activity code
0,-0.056335,-1.597768,-0.647864,-1.014687,-1.576943,-1.961893,-0.728436,-5.503354,-4.969987,-4.599815,...,-0.109152,-0.065553,1,0,dws_1/sub_1.csv,1,0,0,0,4
1,2.434427,1.988327,0.09301,-1.645879,9.162344,-2.653465,-8.728634,-5.290781,-5.477959,0.524394,...,-0.395177,-0.432898,1,0,dws_1/sub_1.csv,1,60,60,1,4
2,-0.614854,2.97827,1.495153,-1.058126,-1.724007,1.666742,-1.232028,5.136267,2.597605,1.077606,...,-0.079832,-0.020429,1,0,dws_1/sub_1.csv,1,120,120,2,4
3,3.625902,0.178938,-1.471411,-1.69824,-1.199284,1.073718,2.875087,1.023888,2.727699,1.449058,...,0.043088,0.018484,1,0,dws_1/sub_1.csv,1,180,180,3,4
4,-0.815682,3.910288,0.353055,-4.134688,-3.715462,-1.41405,3.31792,-1.474545,-4.747294,1.649691,...,0.036446,-0.041059,1,0,dws_1/sub_1.csv,1,240,240,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9241,-0.82569,2.196944,0.766301,0.145663,3.231357,3.435711,4.219603,-0.713973,-6.63593,2.241874,...,-0.662583,-0.539062,8,4,wlk_8/sub_9.csv,9,1560,1560,26,2
9242,5.087142,-3.021826,0.294042,-7.527696,-0.996758,3.778013,-2.900571,0.937303,1.148419,2.109845,...,-0.248762,-0.421245,8,4,wlk_8/sub_9.csv,9,1620,1620,27,2
9243,1.021549,1.093964,0.086066,-2.401005,-3.17954,-1.0941,5.645998,-0.88858,-1.954384,-0.785932,...,-0.324727,-0.239532,8,4,wlk_8/sub_9.csv,9,1680,1680,28,2
9244,-1.005314,1.409886,2.012239,0.98732,2.924799,7.212666,-5.244421,-7.331109,3.04185,0.156724,...,-0.684642,-0.568969,8,4,wlk_8/sub_9.csv,9,1740,1740,29,2


## Filtra por elementos iguais

In [6]:
# Match rows of the raw and standardized frames on user, serial and window.
# NOTE(review): presumably this keeps only the rows present in both frames — confirm
# against FilterByCommonRows.
filter_common = FilterByCommonRows(
    match_columns=["user", "serial", "window"],
)
filtered_frames = filter_common(new_df, new_df_standartized)
new_df, new_df_standartized = filtered_frames

## Balanceia e salva os dados brutos

In [7]:
# Output roots for each flavour of the dataset
output_path_balanced = Path("../data/raw_balanced")
output_path_balanced_standartized = Path("../data/standartized_balanced")
output_path_unbalanced = Path("../data/unbalanced")

# Save the raw windows before any splitting or balancing
ouptut_dir = output_path_unbalanced / "MotionSense"
ouptut_dir.mkdir(parents=True, exist_ok=True)
new_df.to_csv(ouptut_dir / "raw_unbalanced.csv", index=False)

# Options shared by both splits: split by user, tracking the standard activity code
split_options = dict(
    column_to_split="user",
    class_column="standard activity code",
    random_state=42,
)

# Rows with standard activity code -1 are left out of the splits
labelled_df = new_df[new_df["standard activity code"] != -1]

# First split: 80% train / 20% test
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    train_size=0.8, **split_options
)(labelled_df)

# Second split, applied to the train part: 90% train / 10% validation
train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    train_size=0.9, **split_options
)(train_df)

# Balance each split independently (train, then validation, then test)
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = balancer(train_df), balancer(val_df), balancer(test_df)

# Save the balanced splits
ouptut_dir = output_path_balanced / "MotionSense"
ouptut_dir.mkdir(parents=True, exist_ok=True)

for file_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_df.to_csv(ouptut_dir / file_name, index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/raw_balanced/MotionSense


## Balanceia e salva os dados processados

In [8]:
# Save the standardized windows before any splitting or balancing
# (written into the MotionSense folder under the unbalanced output root)
unbalanced_csv = output_path_unbalanced / "MotionSense" / "standartized_unbalanced.csv"
new_df_standartized.to_csv(unbalanced_csv, index=False)

# Options shared by both splits: split by user, tracking the standard activity code
split_options = dict(
    column_to_split="user",
    class_column="standard activity code",
    random_state=42,
)

# Rows with standard activity code -1 are left out of the splits
labelled_df = new_df_standartized[new_df_standartized["standard activity code"] != -1]

# First split: 80% train / 20% test
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    train_size=0.8, **split_options
)(labelled_df)

# Second split, applied to the train part: 90% train / 10% validation
train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    train_size=0.9, **split_options
)(train_df)

# Balance each split independently (train, then validation, then test)
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = balancer(train_df), balancer(val_df), balancer(test_df)

# Save the balanced splits
ouptut_dir = output_path_balanced_standartized / "MotionSense"
ouptut_dir.mkdir(parents=True, exist_ok=True)

for file_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_df.to_csv(ouptut_dir / file_name, index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/standartized_balanced/MotionSense
