In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import signal
from pathlib import Path

from dataset_processor import (
    Convert_G_to_Ms2,
    ButterworthFilter,
    ResamplerPoly,
    Windowize,
    AddStandardActivityCode,
    FilterByCommonRows,
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    Pipeline
)

In [2]:
def read_uci(uci_path):
    """Read the raw UCI recordings and return every labelled sample in one dataframe.

    The directory must contain ``labels.txt`` and, for each recording, a pair
    of header-less, space-separated files named ``acc_expXX_userYY.txt`` and
    ``gyro_expXX_userYY.txt``. Each row of ``labels.txt`` is read as
    ``serial, user, activity code, start, end`` and selects a slice of rows
    from the matching recording.

    The returned dataframe has the following columns:
    - accel-x, accel-y, accel-z: the three columns of the ``acc`` file
    - gyro-x, gyro-y, gyro-z: the three columns of the ``gyro`` file
    - txt: path of the ``gyro`` file of the recording
    - user: user number parsed from the file name
    - serial: experiment number parsed from the file name
    - index: row position of the sample inside its recording
    - activity code: activity code taken from ``labels.txt``

    Parameters
    ----------
    uci_path : str or pathlib.Path
        Directory holding ``labels.txt`` and the ``acc_*``/``gyro_*`` files.

    Returns
    -------
    pd.DataFrame
        One row per labelled sample, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a recording lacks its ``acc`` or ``gyro`` file, or if no labelled
        segment could be extracted at all.
    """
    # Convert first: the path is used with the `/` operator right below, which
    # fails on a plain string.
    uci_path = Path(uci_path)

    feature_columns = [
        "accel-x",
        "accel-y",
        "accel-z",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]

    df_labels = pd.read_csv(uci_path / "labels.txt", header=None, sep=" ")
    df_labels.columns = ["serial", "user", "activity code", "start", "end"]

    # Pair the sensor files of each recording. Files that do not follow the
    # `<sensor>_<exp>_<user>.txt` pattern (labels.txt in particular) are skipped
    # explicitly instead of relying on pandas to discard them.
    recordings = {}
    for path in sorted(uci_path.glob("*.txt")):
        parts = path.stem.split("_")
        if len(parts) != 3 or parts[0] not in ("acc", "gyro"):
            continue
        sensor, serial_tag, user_tag = parts
        recordings.setdefault((serial_tag, user_tag), {})[sensor] = path

    dfs = []
    for serial_tag, user_tag in sorted(recordings):
        files = recordings[(serial_tag, user_tag)]
        if "acc" not in files or "gyro" not in files:
            raise ValueError(
                f"Recording {serial_tag}/{user_tag} needs both an acc and a gyro file, "
                f"found only: {sorted(files)}"
            )

        acc = pd.read_csv(files["acc"], header=None, sep=" ")
        gyr = pd.read_csv(files["gyro"], header=None, sep=" ")
        new_df = pd.concat([acc, gyr], axis=1)
        new_df.columns = feature_columns

        user = int(user_tag[4:])      # "user01" -> 1
        serial = int(serial_tag[3:])  # "exp01"  -> 1

        new_df["txt"] = files["gyro"]
        new_df["user"] = user
        new_df["serial"] = serial

        recording_labels = df_labels.loc[
            (df_labels["serial"] == serial) & (df_labels["user"] == user)
        ]
        for start, end, activity in zip(
            recording_labels["start"],
            recording_labels["end"],
            recording_labels["activity code"],
        ):
            # `.loc` slicing is inclusive on both ends, so this keeps the rows
            # start .. end + 1, exactly as before.
            # NOTE(review): that includes one sample past the labelled end;
            # confirm whether labels.txt bounds are 0- or 1-based.
            resumed_df = new_df.loc[start:end + 1].copy()
            # Take the positions from the slice itself so a label running past
            # the end of the recording is truncated instead of raising.
            resumed_df["index"] = resumed_df.index
            resumed_df["activity code"] = activity

            # Drop segments containing NaN (e.g. acc/gyro files of different length)
            if resumed_df.isnull().values.any():
                continue

            dfs.append(resumed_df)

    if not dfs:
        raise ValueError(f"No labelled segments could be read from {uci_path}")

    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)
    return df

In [3]:
# Input: directory with the raw UCI-HAR files
uci_path = Path("../data/raw/UCI/RawData/")

# Output: directory for the pre-processed dataset (created when missing)
output_path = Path("../data/processed/UCI")
output_path.mkdir(parents=True, exist_ok=True)

# Columns used as features
feature_columns = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]

# Columns used to group the rows before windows are built
column_group = ["user", "activity code", "serial"]

# Dataset activity code -> standard activity code
standard_activity_code_map = {
    1: 2,  # walk
    2: 3,  # stair up
    3: 4,  # stair down
    4: 0,  # sit
    5: 1,  # stand
    # 6 (laying) and the postural transitions 7-12 (stand to sit, sit to stand,
    # sit to lie, lie to sit, stand to lie, lie to stand) all map to -1
    **dict.fromkeys(range(6, 13), -1),
}

## Bruto

In [4]:
# Load every labelled sample of the dataset
dataframe = read_uci(uci_path)

# Window builder: 150 samples per window, no overlap. Windows are built
# separately for each group of rows sharing the `column_group` values
# (user, activity code, serial).
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=150,
    samples_per_overlap=0,
)

# Step that adds the standard activity code column from the mapping
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Pipeline: 1) build the windows, 2) add the standard activity code
pipeline = Pipeline([windowizer, standard_label_adder])

# Run the pipeline and show the resulting windows
new_df = pipeline(dataframe)
new_df

Executing Windowize


Creating windows: 100%|██████████| 714/714 [00:15<00:00, 46.30it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-147,gyro-z-148,gyro-z-149,user,activity code,index,serial,txt,window,standard activity code
0,1.002778,0.683333,0.733333,0.956944,1.05,1.013889,0.95,0.95,0.952778,0.913889,...,-0.533285,-0.651488,-0.625526,1.0,1.0,7496.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,0,2
1,1.323611,1.470833,1.470833,1.270833,1.015278,1.015278,0.888889,0.694444,0.694444,0.676389,...,0.180816,0.135001,0.154549,1.0,1.0,7646.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,1,2
2,0.890278,0.858333,0.858333,0.840278,0.925,1.086111,1.222222,1.223611,1.193056,1.208333,...,0.93157,0.271224,-0.105069,1.0,1.0,7796.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,2,2
3,0.686111,0.719444,1.097222,1.097222,1.011111,0.848611,0.848611,0.808333,0.754167,0.759722,...,-0.171042,-0.127671,-0.087965,1.0,1.0,7946.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,3,2
4,0.851389,0.897222,0.943056,0.938889,0.894445,0.868056,0.880556,0.9375,0.986111,0.981945,...,0.076053,-0.118202,-0.180511,1.0,1.0,8372.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5108,0.045833,0.047222,0.072222,0.1,0.134722,0.191667,0.206944,0.115278,0.098611,0.054167,...,-0.612392,-0.596815,-0.619417,30.0,10.0,8558.0,60.0,../data/raw/UCI/RawData/gyro_exp60_user30.txt,0,-1
5109,0.073611,0.072222,0.075,0.0875,0.106944,0.126389,0.122222,0.122222,0.076389,0.009722,...,-0.26481,-0.280998,-0.313374,30.0,10.0,5658.0,61.0,../data/raw/UCI/RawData/gyro_exp61_user30.txt,0,-1
5110,1.065278,1.036111,0.916667,0.809722,0.816667,0.933333,1.090278,1.161111,1.172222,1.090278,...,0.046426,-0.022297,-0.024435,30.0,11.0,7132.0,60.0,../data/raw/UCI/RawData/gyro_exp60_user30.txt,0,-1
5111,0.997222,1.005556,0.988889,0.988889,1.005556,0.976389,0.961111,0.968056,1.041667,1.177778,...,-0.535423,-0.37782,-0.336281,30.0,11.0,4198.0,61.0,../data/raw/UCI/RawData/gyro_exp61_user30.txt,0,-1


## Normatizado com Interpolador

In [5]:
# Load every labelled sample of the dataset again
dataframe = read_uci(uci_path)

# Converts the accelerometer columns from g to m/s²
conversor = Convert_G_to_Ms2(axis_columns=["accel-x", "accel-y", "accel-z"])

# Butterworth filter on the accelerometer columns; fs is the original
# sampling frequency
butterworth = ButterworthFilter(fs=50, axis_columns=["accel-x", "accel-y", "accel-z"])

# Resamples the features to 20 Hz, assuming a constant 50 Hz source.
# NOTE(review): up/down are presumably the rate-change factors
# (50 Hz * 2 / 5 = 20 Hz), not sampling frequencies as the original inline
# comments said - confirm against ResamplerPoly. Resampling runs per
# `column_group` group (user, activity code, serial).
resampler = ResamplerPoly(
    groupby_column=column_group,
    features_to_select=feature_columns,
    up=2,
    down=5,
)

# Window builder: 60 samples per window, no overlap, one set of windows per
# `column_group` group
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=60,
    samples_per_overlap=0,
)

# Step that adds the standard activity code column from the mapping
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Pipeline:
# 1. convert the acceleration to m/s²
# 2. apply the Butterworth filter
# 3. resample to 20 Hz
# 4. build the windows
# 5. add the standard activity code
pipeline = Pipeline([conversor, butterworth, resampler, windowizer, standard_label_adder])

# Run the pipeline and show the resulting windows
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 714/714 [00:02<00:00, 335.87it/s]


Executing Windowize


Creating windows: 100%|██████████| 714/714 [00:16<00:00, 43.10it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-58,gyro-z-59,user,level_0,activity code,index,serial,txt,window,standard activity code
0,-1.294861,-1.821932,0.177586,-1.053442,-0.923933,-0.594807,-0.876665,0.012722,0.072624,-0.04399,...,-0.415797,-0.611074,1.0,6740.0,1.0,7496.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,0,2
1,2.534239,3.831228,-0.723642,-2.601742,-3.328371,-2.629107,-2.048427,1.414561,2.372414,3.57132,...,-0.037839,0.181263,1.0,6800.0,1.0,7556.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,1,2
2,-0.559527,-1.948379,1.03147,1.863631,2.848757,-0.354659,1.650163,4.927537,-1.608229,-2.495104,...,0.323679,0.559186,1.0,6860.0,1.0,7616.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,2,2
3,-3.345663,0.554403,-0.9,-2.485545,-1.182032,-0.612004,-0.443396,0.193144,0.227264,1.523604,...,0.00123,-0.17577,1.0,6920.0,1.0,7676.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,3,2
4,-1.496035,-0.508597,-1.306504,-0.350937,-0.499819,0.159376,1.249205,2.242568,1.14331,-0.926829,...,0.45223,-0.029443,1.0,6980.0,1.0,7736.0,1.0,../data/raw/UCI/RawData/gyro_exp01_user01.txt,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5115,0.150832,0.260999,0.655816,0.227044,-0.573703,-0.059493,-0.229176,-0.158066,-0.55108,-0.628638,...,-0.27472,-0.261726,30.0,807372.0,10.0,5658.0,61.0,../data/raw/UCI/RawData/gyro_exp61_user30.txt,0,-1
5116,0.503664,-1.466862,-0.624668,1.818005,0.323614,-3.594533,-0.257988,2.925112,0.876846,0.648942,...,0.068806,-0.001791,30.0,790101.0,11.0,7132.0,60.0,../data/raw/UCI/RawData/gyro_exp60_user30.txt,0,-1
5117,-0.476063,-0.70046,-0.938115,-0.534469,1.455487,1.184784,-1.104806,-1.751874,-1.944943,-0.715098,...,-0.423297,-0.476595,30.0,805910.0,11.0,4198.0,61.0,../data/raw/UCI/RawData/gyro_exp61_user30.txt,0,-1
5118,0.377754,0.30951,0.026914,0.018463,-0.355628,0.128436,0.286268,0.240792,0.081811,0.076491,...,0.083232,0.203615,30.0,794737.0,12.0,11762.0,60.0,../data/raw/UCI/RawData/gyro_exp60_user30.txt,0,-1


## Filtra por elementos iguais

In [6]:
# Filter both window tables through FilterByCommonRows, matching rows on
# user, serial, window and activity code
filter_common = FilterByCommonRows(
    match_columns=["user", "serial", "window", "activity code"],
)
new_df, new_df_standartized = filter_common(
    new_df,
    new_df_standartized,
)

## Balanceia e salva os dados brutos

In [7]:
# Save the full, unbalanced set of raw windows
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)

# Settings shared by both splits: split on the user column, with the
# standard activity code as the class column
split_kwargs = dict(
    column_to_split="user",
    class_column="standard activity code",
    random_state=42,
)

# Train/test split, leaving out the windows whose standard activity code is -1
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(train_size=0.8, **split_kwargs)(
    new_df[new_df["standard activity code"] != -1]
)

# Train/validation split taken from the training part
train_df, val_df = SplitGuaranteeingAllClassesPerSplit(train_size=0.9, **split_kwargs)(
    train_df
)

# Balance each split on its own
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = (balancer(part) for part in (train_df, val_df, test_df))

# Write the three balanced splits (variable name kept as in the notebook)
ouptut_dir = output_path / "raw_balanced"
ouptut_dir.mkdir(parents=True, exist_ok=True)

for file_name, split_df in {
    "train.csv": train_df,
    "validation.csv": val_df,
    "test.csv": test_df,
}.items():
    split_df.to_csv(ouptut_dir / file_name, index=False)

print(f"Data saved at {ouptut_dir}")

Data saved at ../data/processed/UCI/raw_balanced


## Balanceia e salva os dados processados

In [8]:
# Save the full, unbalanced set of standardized windows
new_df_standartized.to_csv(output_path / "standartized_unbalanced.csv", index=False)

# Settings shared by both splits: split on the user column, with the
# standard activity code as the class column
split_kwargs = dict(
    column_to_split="user",
    class_column="standard activity code",
    random_state=42,
)

# Train/test split, leaving out the windows whose standard activity code is -1
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(train_size=0.8, **split_kwargs)(
    new_df_standartized[new_df_standartized["standard activity code"] != -1]
)

# Train/validation split taken from the training part
train_df, val_df = SplitGuaranteeingAllClassesPerSplit(train_size=0.9, **split_kwargs)(
    train_df
)

# Balance each split on its own
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = (balancer(part) for part in (train_df, val_df, test_df))

# Write the three balanced splits (variable name kept as in the notebook)
ouptut_dir = output_path / "standartized_balanced"
ouptut_dir.mkdir(parents=True, exist_ok=True)

for file_name, split_df in {
    "train.csv": train_df,
    "validation.csv": val_df,
    "test.csv": test_df,
}.items():
    split_df.to_csv(ouptut_dir / file_name, index=False)

print(f"Data saved at {ouptut_dir}")

Data saved at ../data/processed/UCI/standartized_balanced
