In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from dataset_processor import (
    ButterworthFilter,
    Interpolate,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    FilterByCommonRows,
    BalanceToMinimumClass,
    Pipeline
)

In [2]:
def read_wisdm(wisdm_path: str) -> pd.DataFrame:
    """Le o dataset do motionsense e retorna um dataframe com os dados (vindos de todos os arquivos CSV)
    O dataframe retornado possui as seguintes colunas:
    - activity code: Código da atividade
    - user: Usuário que realizou a atividade
    - timestamp-accel: Timestamp da aceleração
    - accel-x: Aceleração no eixo x
    - accel-y: Aceleração no eixo y
    - accel-z: Aceleração no eixo z
    - timestamp-gyro: Timestamp do giroscópio
    - gyro-x: Giroscópio no eixo x
    - gyro-y: Giroscópio no eixo y
    - gyro-z: Giroscópio no eixo z

    Parameters
    ----------
    wisdm_path : str
        Caminho para o dataset MotionSense

    Returns
    -------
    pd.DataFrame
        Dataframe com os dados do dataset WISDM
    """

    # activity_codes = {v: k for k, v in activity_names.items()}
    
    feature_columns_acc = [
        "user",
        "activity code",
        "timestamp-accel",
        "accel-x",
        "accel-y",
        "accel-z",
    ]
    feature_columns_gyr = [
        "user",
        "activity code",
        "timestamp-gyro",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]

    # Lista com letras maiúsculas de A até S sem o N
    labels = [chr(i) for i in range(65, 84) if chr(i) != "N"]

    dfs = []

    for user in range(1600,1651):

        df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
        df_acc = df_acc[df_acc.columns[0:-1]]
        df_acc.columns = feature_columns_acc
        df_acc["timestamp-accel"] = df_acc["timestamp-accel"].astype(np.int64)


        df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
        df_gyr = df_gyr[df_gyr.columns[0:-1]]
        df_gyr.columns = feature_columns_gyr
        df_gyr["timestamp-gyro"] = df_gyr["timestamp-gyro"].astype(np.int64)

        for activity in labels:
            acc = df_acc[df_acc["activity code"] == activity].copy()
            gyr = df_gyr[df_gyr["activity code"] == activity].copy()

            time_acc = np.array(acc["timestamp-accel"])
            time_gyr = np.array(gyr["timestamp-gyro"])

            # Setando o tempo inicial para 0
            if len(time_acc) > 0 and len(time_gyr) > 0:
                time_acc = (time_acc - time_acc[0]) / 1000000000
                time_gyr = (time_gyr - time_gyr[0]) / 1000000000

                # ### Retirando os intervalos sem amostra (periodos vazios)
                # if np.any(np.diff(time_acc)<0):
                #     print("Acelerômetro com amostras fora de ordem")
                #     pos = np.nonzero(np.diff(time_acc)<0)[0].astype(int)
                #     for k in pos:
                #         time_acc[k+1:] = time_acc[k+1:]+time_acc[k]+1/20
                # if np.any(np.diff(time_gyr)<0):
                #     print("Giroscópio com amostras fora de ordem")
                #     pos = np.nonzero(np.diff(time_gyr)<0)[0].astype(int)
                #     for k in pos:
                #         time_gyr[k+1:] = time_gyr[k+1:]+time_gyr[k]+1/20

                tam = min(len(time_acc), len(time_gyr))

                acc["timestamp-accel"] = time_acc
                gyr["timestamp-gyro"] = time_gyr

                acc = acc.iloc[:tam]
                gyr = gyr.iloc[:tam]

                # Criando um dataframe com os dados de aceleração e giroscópio
                df = pd.concat([acc[feature_columns_acc[2:]], gyr[feature_columns_gyr[2:]]], axis=1)
                df["activity code"] = activity
                df["user"] = user

                # Drop samples with NaN
                if df.isnull().values.any():
                    continue

                dfs.append(df)

    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)

    for column in feature_columns_acc[2:] + feature_columns_gyr[2:]:
        df[column] = df[column].astype(np.float32)
    df["user"] = df["user"].astype(np.int32)

    return df.dropna().reset_index(drop=True)

In [3]:
# Caminho para o dataset WISDM
wisdm_path = Path("../data/raw/WISDM/wisdm-dataset/raw/phone")
# Caminho para salvar o dataset pré-processado
output_path = Path("data/processed/WISDM")
# Cria o caminho de saída se ele não existir
output_path.mkdir(parents=True, exist_ok=True)

# Lista com as colunas que são features
feature_columns = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z",
]

# Nome das colunas que serão usada para agrupar as janelas
column_group = ["user", "activity code"]

# activity code: standard activity code
standard_activity_code_map = {
    "A": 2,
    "B": 5,
    "C": 6,
    "D": 0,
    "E": 1,
    "F": -1,
    "G": -1,
    "H": -1,
    "I": -1,
    "J": -1,
    "K": -1,   
    "L": -1,
    "M": -1,
    "O": -1,
    "P": -1,
    "Q": -1,
    "R": -1,
    "S": -1,
}

## Bruto

In [4]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,     # Nome das colunas que serão usadas como features
    samples_per_window=60,                  # Numero de amostras por janela
    samples_per_overlap=0,                  # Numero de amostras que se sobrepõem
    groupby_column=column_group,            # Agrupa pela coluna do CSV. As janelas são criadas para cada grupo da coluna CSV
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Renomeia as colunas
# 2. Cria as janelas
# 3. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df = pipeline(dataframe)
new_df

  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_p

Executing Windowize


Creating windows: 100%|██████████| 68/68 [00:15<00:00,  4.40it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,timestamp-accel,user,timestamp-gyro,window,standard activity code
0,-0.364761,-0.87973,2.001495,0.450623,-2.164352,-4.332779,-0.319443,1.566452,-0.323746,-1.811676,...,-0.187714,-0.052719,0.050415,0.570847,A,0.0,1600,0.0,0,2
1,-0.019897,0.058136,0.805832,0.392441,-3.992523,-4.075867,-0.342834,0.767914,-0.467209,-2.23732,...,0.009583,0.365555,0.564865,0.816101,A,3.02124,1600,3.02124,1,2
2,-0.65184,-1.380219,0.191849,-1.169861,-5.252274,0.886673,1.784363,-0.943817,-2.776306,-1.056335,...,0.043823,0.137939,0.786972,0.793564,A,6.04248,1600,6.04248,2,2
3,-2.382492,0.510223,-0.461853,-3.781677,-5.367844,-0.22345,1.547012,-0.414993,-3.513077,-1.619766,...,-0.012558,0.134155,0.945755,0.933792,A,9.063731,1600,9.063731,3,2
4,0.072647,2.348526,-2.201248,-7.240723,-4.361862,-1.85614,0.024414,0.723221,-1.243652,-0.348114,...,-0.115829,0.335556,0.492676,0.721069,A,12.084971,1600,12.084971,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4603,-9.414823,-9.302257,-8.461604,-7.321575,-7.27128,-7.204219,-6.205496,-5.455855,-4.385281,-4.066743,...,-0.18003,0.274838,-0.12144,-0.124636,A,83.426071,1650,166.850922,69,2
4604,-7.505991,-9.354947,-10.918895,-12.178675,-12.746294,-12.446917,-12.216995,-12.628939,-13.306729,-11.912827,...,0.108657,0.012783,-0.128897,0.150202,A,84.634987,1650,169.269012,70,2
4605,-11.402688,-9.271121,-8.820857,-9.297466,-9.24717,-8.679551,-8.595725,-8.293953,-7.676038,-6.933582,...,0.004261,0.426106,0.609331,0.588026,A,85.844406,1650,171.687088,71,2
4606,-3.436853,-4.186494,-5.659431,-6.71803,-7.616162,-8.581355,-9.527389,-10.504556,-11.199112,-10.818304,...,-0.393083,-0.345146,0.076699,-0.079895,A,87.051956,1650,174.105209,72,2


## Normatizado com Interpolador

In [5]:
# Instacia o objeto que interpola os dados para 20Hz (supondo que o dataset original é 50Hz, constante)
interpolator = Interpolate(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    original_fs=20,                                             # Frequência de amostragem original (50Hz)
    target_fs=20,                                               # Frequência de amostragem desejada (20Hz)
    kind="cubic",                                               # Tipo de interpolação (cúbica)
    groupby_column=column_group,                                # Agrupa pela coluna do CSV. A reamostragem é feita para cada grupo da coluna CSV
)

# Instancia o objeto que aplica o filtro Butterworth
butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro em que o filtro será aplicado
    fs=20                                                       # Frequência de amostragem original
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pela coluna do CSV. As janelas são criadas para cada grupo da coluna CSV
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Renomeia as colunas
# 2. Adiciona a coluna com a gravidade
# 3. Converte a aceleração para m/s²
# 4. Aplica o filtro Butterworth
# 5. Reamostra os dados para 20Hz
# 6. Cria as janelas
# 7. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        interpolator,
        butterworth,
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df_standartized = pipeline(dataframe)
new_df_standartized

  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None)
  df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None)
  df_acc = pd.read_csv(wisdm_p

Executing Interpolate


Interpoling: 100%|██████████| 68/68 [00:00<00:00, 88.73it/s]


Executing ButterworthFilter
Executing Windowize


Creating windows: 100%|██████████| 68/68 [00:16<00:00,  4.02it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,timestamp-gyro,activity code,timestamp-accel,index,user,window,standard activity code
0,-0.073361,-0.558465,2.352381,0.830796,-1.755308,-3.895356,0.145797,2.058886,0.195211,-1.266913,...,-0.052719,0.050415,0.570847,0.0,A,0.0,0,1600,0,2
1,1.12404,1.210188,1.965836,1.560183,-2.817309,-2.893501,0.846309,1.963412,0.734176,-1.030557,...,0.365555,0.564865,0.816101,3.02124,A,3.02124,60,1600,1,2
2,0.65306,-0.073597,1.499963,0.13952,-3.941849,2.197918,3.096207,0.368405,-1.463929,0.255971,...,0.137939,0.786972,0.793564,6.04248,A,6.04248,120,1600,2,2
3,-0.967224,1.931968,0.966193,-2.347549,-3.927892,1.222026,2.997678,1.040496,-2.053155,-0.155828,...,0.134155,0.945755,0.933792,9.063731,A,9.063731,180,1600,3,2
4,1.683682,3.963042,-0.583656,-5.620481,-2.739419,-0.231956,1.649869,2.349471,0.382916,1.278294,...,0.335556,0.492676,0.721069,12.084971,A,12.084971,240,1600,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4603,-0.532345,-0.428254,0.403879,1.535514,1.577699,1.63708,2.628674,3.371829,4.436619,4.750095,...,0.274838,-0.12144,-0.124636,166.850922,A,83.426071,277642,1650,69,2
4604,1.200491,-0.650463,-2.216153,-3.47757,-4.046865,-3.749326,-3.5215,-3.935855,-4.616395,-3.225562,...,0.012783,-0.128897,0.150202,169.269012,A,84.634987,277702,1650,70,2
4605,-2.748227,-0.618142,-0.169672,-0.648301,-0.600145,-0.03467,0.047128,0.347106,0.96357,1.705009,...,0.426106,0.609331,0.588026,171.687088,A,85.844406,277762,1650,71,2
4606,5.226098,4.483644,3.018338,1.967609,1.077348,0.119765,-0.819197,-1.79011,-2.479495,-2.094846,...,-0.345146,0.076699,-0.079895,174.105209,A,87.051956,277822,1650,72,2


# Filtra por elementos iguais

In [6]:
filter_common = FilterByCommonRows(match_columns=["user", "activity code", "window"])
new_df, new_df_standartized = filter_common(new_df, new_df_standartized)

In [15]:
# new_df["standard activity code"].unique()

array([ 2,  5,  6,  0,  1, -1])

## Balanceia e salva os dados brutos

In [21]:
# Salva os dados
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=0
)(new_df[new_df["standard activity code"] != -1])

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=0
)(train_df)

balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

ouptut_dir = output_path / "raw_balanced"
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

In [None]:
new_df["standard activity code"].unique()

## Balanceia e salva os dados processados

In [None]:
# Salva os dados
new_df_standartized.to_csv(output_path / "standartized_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.8,
    random_state=42
)(new_df_standartized[new_df_standartized["standard activity code"] != -1])

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.9,
    random_state=42
)(train_df)

balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

ouptut_dir = output_path / "standartized_balanced" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")