In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from dataset_processor import (
    ButterworthFilter,
    CalcTimeDiffMean,
    Interpolate,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    FilterByCommonRows,
    BalanceToMinimumClass,
    Pipeline
)

In [2]:
def read_wisdm(wisdm_path: str) -> pd.DataFrame:
    """Le o dataset do motionsense e retorna um dataframe com os dados (vindos de todos os arquivos CSV)
    O dataframe retornado possui as seguintes colunas:
    - activity code: Código da atividade
    - user: Usuário que realizou a atividade
    - timestamp-accel: Timestamp da aceleração
    - accel-x: Aceleração no eixo x
    - accel-y: Aceleração no eixo y
    - accel-z: Aceleração no eixo z
    - timestamp-gyro: Timestamp do giroscópio
    - gyro-x: Giroscópio no eixo x
    - gyro-y: Giroscópio no eixo y
    - gyro-z: Giroscópio no eixo z

    Parameters
    ----------
    wisdm_path : str
        Caminho para o dataset MotionSense

    Returns
    -------
    pd.DataFrame
        Dataframe com os dados do dataset WISDM
    """

    # activity_codes = {v: k for k, v in activity_names.items()}
    
    feature_columns_acc = [
        "user",
        "activity code",
        "timestamp-accel",
        "accel-x",
        "accel-y",
        "accel-z",
    ]
    feature_columns_gyr = [
        "user",
        "activity code",
        "timestamp-gyro",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]

    # Lista com letras maiúsculas de A até S sem o N
    labels = [chr(i) for i in range(65, 84) if chr(i) != "N"]

    dfs = []

    for user in range(1600,1651):

        df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None, engine="python")
        df_acc = df_acc[df_acc.columns[0:-1]]
        df_acc.columns = feature_columns_acc
        df_acc["timestamp-accel"] = df_acc["timestamp-accel"].astype(np.int64)


        df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None, engine="python")
        df_gyr = df_gyr[df_gyr.columns[0:-1]]
        df_gyr.columns = feature_columns_gyr
        df_gyr["timestamp-gyro"] = df_gyr["timestamp-gyro"].astype(np.int64)

        for activity in labels:
            acc = df_acc[df_acc["activity code"] == activity].copy()
            gyr = df_gyr[df_gyr["activity code"] == activity].copy()

            time_acc = np.array(acc["timestamp-accel"])
            time_gyr = np.array(gyr["timestamp-gyro"])

            # Setando o tempo inicial para 0
            if len(time_acc) > 0 and len(time_gyr) > 0:
                time_acc = (time_acc - time_acc[0]) / 1000000000
                time_gyr = (time_gyr - time_gyr[0]) / 1000000000

                # ### Retirando os intervalos sem amostra (periodos vazios)
                # if np.any(np.diff(time_acc)<0):
                #     print("Acelerômetro com amostras fora de ordem")
                #     pos = np.nonzero(np.diff(time_acc)<0)[0].astype(int)
                #     for k in pos:
                #         time_acc[k+1:] = time_acc[k+1:]+time_acc[k]+1/20
                # if np.any(np.diff(time_gyr)<0):
                #     print("Giroscópio com amostras fora de ordem")
                #     pos = np.nonzero(np.diff(time_gyr)<0)[0].astype(int)
                #     for k in pos:
                #         time_gyr[k+1:] = time_gyr[k+1:]+time_gyr[k]+1/20

                tam = min(len(time_acc), len(time_gyr))

                acc["timestamp-accel"] = time_acc
                gyr["timestamp-gyro"] = time_gyr

                acc = acc.iloc[:tam]
                gyr = gyr.iloc[:tam]

                # Criando um dataframe com os dados de aceleração e giroscópio
                df = pd.concat([acc[feature_columns_acc[2:]], gyr[feature_columns_gyr[2:]]], axis=1)
                df["activity code"] = activity
                df["user"] = user
                df = df.dropna()

                # # Drop samples with NaN
                # if df.isnull().values.any():
                #     continue

                dfs.append(df)

    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)

    for column in feature_columns_acc[2:] + feature_columns_gyr[2:]:
        df[column] = df[column].astype(np.float32)
    df["user"] = df["user"].astype(np.int32)

    return df.dropna().reset_index(drop=True)

In [3]:
# Caminho para o dataset WISDM
wisdm_path = Path("../data/raw/WISDM/wisdm-dataset/raw/phone")
# Caminho para salvar o dataset pré-processado
output_path = Path("data/processed/WISDM")
# Cria o caminho de saída se ele não existir
output_path.mkdir(parents=True, exist_ok=True)

# Lista com as colunas que são features
feature_columns = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z",
]

# Nome das colunas que serão usada para agrupar as janelas
column_group = ["user", "activity code"]

# activity code: standard activity code
standard_activity_code_map = {
    "A": 2,
    "B": 5,
    "C": 6,
    "D": 0,
    "E": 1,
    "F": -1,
    "G": -1,
    "H": -1,
    "I": -1,
    "J": -1,
    "K": -1,   
    "L": -1,
    "M": -1,
    "O": -1,
    "P": -1,
    "Q": -1,
    "R": -1,
    "S": -1,
}

## Bruto

In [4]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path)

differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="timestamp-accel",
    new_column_name="accel-timestamp diff",
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,  # Nome das colunas que serão usadas como features
    samples_per_window=60,  # Numero de amostras por janela
    samples_per_overlap=0,  # Numero de amostras que se sobrepõem
    groupby_column=column_group,  # Agrupa pela coluna do CSV. As janelas são criadas para cada grupo da coluna CSV
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Renomeia as colunas
# 2. Cria as janelas
# 3. Adiciona a coluna com o código da atividade
pipeline = Pipeline([differ, windowizer, standard_label_adder])

# Executa o pipeline
new_df = pipeline(dataframe)
new_df


Executing CalcTimeDiffMean
Executing Windowize


Creating windows: 100%|██████████| 584/584 [02:08<00:00,  4.54it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,timestamp-accel,accel-timestamp diff,timestamp-gyro,user,activity code,window,standard activity code
0,-0.87973,2.001495,0.450623,-2.164352,-4.332779,-0.319443,1.566452,-0.323746,-1.811676,-1.134048,...,0.050415,0.570847,0.916656,0.050354,0.050354,0.050354,1600.0,A,0,2
1,0.058136,0.805832,0.392441,-3.992523,-4.075867,-0.342834,0.767914,-0.467209,-2.23732,-1.930817,...,0.564865,0.816101,0.347443,3.071594,0.050354,3.071594,1600.0,A,1,2
2,-1.380219,0.191849,-1.169861,-5.252274,0.886673,1.784363,-0.943817,-2.776306,-1.056335,0.467941,...,0.786972,0.793564,-0.182907,6.092834,0.050354,6.092834,1600.0,A,2,2
3,0.510223,-0.461853,-3.781677,-5.367844,-0.22345,1.547012,-0.414993,-3.513077,-1.619766,0.369415,...,0.945755,0.933792,-0.175034,9.114085,0.050354,9.114085,1600.0,A,3,2
4,2.348526,-2.201248,-7.240723,-4.361862,-1.85614,0.024414,0.723221,-1.243652,-0.348114,1.888306,...,0.492676,0.721069,0.308258,12.135325,0.050354,12.135325,1600.0,A,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36493,-9.302257,-8.461604,-7.321575,-7.27128,-7.204219,-6.205496,-5.455855,-4.385281,-4.066743,-4.550537,...,-0.12144,-0.124636,-0.050067,83.446365,0.020294,166.891235,1650.0,A,69,2
36494,-9.354947,-10.918895,-12.178675,-12.746294,-12.446917,-12.216995,-12.628939,-13.306729,-11.912827,-10.360855,...,-0.128897,0.150202,0.254598,84.655197,0.02021,169.309326,1650.0,A,70,2
36495,-9.271121,-8.820857,-9.297466,-9.24717,-8.679551,-8.595725,-8.293953,-7.676038,-6.933582,-5.961204,...,0.609331,0.588026,0.199204,85.864693,0.020287,171.727371,1650.0,A,71,2
36496,-4.186494,-5.659431,-6.71803,-7.616162,-8.581355,-9.527389,-10.504556,-11.199112,-10.818304,-10.547667,...,0.076699,-0.079895,-0.106526,87.072159,0.020203,174.145462,1650.0,A,72,2


## Normatizado com Interpolador

In [5]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path)

differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="timestamp-accel",
    new_column_name="accel-timestamp diff",
)

# Instacia o objeto que interpola os dados para 20Hz (supondo que o dataset original é 50Hz, constante)
interpolator = Interpolate(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    original_fs=20,                                             # Frequência de amostragem original (50Hz)
    target_fs=20,                                               # Frequência de amostragem desejada (20Hz)
    kind="cubic",                                               # Tipo de interpolação (cúbica)
    groupby_column=column_group,                                # Agrupa pela coluna do CSV. A reamostragem é feita para cada grupo da coluna CSV
)

# Instancia o objeto que aplica o filtro Butterworth
butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro em que o filtro será aplicado
    fs=20                                                       # Frequência de amostragem original
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pela coluna do CSV. As janelas são criadas para cada grupo da coluna CSV
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Renomeia as colunas
# 2. Adiciona a coluna com a gravidade
# 3. Converte a aceleração para m/s²
# 4. Aplica o filtro Butterworth
# 5. Reamostra os dados para 20Hz
# 6. Cria as janelas
# 7. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        differ,
        interpolator,
        butterworth,
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing CalcTimeDiffMean
Executing Interpolate


Interpoling: 100%|██████████| 584/584 [00:05<00:00, 104.05it/s]


Executing ButterworthFilter
Executing Windowize


Creating windows: 100%|██████████| 584/584 [02:22<00:00,  4.10it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-58,gyro-z-59,timestamp-accel,accel-timestamp diff,timestamp-gyro,user,activity code,index,window,standard activity code
0,0.225292,3.064038,1.472824,-1.180252,-3.384453,0.595507,2.450479,0.531852,-0.981989,-0.327745,...,0.570847,0.916656,0.050354,0.050354,0.050354,1600.0,A,0,0,2
1,1.233994,1.990547,1.585515,-2.79162,-2.8677,0.871999,1.988788,0.759055,-1.006336,-0.695797,...,0.816101,0.347443,3.071594,0.050354,3.071594,1600.0,A,60,1,2
2,-0.07492,1.498832,0.138574,-3.94262,2.197315,3.095761,0.368105,-1.464093,0.255932,1.780026,...,0.793564,-0.182907,6.092834,0.050354,6.092834,1600.0,A,120,2,2
3,1.931858,0.966084,-2.347656,-3.927997,1.221924,2.997579,1.040402,-2.053245,-0.155912,1.83686,...,0.933792,-0.175034,9.114085,0.050354,9.114085,1600.0,A,180,3,2
4,3.963044,-0.583654,-5.62048,-2.739418,-0.231956,1.649868,2.34947,0.382915,1.278292,3.514072,...,0.721069,0.308258,12.135325,0.050354,12.135325,1600.0,A,240,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36481,-0.428254,0.403879,1.535514,1.577699,1.63708,2.628674,3.371829,4.436619,4.750095,4.261946,...,-0.124636,-0.050067,83.446365,0.020294,166.891235,1650.0,A,2205978,69,2
36482,-0.650463,-2.216153,-3.47757,-4.046865,-3.749326,-3.5215,-3.935855,-4.616395,-3.225562,-1.676919,...,0.150202,0.254598,84.655197,0.02021,169.309326,1650.0,A,2206038,70,2
36483,-0.618142,-0.169672,-0.648301,-0.600145,-0.03467,0.047128,0.347106,0.96357,1.705009,2.676874,...,0.588026,0.199204,85.864693,0.020287,171.727371,1650.0,A,2206098,71,2
36484,4.483644,3.018338,1.967609,1.077348,0.119765,-0.819197,-1.79011,-2.479495,-2.094846,-1.82191,...,-0.079895,-0.106526,87.072159,0.020203,174.145462,1650.0,A,2206158,72,2


# Filtra por elementos iguais

In [6]:
filter_common = FilterByCommonRows(match_columns=["user", "activity code", "window"])
new_df, new_df_standartized = filter_common(new_df, new_df_standartized)

In [7]:
# new_df["standard activity code"].unique()

## Balanceia e salva os dados brutos

In [8]:
# Salva os dados
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=0
)(new_df[new_df["standard activity code"] != -1])

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=0
)(train_df)

balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

ouptut_dir = output_path / "raw_balanced"
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

Train classes: [2 5 6 0 1]
Test classes: [2 5 6 0 1]
Train classes: [2 5 6 0 1]
Test classes: [2 5 6 0 1]
Data saved at data/processed/WISDM/raw_balanced


In [9]:
new_df["standard activity code"].unique()

array([ 2,  5,  6,  0,  1, -1])

## Balanceia e salva os dados processados

In [10]:
# Salva os dados
new_df_standartized.to_csv(output_path / "standartized_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.8,
    random_state=42
)(new_df_standartized[new_df_standartized["standard activity code"] != -1])

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.9,
    random_state=42
)(train_df)

balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

ouptut_dir = output_path / "standartized_balanced" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

Train classes: [2 5 6 0 1]
Test classes: [2 5 6 0 1]
Train classes: [2 5 6 0 1]
Test classes: [2 5 6 0 1]
Data saved at data/processed/WISDM/standartized_balanced
