In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import interpolate

from dataset_processor import (
    ButterworthFilter,
    CalcTimeDiffMean,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    FilterByCommonRows,
    BalanceToMinimumClass,
    Pipeline
)

In [2]:
def read_wisdm(wisdm_path: str, interpol = True) -> pd.DataFrame:
    """Le o dataset do motionsense e retorna um dataframe com os dados (vindos de todos os arquivos txt)
    O dataframe retornado possui as seguintes colunas:
    - activity code: Código da atividade
    - user: Usuário que realizou a atividade
    - timestamp-accel: Timestamp da aceleração
    - accel-x: Aceleração no eixo x
    - accel-y: Aceleração no eixo y
    - accel-z: Aceleração no eixo z
    - timestamp-gyro: Timestamp do giroscópio
    - gyro-x: Giroscópio no eixo x
    - gyro-y: Giroscópio no eixo y
    - gyro-z: Giroscópio no eixo z

    Parameters
    ----------
    wisdm_path : str
        Caminho para o dataset WISDM

    Returns
    -------
    pd.DataFrame
        Dataframe com os dados do dataset WISDM
    """
    
    feature_columns_acc = [
        "user",
        "activity code",
        "timestamp-accel",
        "accel-x",
        "accel-y",
        "accel-z",
    ]
    feature_columns_gyr = [
        "user",
        "activity code",
        "timestamp-gyro",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]

    # Lista com letras maiúsculas de A até S sem o N
    labels = [chr(i) for i in range(65, 84) if chr(i) != "N"]

    dfs = []

    for user in range(1600,1651):

        df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None, engine="python")
        df_acc = df_acc[df_acc.columns[0:-1]]
        df_acc.columns = feature_columns_acc
        df_acc["timestamp-accel"] = df_acc["timestamp-accel"].astype(np.int64)


        df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None, engine="python")
        df_gyr = df_gyr[df_gyr.columns[0:-1]]
        df_gyr.columns = feature_columns_gyr
        df_gyr["timestamp-gyro"] = df_gyr["timestamp-gyro"].astype(np.int64)

        for activity in labels:
            acc = df_acc[df_acc["activity code"] == activity].copy()
            gyr = df_gyr[df_gyr["activity code"] == activity].copy()

            time_acc = np.array(acc["timestamp-accel"])
            time_gyr = np.array(gyr["timestamp-gyro"])

            if interpol:
                # Setando o tempo inicial para 0
                if len(time_acc) > 0 and len(time_gyr) > 0:
                    time_acc = (time_acc - time_acc[0]) / 1000000000
                    time_gyr = (time_gyr - time_gyr[0]) / 1000000000

                    ### Retirando os intervalos sem amostra (periodos vazios)
                    if np.any(np.diff(time_acc)<0):
                        pos = np.nonzero(np.diff(time_acc)<0)[0].astype(int)
                        for k in pos:
                            time_acc[k+1:] = time_acc[k+1:]+time_acc[k]+1/20
                    if np.any(np.diff(time_gyr)<0):
                        pos = np.nonzero(np.diff(time_gyr)<0)[0].astype(int)
                        for k in pos:
                            time_gyr[k+1:] = time_gyr[k+1:]+time_gyr[k]+1/20

                    # Interpolando os dados
                    sigs_acc = []
                    sigs_gyr = []
                    for sig_acc, sig_gyr in zip(acc[feature_columns_acc[2:]], gyr[feature_columns_gyr[2:]]):
                        fA = np.array(acc[sig_acc])
                        fG = np.array(gyr[sig_gyr])

                        intp1 = interpolate.interp1d(time_acc, fA, kind='cubic')
                        intp2 = interpolate.interp1d(time_gyr, fG, kind='cubic')
                        nt1 = np.arange(0,time_acc[-1],1/20)
                        nt2 = np.arange(0,time_gyr[-1],1/20)
                        sigs_acc.append(intp1(nt1))
                        sigs_gyr.append(intp2(nt2))

                    tam = min(len(nt1), len(nt2))

                    new_acc = pd.DataFrame()
                    new_gyr = pd.DataFrame()

                    for x, y in zip(sigs_acc, sigs_gyr):
                        x = x[:tam]
                        y = y[:tam]
                    
                    new_acc["timestamp-accel"] = nt1[:tam]
                    new_gyr["timestamp-gyro"] = nt2[:tam]

                    for sig_acc, sig_gyr, column_acc, column_gyr in zip(sigs_acc, sigs_gyr, feature_columns_acc[2:], feature_columns_gyr[2:]):
                        new_acc[column_acc] = sig_acc[:tam]
                        new_gyr[column_gyr] = sig_gyr[:tam]
            else:
                tam = min(len(time_acc), len(time_gyr))
                new_acc = acc[feature_columns_acc[2:]].iloc[:tam]
                new_gyr = gyr[feature_columns_gyr[2:]].iloc[:tam]
                
            # Criando um dataframe com os dados de aceleração e giroscópio
            df = pd.concat([new_acc, new_gyr], axis=1)
            df["activity code"] = activity
            df["user"] = user
            df = df.dropna()

            dfs.append(df)

    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)

    for column in feature_columns_acc[2:] + feature_columns_gyr[2:]:
        df[column] = df[column].astype(np.float32)
    df["user"] = df["user"].astype(np.int32)

    return df.dropna().reset_index(drop=True)

In [3]:
# Caminho para o dataset WISDM
wisdm_path = Path("../data/raw/WISDM/wisdm-dataset/raw/phone")
# Caminho para salvar o dataset pré-processado
output_path = Path("../data/processed/WISDM")
# Cria o caminho de saída se ele não existir
output_path.mkdir(parents=True, exist_ok=True)

# Lista com as colunas que são features
feature_columns = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z",
]

# Nome das colunas que serão usada para agrupar as janelas
column_group = ["user", "activity code"]

# activity code: standard activity code
standard_activity_code_map = {
    "A": 2,
    "B": 5,
    "C": 6,
    "D": 0,
    "E": 1,
    "F": -1,
    "G": -1,
    "H": -1,
    "I": -1,
    "J": -1,
    "K": -1,   
    "L": -1,
    "M": -1,
    "O": -1,
    "P": -1,
    "Q": -1,
    "R": -1,
    "S": -1,
}

## Bruto

In [4]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path, interpol=False)

differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="timestamp-accel",
    new_column_name="accel-timestamp diff",
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pelas colunas user e activity code. As janelas são criadas para cada grupo das colunas user e activity code
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Calcula a diferença entre os tempos
# 2. Cria as janelas
# 3. Adiciona a coluna com o código da atividade
pipeline = Pipeline([differ, windowizer, standard_label_adder])

# Executa o pipeline
new_df = pipeline(dataframe)
new_df


Executing CalcTimeDiffMean
Executing Windowize


Creating windows: 100%|██████████| 584/584 [02:05<00:00,  4.65it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,user,timestamp-gyro,timestamp-accel,accel-timestamp diff,activity code,window,standard activity code
0,-0.87973,2.001495,0.450623,-2.164352,-4.332779,-0.319443,1.566452,-0.323746,-1.811676,-1.134048,...,0.050415,0.570847,0.916656,1600.0,252207962259456.0,252207710601216.0,50331648.0,A,0,2
1,0.058136,0.805832,0.392441,-3.992523,-4.075867,-0.342834,0.767914,-0.467209,-2.23732,-1.930817,...,0.564865,0.816101,0.347443,1600.0,252210982158336.0,252210730500096.0,50331648.0,A,1,2
2,-1.380219,0.191849,-1.169861,-5.252274,0.886673,1.784363,-0.943817,-2.776306,-1.056335,0.467941,...,0.786972,0.793564,-0.182907,1600.0,252214018834432.0,252213767176192.0,50331648.0,A,2,2
3,0.510223,-0.461853,-3.781677,-5.367844,-0.22345,1.547012,-0.414993,-3.513077,-1.619766,0.369415,...,0.945755,0.933792,-0.175034,1600.0,252217038733312.0,252216787075072.0,50331648.0,A,3,2
4,2.348526,-2.201248,-7.240723,-4.361862,-1.85614,0.024414,0.723221,-1.243652,-0.348114,1.888306,...,0.492676,0.721069,0.308258,1600.0,252220058632192.0,252219806973952.0,50331648.0,A,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36493,-9.302257,-8.461604,-7.321575,-7.27128,-7.204219,-6.205496,-5.455855,-4.385281,-4.066743,-4.550537,...,-0.12144,-0.124636,-0.050067,1650.0,359773639802880.0,359690055712768.0,0.0,A,69,2
36494,-9.354947,-10.918895,-12.178675,-12.746294,-12.446917,-12.216995,-12.628939,-13.306729,-11.912827,-10.360855,...,-0.128897,0.150202,0.254598,1650.0,359776055721984.0,359691263672320.0,0.0,A,70,2
36495,-9.271121,-8.820857,-9.297466,-9.24717,-8.679551,-8.595725,-8.293953,-7.676038,-6.933582,-5.961204,...,0.609331,0.588026,0.199204,1650.0,359778471641088.0,359692471631872.0,0.0,A,71,2
36496,-4.186494,-5.659431,-6.71803,-7.616162,-8.581355,-9.527389,-10.504556,-11.199112,-10.818304,-10.547667,...,0.076699,-0.079895,-0.106526,1650.0,359780887560192.0,359693679591424.0,0.0,A,72,2


## Normatizado

In [5]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path, interpol=True)

differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="timestamp-accel",
    new_column_name="accel-timestamp diff",
)

# Instancia o objeto que aplica o filtro Butterworth
butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro em que o filtro será aplicado
    fs=20                                                       # Frequência de amostragem original
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pelas colunas user e activity code. As janelas são criadas para cada grupo das colunas user e activity code
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Calcula a diferença entre os tempos
# 2. Aplica o filtro Butterworth
# 3. Cria as janelas
# 4. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        differ,
        butterworth,
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing CalcTimeDiffMean
Executing ButterworthFilter
Executing Windowize


Creating windows: 100%|██████████| 918/918 [03:48<00:00,  4.01it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,user,timestamp-gyro,timestamp-accel,accel-timestamp diff,activity code,window,standard activity code
0,0.203999,3.052405,1.529101,-1.086387,-3.412541,0.4171,2.475407,0.669701,-0.941456,-0.431584,...,-0.02722,0.314905,0.890524,1600.0,252207962259456.0,252207710601216.0,50331648.0,A,0,2
1,1.11181,1.609697,2.261287,-0.814601,-3.58796,-0.875369,1.881903,1.540785,-0.275996,-1.125939,...,0.406141,0.615971,0.795717,1600.0,252210965381120.0,252210713722880.0,50331648.0,A,1,2
2,0.448921,0.063567,1.624386,-0.52372,-3.635738,2.741032,2.867946,0.081795,-1.445302,0.490634,...,0.100644,0.608968,0.928236,1600.0,252213968502784.0,252213716844544.0,50331648.0,A,2,2
3,-0.473682,1.044933,1.709572,-1.312519,-4.159498,-0.479369,2.969187,1.993398,-1.367156,-1.302158,...,-0.015974,0.365056,1.097406,1600.0,252216971624448.0,252216719966208.0,50331648.0,A,3,2
4,0.817271,2.589154,3.346748,-2.516122,-5.412037,-1.846764,0.277373,2.061176,1.953867,0.211111,...,-0.174049,0.307581,0.470185,1600.0,252219974746112.0,252219723087872.0,50331648.0,A,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55239,-0.431281,-0.341486,-0.037555,-0.003447,0.493422,0.00033,-0.064159,0.025941,0.141851,-0.002414,...,0.16659,0.143167,0.114883,1650.0,357716350468096.0,357716216250368.0,33554432.0,S,54,-1
55240,0.531837,0.13,-0.382092,-0.316574,-0.393507,0.592278,0.98959,0.251557,0.186822,0.390787,...,-0.493232,-0.242802,0.01463,1650.0,357719336812544.0,357719236149248.0,67108864.0,S,55,-1
55241,-0.056782,1.185063,1.117809,0.215498,0.853678,0.15069,1.039669,0.764552,0.64451,0.749803,...,0.239498,0.29974,0.322814,1650.0,357722356711424.0,357722222493696.0,33554432.0,S,56,-1
55242,0.041996,-0.016277,-0.396841,-0.193471,-0.246525,-0.536402,-0.781294,-0.801998,-0.592809,-0.40295,...,-0.0708,-0.115323,-0.14952,1650.0,357725343055872.0,357725242392576.0,67108864.0,S,57,-1


# Filtra por elementos iguais

In [6]:
filter_common = FilterByCommonRows(match_columns=["user", "activity code", "window"])
new_df, new_df_standartized = filter_common(new_df, new_df_standartized)

## Balanceia e salva os dados brutos

In [7]:
# Salva os dados
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=0
)(new_df[new_df["standard activity code"] != -1])

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=0
)(train_df)

balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

ouptut_dir = output_path / "raw_balanced"
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/processed/WISDM/raw_balanced


## Balanceia e salva os dados processados

In [8]:
# Salva os dados
new_df_standartized.to_csv(output_path / "standartized_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.8,
    random_state=42
)(new_df_standartized[new_df_standartized["standard activity code"] != -1])

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.9,
    random_state=42
)(train_df)

balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df = balancer(train_df)
val_df = balancer(val_df)
test_df = balancer(test_df)

ouptut_dir = output_path / "standartized_balanced" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/processed/WISDM/standartized_balanced
