In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from dataset_processor import (
    CalcTimeDiffMean,
    ButterworthFilter,
    ResamplerPoly,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    FilterByCommonRows,
    Pipeline
)

import os
from tqdm.notebook import tqdm
from natsort import natsorted
from zipfile import ZipFile

In [2]:
# Folder that holds the RealWorld download, and the extracted dataset inside it
workspace = Path("../data/raw/RealWorld")
root = workspace / "realworld2016_dataset"

In [3]:
# Users are the per-subject folders found directly under the dataset root.
# Only directories are kept: a stray file in the root (README, .DS_Store, ...)
# would otherwise be treated as a user and break the extraction step, which
# opens root / user / "data/...".
users = natsorted([entry for entry in os.listdir(root) if os.path.isdir(root / entry)])
# Activity names as they appear in the archive and CSV file names
tarefas = ['climbingdown', 'climbingup', 'jumping', 'lying', 'running', 'sitting', 'standing', 'walking']
# Subset of activities; the order appears to match the standard activity
# codes 0-5 assigned further down in the notebook -- TODO confirm intent
SAC = ['sitting', 'standing', 'walking', 'climbingup', 'climbingdown', 'running']
# Sensor positions named in the dataset files
posicao = ['chest', 'forearm', 'head', 'shin', 'thigh', 'upperarm', 'waist']

In [4]:
# Scratch folder into which the .zip archives are unpacked
junk_dir = workspace / "junk"
junk_dir.mkdir(exist_ok=True)
# Folder that receives the reorganised, unpacked data
organized_dir = workspace / "realworld2016_dataset_organized"
organized_dir.mkdir(exist_ok=True)
# Last expression of the cell: displayed to confirm the folder exists
organized_dir.is_dir()

True

In [5]:
# Make one output folder per user inside the organized dataset folder
organized_root = workspace / "realworld2016_dataset_organized"
for user in users:
    (organized_root / user).mkdir(exist_ok=True)

In [6]:
def getfiles(user, activity, workspace, root):
    """Unpack one user's accelerometer and gyroscope archives for an activity.

    For each sensor ("acc", "gyr") the archive
    ``root / user / "data/<sensor>_<activity>_csv.zip"`` is extracted into the
    scratch folder ``workspace / "junk"``. Every entry of the scratch folder
    whose name contains "zip" is then extracted as well (nested archives).
    Extracted files whose name contains ``activity`` and does not contain
    "zip" are moved to ``workspace / "realworld2016_dataset_organized" / user``;
    every other file is deleted and the scratch folder is removed.

    Args:
        user: Name of the user's folder inside ``root``.
        activity: Activity name as used in the archive and file names.
        workspace: ``pathlib.Path`` holding the scratch and organized folders.
        root: ``pathlib.Path`` containing one sub-folder per user.

    Raises:
        FileNotFoundError: If a sensor archive does not exist.
        OSError: If the scratch folder is not empty when it is removed
            (for instance because an archive contained sub-directories).
    """
    junk = workspace / "junk"
    destination = workspace / "realworld2016_dataset_organized" / user
    # Do not rely on other cells having created the destination beforehand
    destination.mkdir(parents=True, exist_ok=True)

    for sensor in ["acc", "gyr"]:
        archive_path = root / user / f"data/{sensor}_{activity}_csv.zip"
        with ZipFile(archive_path, 'r') as archive:
            # An archive without members would leave the scratch folder
            # uncreated, so make sure it exists before listing it.
            junk.mkdir(parents=True, exist_ok=True)
            archive.extractall(junk)

        # Some archives contain further archives; unpack those too
        for name in os.listdir(junk):
            if 'zip' in name:
                with ZipFile(junk / name, 'r') as nested:
                    nested.extractall(junk)

        # Keep the files of the requested activity, discard everything else
        for name in os.listdir(junk):
            extracted = junk / name
            if not os.path.isfile(extracted):
                continue
            if activity in name and 'zip' not in name:
                # os.replace overwrites an existing target on every platform,
                # so re-running the extraction does not fail on Windows.
                os.replace(extracted, destination / name)
            else:
                os.remove(extracted)

        os.rmdir(junk)

# Unpack the archives of every (user, activity) pair of interest,
# visiting all activities of one user before moving on to the next user
pairs = ((user, activity) for user in users for activity in tarefas)
for user, activity in pairs:
    getfiles(user, activity, workspace, root)

In [7]:
# Give every user one sub-folder per sensor: accelerometer and gyroscope
for user in users:
    for sensor in ("acc", "gyr"):
        (workspace / "realworld2016_dataset_organized" / user / sensor).mkdir(exist_ok=True)

In [8]:
# Sort each user's files into the matching sensor sub-folder.
# A file goes to "acc" when its name contains 'acc' and to "gyr" when it
# contains 'Gyr'; the sub-folders themselves are skipped by the isfile check.
for user in users:
    user_dir = workspace / "realworld2016_dataset_organized" / user
    for name in os.listdir(user_dir):
        for marker, subfolder in (("acc", "acc"), ("Gyr", "gyr")):
            source = user_dir / name
            if marker in name and os.path.isfile(source):
                os.rename(source, user_dir / subfolder / name)

In [9]:
# Check that every user has as many accelerometer files as gyroscope files.
# flag stays 1 while all users match and becomes -1 on the first mismatch.
flag = 1
for user in users:
    user_dir = workspace / "realworld2016_dataset_organized" / user
    files_acc = os.listdir(user_dir / "acc")
    files_gyr = os.listdir(user_dir / "gyr")
    if len(files_acc) != len(files_gyr):
        print(f"User {user} has {len(files_acc)} acc files and {len(files_gyr)} gyr files")
        flag = -1
if flag == 1:
    print("All users have the same number of acc and gyr files")


All users have the same number of acc and gyr files


In [10]:
def read_realworld(workspace, users, positions=None, max_length_gap=200):
    """Read the organized RealWorld CSV files into a single DataFrame.

    For every requested position and user, the accelerometer files in
    ``workspace / "realworld2016_dataset_organized" / user / "acc"`` whose name
    contains the position are read together with the gyroscope file found at
    the same index of the sorted ``gyr`` folder listing. Both recordings are
    truncated to the shorter length and joined side by side.

    The returned DataFrame has the columns, in this order:
    ``index``, ``accel-start-time``, ``accel-x``, ``accel-y``, ``accel-z``,
    ``gyro-start-time``, ``gyro-x``, ``gyro-y``, ``gyro-z``, ``user``,
    ``position`` and ``activity code`` (the activity name found in the file
    name). The sensor columns are the five CSV columns renamed by position.

    A recording is skipped when the two files differ in length by
    ``max_length_gap`` rows or more, when it has no rows, or when the joined
    frame contains any NaN.

    Args:
        workspace: ``pathlib.Path`` containing the organized dataset folder.
        users: Iterable of user folder names.
        positions: Sensor positions to read; defaults to
            ``['thigh', 'upperarm', 'waist']``.
        max_length_gap: Row-count difference from which a recording pair is
            considered inconsistent and skipped.

    Returns:
        pandas.DataFrame with one row per sample; empty (but with the columns
        above) when nothing could be read.

    Raises:
        ValueError: If a selected file name contains no known activity, or if
            a CSV does not have exactly five columns.
        IndexError: If a user has fewer gyroscope than accelerometer files.
    """
    # Activities that can appear in a file name
    tarefas = ['climbingdown', 'climbingup', 'jumping', 'lying', 'running', 'sitting', 'standing', 'walking']

    # For now only these positions are read unless the caller asks otherwise
    posicao = ['thigh', 'upperarm', 'waist'] if positions is None else list(positions)

    # Column names assigned to the five columns of each CSV
    feature_acc = ["index", "accel-start-time", "accel-x", "accel-y", "accel-z"]
    feature_gyr = ["index", "gyro-start-time", "gyro-x", "gyro-y", "gyro-z"]
    columns = feature_acc + feature_gyr[1:] + ['user', 'position', 'activity code']

    organized = workspace / "realworld2016_dataset_organized"
    dfs = []

    for p in posicao:
        for user in users:
            acc_dir = organized / user / "acc"
            gyr_dir = organized / user / "gyr"
            filesacc = sorted(os.listdir(acc_dir))
            filesgyr = sorted(os.listdir(gyr_dir))

            for i, acc_name in enumerate(filesacc):
                if p not in acc_name:
                    continue

                # Previously an unmatched name silently received the last
                # activity of the list; fail loudly instead of mislabelling.
                activity = next((t for t in tarefas if t in acc_name), None)
                if activity is None:
                    raise ValueError(f"No known activity in file name: {acc_name}")

                acc = pd.read_csv(acc_dir / acc_name)
                acc.columns = feature_acc
                # NOTE(review): pairing is by index in the sorted listings, so
                # it assumes both folders hold the same recordings -- confirm.
                gyr = pd.read_csv(gyr_dir / filesgyr[i])
                gyr.columns = feature_gyr

                # Inconsistent pair: lengths differ too much to be aligned
                if abs(acc.shape[0] - gyr.shape[0]) >= max_length_gap:
                    continue

                tam = min(acc.shape[0], gyr.shape[0])
                if tam == 0:
                    continue

                # Accelerometer columns followed by the gyroscope columns
                # (without the duplicated "index" column)
                df = pd.concat(
                    [acc[feature_acc].iloc[:tam], gyr[feature_gyr[1:]].iloc[:tam]],
                    axis=1,
                )
                df['user'] = user
                df['position'] = p
                df['activity code'] = activity

                # A single NaN discards the whole recording
                if df.isnull().values.any():
                    continue

                dfs.append(df)

    if not dfs:
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs, ignore_index=True)

In [11]:
# Location of the reorganised RealWorld dataset
realworld_path = Path("../data/raw/RealWorld/realworld2016_dataset_organized")
# Destination of the processed data, created together with any missing parents
output_path = Path("../data/processed/RealWorld")
output_path.mkdir(parents=True, exist_ok=True)

# Columns used as features: the three axes of each sensor
feature_columns = [
    "accel-x", "accel-y", "accel-z",
    "gyro-x", "gyro-y", "gyro-z",
]

# Columns that identify one recording; windows are built within each group
column_group = ["user", "activity code", "position"]

# Standard activity code of each activity, in the same order as `tarefas`
# (-1 marks activities that have no standard code)
maping = [4, 3, -1, -1, 5, 0, 1, 2]
standard_activity_code_map = {
    activity: maping[position] for position, activity in enumerate(tarefas)
}

standard_activity_code_map


{'climbingdown': 4,
 'climbingup': 3,
 'jumping': -1,
 'lying': -1,
 'running': 5,
 'sitting': 0,
 'standing': 1,
 'walking': 2}

In [12]:
# dataframe

## Bruto

In [13]:
# Load the RealWorld dataset into a single DataFrame
dataframe = read_realworld(workspace, users)

# Stage 1: time difference of "accel-start-time", computed within each
# (user, activity code, position) group and stored in "timestamp diff".
# The filter below is disabled, as in the original notebook:
#     filter_predicate=lambda x: (
#         (x["timestamp diff"] < 1) & (x["timestamp diff"] > -1.0)
#     ).all()
differ = CalcTimeDiffMean(
    column_to_diff="accel-start-time",
    new_column_name="timestamp diff",
    groupby_column=column_group,
)

# Stage 2: non-overlapping windows of 150 samples per group, built from the
# feature columns
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=150,
    samples_per_overlap=0,
)

# Stage 3: adds the standard activity code column from the activity-name map
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Pipeline order: time difference -> windows -> standard activity code
pipeline = Pipeline([differ, windowizer, standard_label_adder])

# Run the pipeline and display the result
new_df = pipeline(dataframe)
new_df

Executing CalcTimeDiffMean
Executing Windowize


Creating windows: 100%|██████████| 357/357 [03:54<00:00,  1.52it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-149,gyro-start-time,activity code,user,position,timestamp diff,index,accel-start-time,window,standard activity code
0,-0.72245,-0.744597,-0.738611,-0.743998,-0.751181,-0.746392,-0.729633,-0.73502,-0.754772,-0.732027,...,0.18727,1435996968068.0,climbingdown,proband1,thigh,49.0,2,1435996968069,0,4
1,-0.740407,-0.73921,-0.747589,-0.7434,-0.752378,-0.730232,-0.724845,-0.734421,-0.774524,-0.742801,...,0.191546,1435996971088.0,climbingdown,proband1,thigh,65.0,152,1435996971089,1,4
2,-0.73083,-0.754772,-0.728436,-0.731429,-0.752378,-0.724246,-0.709282,-0.742202,-0.736816,-0.731429,...,0.192462,1435996974050.0,climbingdown,proband1,thigh,23.0,302,1435996974051,2,4
3,-0.749385,-0.731429,-0.752976,-0.737414,-0.730232,-0.729034,-0.742202,-0.731429,-0.738611,-0.765546,...,0.260268,1435996977088.0,climbingdown,proband1,thigh,57.0,452,1435996977089,3,4
4,-0.388459,-0.413,-0.589572,-0.888847,-1.364695,-1.27611,-0.851139,0.303465,1.134253,1.686715,...,0.128322,1435996980051.0,climbingdown,proband1,thigh,7.0,602,1435996980052,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67062,10.401609,9.780314,8.334216,8.604162,10.982801,13.145963,15.128362,15.462952,13.983336,11.419145,...,-0.245714,1436620754166.0,walking,proband9,waist,19.0,30602,1436620754093,204,2
67063,16.109985,13.925276,11.432912,9.660604,8.698134,8.591593,9.77373,10.96305,12.142194,12.602479,...,-0.156528,1436620757117.0,walking,proband9,waist,18.0,30752,1436620757096,205,2
67064,17.18498,16.521189,13.290214,10.052056,7.885304,8.237849,9.449914,11.432912,13.253702,13.499706,...,-0.028857,1436620760121.0,walking,proband9,waist,10.0,30902,1436620760098,206,2
67065,8.787319,8.893861,9.142858,9.667787,10.164583,10.432734,10.344149,10.085575,9.689933,9.475652,...,-0.053902,1436620763124.0,walking,proband9,waist,17.0,31052,1436620763102,207,2


## Normatizado com reamostrador ResamplerPoly

In [14]:
# Load the RealWorld dataset into a single DataFrame
dataframe = read_realworld(workspace, users)

# Stage 1: time difference of "accel-start-time", computed within each
# (user, activity code, position) group and stored in "timestamp diff".
# The filter below is disabled, as in the original notebook:
#     filter_predicate=lambda x: (
#         (x["timestamp diff"] < 1) & (x["timestamp diff"] > -1.0)
#     ).all()
differ = CalcTimeDiffMean(
    column_to_diff="accel-start-time",
    new_column_name="timestamp diff",
    groupby_column=column_group,
)

# Stage 2: Butterworth filter on the accelerometer axes only; fs is the
# sampling frequency of the input, assumed to be 50 Hz
butterworth = ButterworthFilter(
    fs=50,
    axis_columns=["accel-x", "accel-y", "accel-z"],
)

# Stage 3: resampling of the feature columns within each group.
# up/down = 2/5 takes the assumed constant 50 Hz input to 20 Hz
# (presumably the up- and down-sampling factors -- TODO confirm).
resampler = ResamplerPoly(
    groupby_column=column_group,
    features_to_select=feature_columns,
    up=2,
    down=5,
)

# Stage 4: non-overlapping windows of 60 samples per group, built from the
# feature columns
windowizer = Windowize(
    groupby_column=column_group,
    features_to_select=feature_columns,
    samples_per_window=60,
    samples_per_overlap=0,
)

# Stage 5: adds the standard activity code column from the activity-name map
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Pipeline order: time difference -> Butterworth filter -> resampling
# -> windows -> standard activity code
pipeline = Pipeline([differ, butterworth, resampler, windowizer, standard_label_adder])

# Run the pipeline and display the result
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing CalcTimeDiffMean
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 357/357 [00:06<00:00, 52.17it/s]


Executing Windowize


Creating windows: 100%|██████████| 357/357 [03:47<00:00,  1.57it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-start-time,activity code,user,position,timestamp diff,index,level_0,accel-start-time,window,standard activity code
0,-0.00324,-0.020704,-0.013915,-0.01096,-0.005935,-0.023875,-0.015348,-0.019469,-0.001657,0.016329,...,1435996968068.0,climbingdown,proband1,thigh,49.0,2,0,1435996968069,0,4
1,0.000837,-0.005386,0.01028,-0.003999,-0.012274,0.004789,0.003616,0.01762,0.003748,-0.025447,...,1435996969245.0,climbingdown,proband1,thigh,1.0,62,60,1435996969245,1,4
2,0.003087,-0.008571,0.005173,0.000152,-0.002865,-0.004012,-0.011903,-0.010582,-0.005746,-0.01199,...,1435996970444.0,climbingdown,proband1,thigh,17.0,122,120,1435996970445,2,4
3,0.046743,0.058552,0.055191,0.040569,0.008711,0.029653,0.011902,-0.052312,-0.060942,-0.070503,...,1435996971643.0,climbingdown,proband1,thigh,20.0,182,180,1435996971643,3,4
4,-0.958723,-1.334095,-1.784825,0.023354,2.58109,1.464809,-1.438176,-0.464044,0.973519,0.058859,...,1435996972846.0,climbingdown,proband1,thigh,22.0,242,240,1435996972847,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67068,0.481085,-1.27123,3.211321,5.193695,-1.059502,2.655824,4.018883,-1.796293,-5.298416,-4.613349,...,1436620386574.0,walking,proband9,waist,20.0,12242,8764033,1436620386549,204,2
67069,5.416058,0.657012,-1.261492,1.687057,2.630411,-0.570247,-5.18147,-3.940197,0.374687,0.237346,...,1436620387775.0,walking,proband9,waist,17.0,12302,8764093,1436620387752,205,2
67070,7.136875,1.922444,-2.007572,2.777827,1.764299,-3.235877,-3.662546,-1.60652,0.446182,-0.434123,...,1436620388971.0,walking,proband9,waist,21.0,12362,8764153,1436620388950,206,2
67071,-1.221008,-0.321139,0.48833,0.083197,-0.457829,0.244022,0.21995,-0.240095,-0.029459,0.001245,...,1436620390175.0,walking,proband9,waist,20.0,12422,8764213,1436620390154,207,2


## Filtra por elementos iguais

In [15]:
# Columns that must match for a row to count as present in both frames
common_keys = ["user", "window", "activity code", "position"]
filter_common = FilterByCommonRows(match_columns=common_keys)
# Both frames are replaced by their filtered versions
new_df, new_df_standartized = filter_common(new_df, new_df_standartized)

## Balanceia e salva os dados brutos

In [16]:
# Save the full raw frame before any splitting or balancing
new_df.to_csv(output_path / "raw_unbalanced.csv", index=False)

# Rows without a standard activity code (-1) are left out of the splits
labelled_df = new_df[new_df["standard activity code"] != -1]

# First split by user into train and test (train_size=0.8) ...
train_test_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42
)
train_df, test_df = train_test_splitter(labelled_df)

# ... then split the train part by user into train and validation (train_size=0.9)
train_val_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42
)
train_df, val_df = train_val_splitter(train_df)

# Balance each split on the standard activity code, in the order train, validation, test
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = [balancer(part) for part in (train_df, val_df, test_df)]

ouptut_dir = output_path / "raw_balanced" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

# One CSV per split
for file_name, part in (("train.csv", train_df), ("validation.csv", val_df), ("test.csv", test_df)):
    part.to_csv(ouptut_dir / file_name, index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/processed/RealWorld/raw_balanced


## Balanceia e salva os dados processados

In [17]:
# Save the full standardized frame before any splitting or balancing
new_df_standartized.to_csv(output_path / "standartized_unbalanced.csv", index=False)

# Rows without a standard activity code (-1) are left out of the splits
labelled_df = new_df_standartized[new_df_standartized["standard activity code"] != -1]

# First split by user into train and test (train_size=0.8) ...
train_test_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42
)
train_df, test_df = train_test_splitter(labelled_df)

# ... then split the train part by user into train and validation (train_size=0.9)
train_val_splitter = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42
)
train_df, val_df = train_val_splitter(train_df)

# Balance each split on the standard activity code, in the order train, validation, test
balancer = BalanceToMinimumClass(class_column="standard activity code")
train_df, val_df, test_df = [balancer(part) for part in (train_df, val_df, test_df)]

ouptut_dir = output_path / "standartized_balanced" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

# One CSV per split
for file_name, part in (("train.csv", train_df), ("validation.csv", val_df), ("test.csv", test_df)):
    part.to_csv(ouptut_dir / file_name, index=False)
print(f"Data saved at {ouptut_dir}")

Data saved at ../data/processed/RealWorld/standartized_balanced
