In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import interpolate

from dataset_processor import (
    ButterworthFilter,
    CalcTimeDiffMean,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    FilterByCommonRows,
    BalanceToMinimumClass,
    Pipeline
)

In [2]:
def read_wisdm(wisdm_path: str, interpol = True) -> pd.DataFrame:
    """Le o dataset do motionsense e retorna um dataframe com os dados (vindos de todos os arquivos txt)
    O dataframe retornado possui as seguintes colunas:
    - activity code: Código da atividade
    - user: Usuário que realizou a atividade
    - timestamp-accel: Timestamp da aceleração
    - accel-x: Aceleração no eixo x
    - accel-y: Aceleração no eixo y
    - accel-z: Aceleração no eixo z
    - timestamp-gyro: Timestamp do giroscópio
    - gyro-x: Giroscópio no eixo x
    - gyro-y: Giroscópio no eixo y
    - gyro-z: Giroscópio no eixo z

    Parameters
    ----------
    wisdm_path : str
        Caminho para o dataset WISDM

    Returns
    -------
    pd.DataFrame
        Dataframe com os dados do dataset WISDM
    """
    
    feature_columns_acc = [
        "user",
        "activity code",
        "timestamp-accel",
        "accel-x",
        "accel-y",
        "accel-z",
    ]
    feature_columns_gyr = [
        "user",
        "activity code",
        "timestamp-gyro",
        "gyro-x",
        "gyro-y",
        "gyro-z",
    ]

    # Lista com letras maiúsculas de A até S sem o N
    labels = [chr(i) for i in range(65, 84) if chr(i) != "N"]

    dfs = []
    window = 1
    for user in range(1600,1651):
        window = 1
        df_acc = pd.read_csv(wisdm_path / f"accel/data_{user}_accel_phone.txt", sep=",|;", header=None, engine="python")
        df_acc = df_acc[df_acc.columns[0:-1]]
        df_acc.columns = feature_columns_acc
        df_acc["timestamp-accel"] = df_acc["timestamp-accel"].astype(np.int64)


        df_gyr = pd.read_csv(wisdm_path / f"gyro/data_{user}_gyro_phone.txt", sep=",|;", header=None, engine="python")
        df_gyr = df_gyr[df_gyr.columns[0:-1]]
        df_gyr.columns = feature_columns_gyr
        df_gyr["timestamp-gyro"] = df_gyr["timestamp-gyro"].astype(np.int64)

        for activity in labels:
            acc = df_acc[df_acc["activity code"] == activity].copy()
            gyr = df_gyr[df_gyr["activity code"] == activity].copy()

            time_acc = np.array(acc["timestamp-accel"])
            time_gyr = np.array(gyr["timestamp-gyro"])

            if interpol:
                # Setando o tempo inicial para 0
                if len(time_acc) > 0 and len(time_gyr) > 0:
                    time_acc = (time_acc - time_acc[0]) / 1000000000
                    time_gyr = (time_gyr - time_gyr[0]) / 1000000000

                    ### Retirando os intervalos sem amostra (periodos vazios)
                    if np.any(np.diff(time_acc)<0):
                        pos = np.nonzero(np.diff(time_acc)<0)[0].astype(int)
                        for k in pos:
                            time_acc[k+1:] = time_acc[k+1:]+time_acc[k]+1/20
                    if np.any(np.diff(time_gyr)<0):
                        pos = np.nonzero(np.diff(time_gyr)<0)[0].astype(int)
                        for k in pos:
                            time_gyr[k+1:] = time_gyr[k+1:]+time_gyr[k]+1/20

                    # Interpolando os dados
                    sigs_acc = []
                    sigs_gyr = []
                    for sig_acc, sig_gyr in zip(acc[feature_columns_acc[2:]], gyr[feature_columns_gyr[2:]]):
                        fA = np.array(acc[sig_acc])
                        fG = np.array(gyr[sig_gyr])

                        intp1 = interpolate.interp1d(time_acc, fA, kind='cubic')
                        intp2 = interpolate.interp1d(time_gyr, fG, kind='cubic')
                        nt1 = np.arange(0,time_acc[-1],1/20)
                        nt2 = np.arange(0,time_gyr[-1],1/20)
                        sigs_acc.append(intp1(nt1))
                        sigs_gyr.append(intp2(nt2))

                    tam = min(len(nt1), len(nt2))

                    new_acc = pd.DataFrame()
                    new_gyr = pd.DataFrame()

                    for x, y in zip(sigs_acc, sigs_gyr):
                        x = x[:tam]
                        y = y[:tam]
                    
                    new_acc["timestamp-accel"] = nt1[:tam]
                    new_gyr["timestamp-gyro"] = nt2[:tam]

                    for sig_acc, sig_gyr, column_acc, column_gyr in zip(sigs_acc, sigs_gyr, feature_columns_acc[2:], feature_columns_gyr[2:]):
                        new_acc[column_acc] = sig_acc[:tam]
                        new_gyr[column_gyr] = sig_gyr[:tam]
            else:
                tam = min(len(time_acc), len(time_gyr))
                new_acc = acc[feature_columns_acc[2:]].iloc[:tam]
                new_gyr = gyr[feature_columns_gyr[2:]].iloc[:tam]
                
            # Criando um dataframe com os dados de aceleração e giroscópio
            df = pd.concat([new_acc, new_gyr], axis=1)
            df["activity code"] = activity
            df["user"] = user
            df["window"] = window
            df = df.dropna()

            dfs.append(df)

    df = pd.concat(dfs)
    df.reset_index(inplace=True, drop=True)

    for column in feature_columns_acc[2:] + feature_columns_gyr[2:]:
        df[column] = df[column].astype(np.float32)
    df["user"] = df["user"].astype(np.int32)

    return df.dropna().reset_index(drop=True)

In [3]:
# Caminho para o dataset WISDM
wisdm_path = Path("../data/original/WISDM/wisdm-dataset/raw/phone")

# Lista com as colunas que são features
feature_columns = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z",
]

# Nome das colunas que serão usada para agrupar as janelas
column_group = ["user", "activity code", "window"]

# activity code: standard activity code
standard_activity_code_map = {
    "A": 2,
    "B": 5,
    "C": 6,
    "D": 0,
    "E": 1,
    "F": -1,
    "G": -1,
    "H": -1,
    "I": -1,
    "J": -1,
    "K": -1,   
    "L": -1,
    "M": -1,
    "O": -1,
    "P": -1,
    "Q": -1,
    "R": -1,
    "S": -1,
}

## Bruto

In [4]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path, interpol=True)

differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="timestamp-accel",
    new_column_name="accel-timestamp diff",
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pelas colunas user e activity code. As janelas são criadas para cada grupo das colunas user e activity code
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Calcula a diferença entre os tempos
# 2. Cria as janelas
# 3. Adiciona a coluna com o código da atividade
pipeline = Pipeline([differ, windowizer, standard_label_adder])

# Executa o pipeline
new_df = pipeline(dataframe)
new_df


Executing CalcTimeDiffMean
Executing Windowize


Creating windows: 100%|██████████| 918/918 [02:27<00:00,  6.23it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,activity code,accel-timestamp diff,user,window,timestamp-accel,timestamp-gyro,standard activity code
0,-0.900995,1.988477,0.504245,-2.074267,-4.365631,-0.50346,1.585056,-0.192807,-1.778517,-1.245609,...,-0.02722,0.314905,0.890524,A,50331648.0,1600,0,252207710601216.0,252207962259456.0,2
1,-0.059554,0.42926,1.072248,-2.011736,-4.79265,-2.087036,0.663866,0.317017,-1.504834,-2.359169,...,0.406141,0.615971,0.795717,A,50331648.0,1600,1,252210713722880.0,252210965381120.0,2
2,-0.855499,-1.242746,0.316417,-1.833108,-4.946311,1.429507,1.555702,-1.230934,-2.758278,-0.822352,...,0.100644,0.608968,0.928236,A,50331648.0,1600,2,252213716844544.0,252213968502784.0,2
3,-1.887573,-0.375515,0.282729,-2.745547,-5.598461,-1.923976,1.519264,0.538521,-2.826598,-2.765749,...,-0.015974,0.365056,1.097406,A,50331648.0,1600,3,252216719966208.0,252216971624448.0,2
4,-0.791543,0.976555,1.730751,-4.135104,-7.03357,-3.470397,-1.347896,0.434743,0.326745,-1.416224,...,-0.174049,0.307581,0.470185,A,50331648.0,1600,4,252219723087872.0,252219974746112.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55239,-8.585549,-8.468535,-8.138549,-8.079633,-7.559274,-8.03026,-8.07408,-7.964792,-7.831212,-7.95935,...,0.16659,0.143167,0.114883,S,33554432.0,1650,54,357716216250368.0,357716350468096.0,-1
55240,-7.815167,-8.233317,-8.762104,-8.713644,-8.807968,-7.839873,-7.460501,-8.216674,-8.299685,-8.114063,...,-0.493232,-0.242802,0.01463,S,67108864.0,1650,55,357719236149248.0,357719336812544.0,-1
55241,-8.477044,-7.227965,-7.288946,-8.185891,-7.543177,-8.242362,-7.35019,-7.622583,-7.740208,-7.632626,...,0.239498,0.29974,0.322814,S,33554432.0,1650,56,357722222493696.0,357722356711424.0,-1
55242,-7.660219,-7.764132,-8.19314,-8.04065,-8.146636,-8.491104,-8.791851,-8.869278,-8.717289,-8.584724,...,-0.0708,-0.115323,-0.14952,S,67108864.0,1650,57,357725242392576.0,357725343055872.0,-1


## Normatizado

In [5]:
# Lê o dataset
dataframe = read_wisdm(wisdm_path, interpol=True)

differ = CalcTimeDiffMean(
    groupby_column=column_group,
    column_to_diff="timestamp-accel",
    new_column_name="accel-timestamp diff",
)

# Instancia o objeto que aplica o filtro Butterworth
butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],             # Nome das colunas do aceletômetro em que o filtro será aplicado
    fs=20                                                       # Frequência de amostragem original
)

# Instancia o objeto que cria as janelas
windowizer = Windowize(
    features_to_select=feature_columns,                         # Nome das colunas que serão usadas como features
    samples_per_window=60,                                      # Numero de amostras por janela 
    samples_per_overlap=0,                                      # Numero de amostras que se sobrepõem
    groupby_column=column_group,                                # Agrupa pelas colunas user e activity code. As janelas são criadas para cada grupo das colunas user e activity code
)

# Instancia o objeto que adiciona a coluna com o código da atividade
standard_label_adder = AddStandardActivityCode(standard_activity_code_map)

# Cria o pipeline
# 1. Calcula a diferença entre os tempos
# 2. Aplica o filtro Butterworth
# 3. Cria as janelas
# 4. Adiciona a coluna com o código da atividade
pipeline = Pipeline(
    [
        differ,
        butterworth,
        windowizer,
        standard_label_adder
    ]
)

# Executa o pipeline
new_df_standartized = pipeline(dataframe)
new_df_standartized

Executing CalcTimeDiffMean
Executing ButterworthFilter
Executing Windowize


Creating windows: 100%|██████████| 918/918 [02:32<00:00,  6.00it/s]


Executing AddStandardActivityCode


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-57,gyro-z-58,gyro-z-59,activity code,accel-timestamp diff,user,window,timestamp-accel,timestamp-gyro,standard activity code
0,0.203999,3.052405,1.529101,-1.086387,-3.412541,0.4171,2.475407,0.669701,-0.941456,-0.431584,...,-0.02722,0.314905,0.890524,A,50331648.0,1600,0,252207710601216.0,252207962259456.0,2
1,1.11181,1.609697,2.261287,-0.814601,-3.58796,-0.875369,1.881903,1.540785,-0.275996,-1.125939,...,0.406141,0.615971,0.795717,A,50331648.0,1600,1,252210713722880.0,252210965381120.0,2
2,0.448921,0.063567,1.624386,-0.52372,-3.635738,2.741032,2.867946,0.081795,-1.445302,0.490634,...,0.100644,0.608968,0.928236,A,50331648.0,1600,2,252213716844544.0,252213968502784.0,2
3,-0.473682,1.044933,1.709572,-1.312519,-4.159498,-0.479369,2.969187,1.993398,-1.367156,-1.302158,...,-0.015974,0.365056,1.097406,A,50331648.0,1600,3,252216719966208.0,252216971624448.0,2
4,0.817271,2.589154,3.346748,-2.516122,-5.412037,-1.846764,0.277373,2.061176,1.953867,0.211111,...,-0.174049,0.307581,0.470185,A,50331648.0,1600,4,252219723087872.0,252219974746112.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55239,-0.431281,-0.341486,-0.037555,-0.003447,0.493422,0.00033,-0.064159,0.025941,0.141851,-0.002414,...,0.16659,0.143167,0.114883,S,33554432.0,1650,54,357716216250368.0,357716350468096.0,-1
55240,0.531837,0.13,-0.382092,-0.316574,-0.393507,0.592278,0.98959,0.251557,0.186822,0.390787,...,-0.493232,-0.242802,0.01463,S,67108864.0,1650,55,357719236149248.0,357719336812544.0,-1
55241,-0.056782,1.185063,1.117809,0.215498,0.853678,0.15069,1.039669,0.764552,0.64451,0.749803,...,0.239498,0.29974,0.322814,S,33554432.0,1650,56,357722222493696.0,357722356711424.0,-1
55242,0.041996,-0.016277,-0.396841,-0.193471,-0.246525,-0.536402,-0.781294,-0.801998,-0.592809,-0.40295,...,-0.0708,-0.115323,-0.14952,S,67108864.0,1650,57,357725242392576.0,357725343055872.0,-1


# Função de sanidade para o balanceamento

In [6]:
def sanity_function(train_df, val_df, test_df):
    train_size = train_df.shape[0]
    val_size = val_df.shape[0]
    test_size = test_df.shape[0]
    total = train_size + val_size + test_size

    print(f"Train size: {train_size} ({train_size/total*100:.2f}%)")
    print(f"Validation size: {val_size} ({val_size/total*100:.2f}%)")
    print(f"Test size: {test_size} ({test_size/total*100:.2f}%)")

    print(f"Train activities: {train_df['standard activity code'].unique()}")
    print(f"Validation activities: {val_df['standard activity code'].unique()}")
    print(f"Test activities: {test_df['standard activity code'].unique()}")

    dataframes = {
        "Train": train_df,
        "Validation": val_df,
        "Test": test_df
    }
    for name, df in dataframes.items():
        users = df['user'].unique()
        activities = df['standard activity code'].unique()

        tam = len(df[(df["user"] == users[0]) & (df["standard activity code"] == activities[0])])
        flag = True
        for user in users:
            for activity in activities:
                if len(df[(df["user"] == user) & (df["standard activity code"] == activity)]) != tam:
                    print(f"User {user} activity {activity} has different size")
                    flag = False
        if flag:
            print(f"All users have the same size per activity in {name} dataset - Samples per user and activity: {tam}")

    users = train_df['user'].unique()
    activities = train_df['standard activity code'].unique()

    print(f"Users in train: {train_df['user'].unique()}")
    print(f"Users in validation: {val_df['user'].unique()}")
    print(f"Users in test: {test_df['user'].unique()}")

# Filtra por elementos iguais

In [7]:
filter_common = FilterByCommonRows(match_columns=["user", "activity code", "window"])
new_df, new_df_standartized = filter_common(new_df, new_df_standartized)

## Balanceia e salva os dados brutos

In [8]:
output_path_balanced = Path("../data/raw_balanced")
output_path_balanced_standartized = Path("../data/standartized_balanced")
output_path_unbalanced = Path("../data/unbalanced")

# Salva os dados
ouptut_dir = output_path_unbalanced / "KuHar"
ouptut_dir.mkdir(parents=True, exist_ok=True)
new_df.to_csv(output_path_unbalanced / "KuHar/raw_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
balancer = BalanceToMinimumClass(class_column="standard activity code")
new_df_balanced = balancer(new_df[new_df["standard activity code"] != -1])

train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.8,
    random_state=42
)(new_df_balanced)

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.9,
    random_state=42
)(train_df)

ouptut_dir = output_path_balanced / "KuHar" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

sanity_function(train_df, val_df, test_df)

Data saved at ../data/raw_balanced/KuHar
Train size: 10620 (70.59%)
Validation size: 1180 (7.84%)
Test size: 3245 (21.57%)
Train activities: [2 5 6 0 1]
Validation activities: [2 5 6 0 1]
Test activities: [2 5 6 0 1]
All users have the same size per activity in Train dataset - Samples per user and activity: 59
All users have the same size per activity in Validation dataset - Samples per user and activity: 59
All users have the same size per activity in Test dataset - Samples per user and activity: 59
Users in train: [1600 1603 1604 1609 1610 1611 1613 1616 1618 1619 1620 1621 1622 1624
 1626 1627 1628 1629 1630 1631 1632 1633 1635 1636 1637 1638 1639 1641
 1642 1643 1644 1645 1646 1648 1649 1650]
Users in validation: [1602 1612 1623 1625]
Users in test: [1601 1605 1606 1607 1608 1614 1615 1617 1634 1640 1647]


## Balanceia e salva os dados processados

In [9]:
# Salva os dados
new_df_standartized.to_csv(output_path_unbalanced / "KuHar/standartized_unbalanced.csv", index=False)

# Balanceia os dados (exlcuindo as linhas com atividade -1)
balancer = BalanceToMinimumClass(class_column="standard activity code")
new_df_standartized_balanced = balancer(new_df_standartized[new_df_standartized["standard activity code"] != -1])

train_df, test_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.8,
    random_state=42
)(new_df_standartized_balanced)

train_df, val_df = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user", 
    class_column="standard activity code", 
    train_size=0.9,
    random_state=42
)(train_df)

ouptut_dir = output_path_balanced_standartized / "KuHar" 
ouptut_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(ouptut_dir / "train.csv", index=False)
val_df.to_csv(ouptut_dir / "validation.csv", index=False)
test_df.to_csv(ouptut_dir / "test.csv", index=False)
print(f"Data saved at {ouptut_dir}")

sanity_function(train_df, val_df, test_df)

Data saved at ../data/standartized_balanced/KuHar
Train size: 10620 (70.59%)
Validation size: 1180 (7.84%)
Test size: 3245 (21.57%)
Train activities: [2 5 6 0 1]
Validation activities: [2 5 6 0 1]
Test activities: [2 5 6 0 1]
All users have the same size per activity in Train dataset - Samples per user and activity: 59
All users have the same size per activity in Validation dataset - Samples per user and activity: 59
All users have the same size per activity in Test dataset - Samples per user and activity: 59
Users in train: [1600 1603 1604 1609 1610 1611 1613 1616 1618 1619 1620 1621 1622 1624
 1626 1627 1628 1629 1630 1631 1632 1633 1635 1636 1637 1638 1639 1641
 1642 1643 1644 1645 1646 1648 1649 1650]
Users in validation: [1602 1612 1623 1625]
Users in test: [1601 1605 1606 1607 1608 1614 1615 1617 1634 1640 1647]
