In [23]:
from pathlib import Path
import pandas as pd
import numpy as np

from librep.datasets.har.loaders import PandasMultiModalLoader

In [93]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from umap import UMAP
from librep.transforms.fft import FFT
from librep.datasets.multimodal.operations import DatasetWindowedTransform
from sklearn.ensemble import RandomForestClassifier
from umap import UMAP
from librep.datasets.multimodal.operations import (
    DatasetFitter,
    DatasetPredicter,
    DatasetWindowedTransform,
    DatasetY,
    DatasetEvaluator,
    DatasetTransformer,
)
from librep.metrics.report import ClassificationReport

def aplly_fft(train, val, test):

    transformer = DatasetWindowedTransform(transform=FFT(centered=True), do_fit=False, new_suffix=".fft")
    train_fft, val_fft, test_fft = transformer(train), transformer(val), transformer(test)

    return train_fft, val_fft, test_fft

def train_models(info, train_processed, validation_processed, test_processed, umap: bool = False, reduce: str = "all"):
    
    train_fft, val_fft, test_fft = aplly_fft(train_processed, validation_processed, test_processed)
    train_fft = train_fft.concatenate(val_fft)

    if umap:
        if reduce == "all":
            model_reducer = UMAP(n_components=10)     
            train_fft.X = model_reducer.fit_transform(train_fft.X)
            test_fft.X = model_reducer.transform(test_fft.X)

        elif reduce == "sensor":
            model_reducer_acc = UMAP(n_components=10)     
            train_fft_acc = model_reducer_acc.fit_transform(train_fft.X[:90])
            test_fft_acc = model_reducer_acc.transform(test_fft.X[:90])

            model_reducer_gyr = UMAP(n_components=10)     
            train_fft_gyr = model_reducer_gyr.fit_transform(train_fft.X[90:])
            test_fft_gyr = model_reducer_gyr.transform(test_fft.X[90:])

            train_fft.X = np.concatenate([train_fft_acc, train_fft_gyr], axis=0)
            test_fft.X = np.concatenate([test_fft_acc, test_fft_gyr], axis=0)
        else:
            data = {
                "train": [],
                "test": []
            }
            for i in range(6):
                model_reducer = UMAP(n_components=10)
                train = model_reducer.fit_transform(train_fft.X[i*30: (i+1)*30])
                test = model_reducer.fit_transform(test_fft.X[i*30: (i+1)*30])

                data["train"].append(train)
                data["test"].append(test)
            train_fft.X = np.concatenate(data["train"], axis=0)
            test_fft.X = np.concatenate(data["test"], axis=0)

    print(f"Size data: {train_fft.X.shape}")

    models = {
        "RandomForest": RandomForestClassifier(),
        "SVC": SVC(),
        "KNN": KNeighborsClassifier(n_neighbors=5)
    }

    for classifier, model in models.items():

        DatasetFitter(model, use_y=True)(train_fft)
        y_pred = DatasetPredicter(model)(test_fft)
        y_true = DatasetY()(test_fft)

        accuracy = DatasetEvaluator(ClassificationReport(normalize="all", plot_confusion_matrix=False))(y_true, y_pred)['accuracy']
        info[classifier + " - acc"] = accuracy

        info["Type reducer"] = reduce
        info["Total of features"] = train_fft.X.shape[1]
    
    return info


In [25]:
def create_statistics(info, path, datasets):
    for dataset in datasets:

        processed_view_path = path / dataset
        print(dataset)

        train_processed, validation_processed, test_processed = PandasMultiModalLoader(
            processed_view_path
        ).load(label="standard activity code")

        total_samples = len(train_processed) + len(validation_processed) + len(test_processed)

        info[dataset]["Train %"] = round(len(train_processed) / total_samples, 2) * 100
        info[dataset]["Validation %"] = round(len(validation_processed) / total_samples, 2) * 100
        info[dataset]["Test %"] = round(len(test_processed) / total_samples, 2) * 100
        info[dataset]["Total of samples in train"] = len(train_processed)
        info[dataset]["Total of samples in validation"] = len(validation_processed)
        info[dataset]["Total of samples in test"] = len(test_processed)
        info[dataset]["Total of samples"] = total_samples

        info[dataset]["Total of users"] = len(train_processed.data["user"].unique()) + len(validation_processed.data["user"].unique()) + len(test_processed.data["user"].unique())
        info[dataset]["Total of activities"] = len(train_processed.data["standard activity code"].unique())
        info[dataset]["Total users in train"] = len(train_processed.data["user"].unique())
        info[dataset]["Total users in validation"] = len(validation_processed.data["user"].unique())
        info[dataset]["Total users in test"] = len(test_processed.data["user"].unique())

        info[dataset] = train_models(info[dataset], train_processed, validation_processed, test_processed)

        # Há alguma intercessão entre os usuários de treino, validação e teste?
        train_users = set(train_processed.data["user"].unique())
        validation_users = set(validation_processed.data["user"].unique())
        test_users = set(test_processed.data["user"].unique())

        r1 = np.any(np.isin(train_users, validation_users))
        r2 = np.any(np.isin(train_users, test_users))
        r3 = np.any(np.isin(validation_users, test_users))

        if not (r1 or r2 or r3):
            print(f"Não há usuários em comum entre os conjuntos de {dataset}.")
    
    return info

In [26]:
columns = [
    "Total of samples in train",
    "Total of samples in validation",
    "Total of samples in test",
    "Total of samples",
    "Total of users",
    "Total of activities",
    "Total users in train",
    "Total users in validation",
    "Total users in test",
]

classifiers = [ 
    "RandomForest - acc",
    "SVC - acc",
    "KNN - acc"
]

In [7]:
pathes = {
    "old": [
        Path(f"/home/patrick/Downloads/data/standartized_balanced"), # Views de Otavio
        Path(f"/home/patrick/Downloads/data/standartized_inter_balanced"), # Views de Otavio
    ],
    "new": [
        Path(f"../data/standartized_balanced"), #Views de Patrick
        Path(f"../data/standartized_inter_balanced"), #Views de Patrick
    ]
}

datasets = [
    "KuHar",
    "MotionSense",
    "UCI",
    "WISDM",
    "RealWorld",
    "RealWorld_thigh",
    "RealWorld_upperarm",
    "RealWorld_waist",
]

dfs = []

for key, path in pathes.items():

    if key == "old":
        datasets_info = datasets[:-3].copy()
    else:
        datasets_info = datasets.copy()

    for processed_view_path in path:
        info = {
            dataset: {
                "Train %": None,
                "Validation %": None,
                "Test %": None,

                "Total of samples in train": None,
                "Total of samples in validation": None,
                "Total of samples in test": None,
                "Total of samples": None,

                "Total of users": None,
                "Total of activities": None,

                "Total users in train": None,
                "Total users in validation": None,
                "Total users in test": None,
            }
            for dataset in datasets_info
        }

        info = create_statistics(info, processed_view_path, datasets_info)

        df = pd.DataFrame(info).T
        df.round(2)

        for column in columns:
            df[column] = df[column].astype(int)

        for classifier in classifiers:
            df[classifier] = df[classifier] *100
            df[classifiers] = df[classifiers].round(2)

        df["View"] = processed_view_path.name
        df["Type"] = key

        dfs.append(df)

KuHar
Não há usuários em comum entre os conjuntos de KuHar.
MotionSense
Não há usuários em comum entre os conjuntos de MotionSense.
UCI
Não há usuários em comum entre os conjuntos de UCI.
WISDM
Não há usuários em comum entre os conjuntos de WISDM.
RealWorld
Não há usuários em comum entre os conjuntos de RealWorld.
KuHar
Não há usuários em comum entre os conjuntos de KuHar.
MotionSense
Não há usuários em comum entre os conjuntos de MotionSense.
UCI
Não há usuários em comum entre os conjuntos de UCI.
WISDM
Não há usuários em comum entre os conjuntos de WISDM.
RealWorld
Não há usuários em comum entre os conjuntos de RealWorld.
KuHar
Não há usuários em comum entre os conjuntos de KuHar.
MotionSense
Não há usuários em comum entre os conjuntos de MotionSense.
UCI
Não há usuários em comum entre os conjuntos de UCI.
WISDM
Não há usuários em comum entre os conjuntos de WISDM.
RealWorld
Não há usuários em comum entre os conjuntos de RealWorld.
RealWorld_thigh
Não há usuários em comum entre os co

In [8]:
# df = pd.concat(dfs, keys=["old", "new"])
df = pd.concat(dfs)
df

Unnamed: 0,Train %,Validation %,Test %,Total of samples in train,Total of samples in validation,Total of samples in test,Total of samples,Total of users,Total of activities,Total users in train,Total users in validation,Total users in test,RandomForest - acc,SVC - acc,KNN - acc,View,Type
KuHar,71.0,22.0,7.0,1386,426,144,1956,79,6,57,7,15,86.11,72.0,89.0,standartized_balanced,old
MotionSense,71.0,8.0,21.0,3558,420,1062,5040,24,6,17,2,5,92.94,83.0,91.0,standartized_balanced,old
UCI,70.0,10.0,20.0,2420,340,690,3450,30,5,21,3,6,92.17,80.0,82.0,standartized_balanced,old
WISDM,71.0,8.0,21.0,10920,1180,3245,15345,51,5,36,4,11,84.65,80.0,86.0,standartized_balanced,old
RealWorld,68.0,12.0,19.0,30696,5562,8694,44952,15,6,10,2,3,71.84,72.0,67.0,standartized_balanced,old
KuHar,72.0,21.0,7.0,1386,408,144,1938,79,6,57,7,15,81.94,72.0,89.0,standartized_inter_balanced,old
MotionSense,72.0,21.0,7.0,1386,408,144,1938,24,6,17,2,5,92.36,83.0,87.0,standartized_inter_balanced,old
UCI,72.0,21.0,7.0,1155,340,120,1615,30,5,21,3,6,89.17,79.0,81.0,standartized_inter_balanced,old
WISDM,72.0,21.0,7.0,1155,340,120,1615,49,5,36,4,9,85.0,73.0,85.0,standartized_inter_balanced,old
RealWorld,72.0,21.0,7.0,1386,408,144,1938,15,6,10,2,3,69.44,69.0,67.0,standartized_inter_balanced,old


In [22]:
new_df = df.copy()

# Removendo a coluna Total users in train

new_df = new_df.drop(columns=["Total users in train", "Total users in validation", "Total users in test", "Total of samples in train", "Total of samples in validation", "Total of samples in test"])
new_df["Dataset"] = new_df.index
new_df = new_df.reset_index(drop=True)
new_df = new_df[new_df["Type"] == "old"]
new_df

Unnamed: 0,Train %,Validation %,Test %,Total of samples,Total of users,Total of activities,RandomForest - acc,SVC - acc,KNN - acc,View,Type,Dataset
0,71.0,22.0,7.0,1956,79,6,86.11,72.0,89.0,standartized_balanced,old,KuHar
1,71.0,8.0,21.0,5040,24,6,92.94,83.0,91.0,standartized_balanced,old,MotionSense
2,70.0,10.0,20.0,3450,30,5,92.17,80.0,82.0,standartized_balanced,old,UCI
3,71.0,8.0,21.0,15345,51,5,84.65,80.0,86.0,standartized_balanced,old,WISDM
4,68.0,12.0,19.0,44952,15,6,71.84,72.0,67.0,standartized_balanced,old,RealWorld
5,72.0,21.0,7.0,1938,79,6,81.94,72.0,89.0,standartized_inter_balanced,old,KuHar
6,72.0,21.0,7.0,1938,24,6,92.36,83.0,87.0,standartized_inter_balanced,old,MotionSense
7,72.0,21.0,7.0,1615,30,5,89.17,79.0,81.0,standartized_inter_balanced,old,UCI
8,72.0,21.0,7.0,1615,49,5,85.0,73.0,85.0,standartized_inter_balanced,old,WISDM
9,72.0,21.0,7.0,1938,15,6,69.44,69.0,67.0,standartized_inter_balanced,old,RealWorld


In [70]:
dataset = "KuHar"
processed_view_path = Path(f"/home/patrick/Downloads/data/standartized_balanced") / dataset
train_processed, validation_processed, test_processed = PandasMultiModalLoader(
    processed_view_path
).load(label="standard activity code")

In [29]:
train_fft, val_fft, test_fft = aplly_fft(train_processed, validation_processed, test_processed)
train_fft = train_fft.concatenate(val_fft)

In [39]:
train_fft.X.shape

(1812, 180)

In [88]:
info_classifiers = {dataset: {}}
info_classifiers[dataset] = train_models(info_classifiers[dataset], train_processed, validation_processed, test_processed, umap=True, reduce="all")

Size data: (1812, 10)


In [94]:
info_classifiers = {dataset: {}}
info_classifiers[dataset] = train_models(info_classifiers[dataset], train_processed, validation_processed, test_processed, umap=True, reduce="sensor")

Size data: (1812, 10)


In [87]:
info_classifiers

{'KuHar': {'RandomForest - acc': 0.4583333333333333,
  'Type reducer': 'sensor',
  'Total of features': 10,
  'SVC - acc': 0.4236111111111111,
  'KNN - acc': 0.4652777777777778}}