In [1]:
from tools import OUT_PATH, open_meta_df
import pandas as pd
import numpy as np
from skopt import BayesSearchCV
from sklearn.base import clone
from cuml.ensemble import RandomForestClassifier as cuRF
from tqdm.auto import tqdm
from skopt.space import Integer, Categorical, Real
from pprint import pprint
from warnings import filterwarnings
from joblib import dump
from KMeansCustomEstimator import KMeansCustomEstimator
from functools import reduce
from operator import mul
filterwarnings("ignore")

In [2]:
model_name = 'CURF'

In [3]:
meta_df = open_meta_df()
video_id = np.load(OUT_PATH / 'video_id.npy')
landmarks = np.load(OUT_PATH / 'landmarks.npy')
train_idx = np.load(OUT_PATH / 'train_idx.npy')
test_idx = np.load(OUT_PATH / 'test_idx.npy')
meta_df.head()

Unnamed: 0,filename,stem,pose_id,word,sinalizador,repetition
0,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-1,1,Acontecer,1,1
1,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-2,1,Acontecer,1,2
2,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-3,1,Acontecer,1,3
3,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-4,1,Acontecer,1,4
4,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-5,1,Acontecer,1,5


In [4]:
landmarks.shape

(147205, 33, 3)

In [5]:
observations = reduce(mul, landmarks.shape[1:])

stacked_train_landmarks = [landmarks[video_id == i].reshape((-1, observations))  for i in train_idx]
stacked_test_landmarks = [landmarks[video_id == i].reshape((-1, observations)) for i in test_idx]

classes = meta_df['pose_id'].values

y_train = classes[train_idx]
y_test = classes[test_idx]
observations

99

In [6]:
len(y_train), len(y_test), len(y_train) + len(y_test), len(classes)

(793, 265, 1058, 1058)

In [7]:
kmeans_keys = frozenset(['n_clusters'])

estimator_keys = frozenset([
    'n_estimators',
    'split_criterion',
    'bootstrap',
    'max_samples',
    'max_depth',
    'max_leaves',
    'max_features',
    'n_bins',
    'min_samples_leaf',
    'min_samples_split',
    'min_impurity_decrease'
])

opt = BayesSearchCV(
    KMeansCustomEstimator(
        cuRF,
        two_dimensions=False,
        kmeans_keys=kmeans_keys,
        estimator_keys=estimator_keys,
        n_clusters=8,
        n_estimators=100,
        split_criterion=0,
        bootstrap=True,
        max_samples=0,
        max_depth=1,
        max_leaves=-1,
        max_features="sqrt",
        n_bins=8,
        min_samples_leaf=1,
        min_samples_split=2,
        min_impurity_decrease=0,
    ),
    {
        'estimator': Categorical([cuRF]),
        'two_dimensions': Categorical([False]),
        'kmeans_keys': Categorical([kmeans_keys]),
        'estimator_keys': Categorical([estimator_keys]),
        'n_clusters': Integer(1, 50),
        'n_estimators': Integer(100, 5000),
        'split_criterion': Integer(0, 1),
        'bootstrap': Categorical([True, False]),
        'max_samples': Real(0.1, 1),
        'max_depth': Integer(1, 500),
        'max_leaves': Categorical([-1, 1, 2, 4, 8, 16, 32, 64, 128]),
        'max_features': Categorical(["sqrt", "log2"]),
        'n_bins': Integer(8, 1024),
        'min_samples_leaf': Integer(1, 20),
        'min_samples_split': Integer(2, 20),
        'min_impurity_decrease': Real(0, 0.999),
    },
    n_iter=100,
    random_state=42,
    cv=3,
    n_jobs=None,
    verbose=1,
)
opt.fit(stacked_train_landmarks, y_train)
opt.best_score_

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[W] [21:22:43.995988] If bootstrap sampling is disabled, max_samples value is ignored and whole dataset is used for building each tree
[W] [21:24:00.286251] If bootstrap sampling is disabled, max_samples value is ignored and whole dataset is used for building each tree
[W] [21:25:16.059720] If bootstrap sampling is disabled, max_samples value is ignored and whole dataset is used for building each tree
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[W] [21:32:48.513659] If bootstrap sampling is disabled, max_samples value is ignored and whole dataset is used for building each tree
[W] [21:34:42.300850] If bootstrap sampling is disabled, max_samples value is ignored and whole dataset is used for building each tree
[W] [21:36:35.760815] If bootstrap sampling is disabled, max_samples value is ignored and whole dataset is used for building each tree
Fitting 3 fol

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/eros/anaconda3/envs/minds_env_cuml2/include/rmm/mr/device/cuda_memory_resource.hpp

In [None]:
opt.best_params_

In [None]:
estimator = clone(opt.best_estimator_)

In [None]:
%%time
estimator.fit(stacked_train_landmarks, y_train)
train_score = estimator.score(stacked_train_landmarks, y_train) 
test_score = estimator.score(stacked_test_landmarks, y_test)

In [None]:
print('Acurácia de treino:', train_score, '\nAcurácia de teste:', test_score)

In [None]:
opt.cv_results_

In [None]:
dump(opt.cv_results_, OUT_PATH / f'scores/{model_name}_scores.h5', compress=9)
dump(estimator, OUT_PATH / f'Models/{model_name}.h5', compress=9)