In [1]:
from tools import OUT_PATH, open_meta_df
import pandas as pd
import numpy as np
from skopt import BayesSearchCV
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm
from skopt.space import Integer, Categorical, Real
from pprint import pprint
from warnings import filterwarnings
from joblib import dump
from KMeansCustomEstimator import KMeansCustomEstimator
from functools import reduce
from operator import mul
filterwarnings("ignore")

In [2]:
model_name = 'RF'

In [3]:
meta_df = open_meta_df()
video_id = np.load(OUT_PATH / 'video_id.npy')
landmarks = np.load(OUT_PATH / 'landmarks.npy')
train_idx = np.load(OUT_PATH / 'train_idx.npy')
test_idx = np.load(OUT_PATH / 'test_idx.npy')
meta_df.head()

Unnamed: 0,filename,stem,pose_id,word,sinalizador,repetition
0,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-1,1,Acontecer,1,1
1,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-2,1,Acontecer,1,2
2,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-3,1,Acontecer,1,3
3,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-4,1,Acontecer,1,4
4,/media/eros/BackupMae/datasets/Minds/Sinalizad...,01AcontecerSinalizador01-5,1,Acontecer,1,5


In [4]:
landmarks.shape

(147205, 33, 3)

In [5]:
observations = reduce(mul, landmarks.shape[1:])

stacked_train_landmarks = [landmarks[video_id == i].reshape((-1, observations))  for i in train_idx]
stacked_test_landmarks = [landmarks[video_id == i].reshape((-1, observations)) for i in test_idx]

classes = meta_df['pose_id'].values

y_train = classes[train_idx]
y_test = classes[test_idx]
observations

99

In [6]:
len(y_train), len(y_test), len(y_train) + len(y_test), len(classes)

(793, 265, 1058, 1058)

In [7]:
kmeans_keys = frozenset(['n_clusters'])

estimator_keys = frozenset([
    'bootstrap',
    'max_depth',
    'max_features',
    'min_samples_leaf',
    'min_samples_split',
    'n_estimators',
])

opt = BayesSearchCV(
    KMeansCustomEstimator(
        RandomForestClassifier,
        two_dimensions=False,
        kmeans_keys=kmeans_keys,
        estimator_keys=estimator_keys,
        n_clusters=8,
        n_neighbors=5,
        weights='uniform',
        bootstrap=False,
        max_depth=20,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=100,
    ),
    {
        'estimator': Categorical([RandomForestClassifier]),
        'two_dimensions': Categorical([False]),
        'kmeans_keys': Categorical([kmeans_keys]),
        'estimator_keys': Categorical([estimator_keys]),
        'n_clusters': Integer(1, 50),
        'bootstrap': Categorical([True, False]),
        'max_depth': Categorical([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500, None]),
        'max_features': Categorical(["sqrt", "log2", None, .1, .2, .3, .4, .5, .6, .7, .8, .9]),
        'min_samples_leaf': Integer(1, 20),
        'min_samples_split': Integer(2, 20),
        'n_estimators': Integer(100, 5000),
    },
    n_iter=100,
    random_state=42,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
opt.fit(stacked_train_landmarks, y_train)
opt.best_score_

0.8713455307794931

In [8]:
opt.best_params_

OrderedDict([('bootstrap', False),
             ('estimator', sklearn.ensemble._forest.RandomForestClassifier),
             ('estimator_keys',
              frozenset({'bootstrap',
                         'max_depth',
                         'max_features',
                         'min_samples_leaf',
                         'min_samples_split',
                         'n_estimators'})),
             ('kmeans_keys', frozenset({'n_clusters'})),
             ('max_depth', None),
             ('max_features', 'sqrt'),
             ('min_samples_leaf', 1),
             ('min_samples_split', 2),
             ('n_clusters', 50),
             ('n_estimators', 5000),
             ('two_dimensions', False)])

In [9]:
estimator = clone(opt.best_estimator_)

In [10]:
%%time
estimator.fit(stacked_train_landmarks, y_train)
train_score = estimator.score(stacked_train_landmarks, y_train) 
test_score = estimator.score(stacked_test_landmarks, y_test)

CPU times: user 12min 27s, sys: 15.9 s, total: 12min 43s
Wall time: 6min 49s


In [11]:
print('Acurácia de treino:', train_score, '\nAcurácia de teste:', test_score)

Acurácia de treino: 0.9987389659520807 
Acurácia de teste: 0.9283018867924528


In [12]:
opt.cv_results_

{'mean_fit_time': array([3.56354253e+02, 1.57789476e+01, 9.62830897e+02, 7.13854329e+00,
        4.53358966e+02, 2.64892886e+02, 6.40857671e+02, 1.01845119e+01,
        4.77045909e+02, 4.13381868e+02, 7.57123260e+01, 6.00746758e+03,
        4.21602122e+00, 5.22415042e+00, 1.03360242e+02, 1.55300621e+02,
        4.97250708e+00, 1.06327346e+02, 1.71826870e+01, 1.58826352e+01,
        1.47632659e+03, 7.49165591e+00, 1.13741890e+02, 1.13352129e+02,
        1.13808981e+02, 1.06928343e+02, 2.32333229e+03, 1.14129155e+02,
        1.46041743e+02, 1.17462863e+02, 1.18500695e+02, 1.19754244e+02,
        1.13977599e+02, 1.25461077e+02, 1.19702459e+02, 1.41100449e+02,
        1.86322739e+02, 1.65183398e+02, 1.47788650e+02, 1.51295722e+02,
        1.63849362e+02, 2.14308923e+02, 1.79496463e+02, 1.77540796e+02,
        1.45922749e+02, 1.45513352e+02, 1.46642434e+02, 1.66420503e+02,
        1.68016873e+02, 2.01433754e+02, 2.06017635e+02, 2.62121306e+02,
        2.30732437e+02, 1.14247979e+02, 1.47218

In [13]:
dump(opt.cv_results_, OUT_PATH / f'scores/{model_name}_scores.h5', compress=9)
dump(estimator, OUT_PATH / f'Models/{model_name}.h5', compress=9)

['Outs/Models/RF.h5']