In [1]:
# %% Package imports
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from deep_audio import Audio, Visualization, Directory, Model, JSON
import numpy as np
from joblib import Parallel, delayed
import multiprocessing
import joblib

In [2]:
# Run-mode switch read by the training cells below: True trains every fold in
# parallel and stores artifacts under models/randomforest/5fold/...; False
# trains only the first fold and stores them under models/randomforest/...
kfold_split = True

In [3]:
# %% Load dataset
# Feature-extraction method and sampling rate select which pre-processed JSON
# file is read; the same two values are reused later in the output paths.
method_algo = 'lpcc'
sampling_rate = 24000

dataset_path = f'processed/{method_algo}/{method_algo}_{sampling_rate}.json'

# The feature field inside the JSON is named after the extraction method.
x, y, mapping = Directory.load_json_data(dataset_path, inputs_fieldname=method_algo)

In [4]:
# SPLIT 5 FOLDS
# Seed for the fold shuffle (also recorded in the saved metadata) and the
# number of trees in the forest.
random_state = 42
n_estimators = 100

# Collapse every sample to a 1-D feature vector so the data forms a 2-D matrix.
x_holder = [row.flatten() for row in x]
x = np.array(x_holder)

# Number of samples to keep; replace with a smaller value (e.g. 1000) for a
# quick run on a subset.
n = len(x)
x, y = x[:n], y[:n]

kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
# Bare last expression: shows the number of splits as the cell output.
kf.get_n_splits(x)

5

In [5]:
def process_fold(train_index, test_index):
    """Train, evaluate and persist a random forest for one train/validation split.

    Reads the notebook-level globals ``x``, ``y``, ``n_estimators``,
    ``random_state``, ``kfold_split``, ``method_algo`` and ``sampling_rate``.

    Parameters
    ----------
    train_index, test_index : array-like of int
        Row positions of the training and validation samples, as yielded by
        ``kf.split(x)``.
        NOTE(review): assumes ``x`` and ``y`` both support integer-array
        indexing (NumPy arrays) -- confirm what ``Directory.load_json_data``
        returns for ``y``.

    Returns
    -------
    dict
        Run metadata: method name, seed, feature method, sample rate,
        ``[n_train, n_val]`` sizes, train/validation accuracy, micro and macro
        F1 on the validation split, the model file name and the estimator
        parameters.

    Side effects
    ------------
    Creates the output directory if needed and dumps the fitted model there
    with ``joblib``.
    """
    x_train, x_val = x[train_index], x[test_index]
    y_train, y_val = y[train_index], y[test_index]

    # Seed the forest itself: the seed is written into the file name and the
    # returned metadata, so the model must actually be built with it.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(x_train, y_train)

    # Evaluate accuracies. Predict the validation split once and reuse the
    # predictions for both accuracy and F1 (classifier ``score`` is accuracy).
    y_hat = model.predict(x_val)
    score_test = accuracy_score(y_val, y_hat)
    score_train = model.score(x_train, y_train)

    # Save model
    # NOTE(review): the name depends only on accuracy and seed, so two folds
    # with the exact same validation accuracy would write to the same file.
    model_file = f'acc{score_test}_seed{random_state}.sav'

    if kfold_split:
        filename = f'models/randomforest/5fold/{method_algo}_{sampling_rate}_{n_estimators}/{model_file}'
    else:
        filename = f'models/randomforest/{method_algo}_{sampling_rate}_{n_estimators}/{model_file}'

    Directory.create_directory(filename, file=True)

    joblib.dump(model, filename)

    # Return accuracies and parameters
    return {
        'method': 'Random Forest',
        'seed': random_state,
        'feature_method': method_algo,
        'sample_rate': sampling_rate,
        'train_test': [len(x_train), len(x_val)],
        'score_train': score_train,
        'score_test': score_test,
        # scikit-learn metrics take (y_true, y_pred) in that order.
        'f1_micro': f1_score(y_val, y_hat, average='micro'),
        'f1_macro': f1_score(y_val, y_hat, average='macro'),
        'model_file': model_file,
        'params': model.get_params()
    }

In [None]:
# One joblib worker per CPU core reported by the OS.
num_cores = multiprocessing.cpu_count()

if kfold_split:
    # Train every fold in parallel; the result is a list with one metadata
    # dict per fold.
    fold_jobs = (
        delayed(process_fold)(train_index, test_index)
        for train_index, test_index in kf.split(x)
    )
    dump_info = Parallel(n_jobs=num_cores, verbose=3)(fold_jobs)
    JSON.create_json_file(f'models/randomforest/5fold/{method_algo}_{sampling_rate}_{n_estimators}/info.json', dump_info)
else:
    # Single run: only the first train/validation split is used, so the
    # result is a single metadata dict.
    train_index, test_index = next(kf.split(x))
    dump_info = process_fold(train_index, test_index)
    JSON.create_json_file(f'models/randomforest/{method_algo}_{sampling_rate}_{n_estimators}/info.json', dump_info)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
    # Bare expression: show the metadata collected by the training cell.
    dump_info