# Overview

- nb004で作成した特徴量(librosa MFCC)を使って、RandomForestClassifier(rfc)モデルを N_FOLD 分割の交差検証で学習・保存する。

# Const

In [1]:
# --- Notebook-level configuration -------------------------------------------
# Notebook id; used as a prefix of the directory the models are saved into.
NB = '006'
# Number of cross-validation folds.
N_FOLD = 5

# Name of the feature table (CSV) this notebook trains on.
FEATSETS = 'nb004_librosa_mfcc.csv'

# Input / output locations, all relative to the notebook's directory.
DIR_AUDIO = './../data_ignore/official/train_audio'
DIR_MODEL = './../data_ignore/model'
PATH_TRAIN_CSV = './../data_ignore/official/train.csv'
PATH_FEAT = f'./../data_ignore/features/table/{FEATSETS}'

# Import everything I need :)

In [2]:
import os
import gc
import sys
import glob
import time
import pickle
import joblib
import numpy as np
import pandas as pd
from fastprogress import progress_bar
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

sys.path.insert(0, './../src/util/')
from const import BIRD_CODE, INV_BIRD_CODE

# My function

# Preparation

setting

In [3]:
# Show up to 100 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 100

<br>

loading

In [6]:
# Read the training metadata and the feature table in one pass
# (same order as the paths: metadata first, then features).
train_csv, df_feat = map(pd.read_csv, (PATH_TRAIN_CSV, PATH_FEAT))

# Quick sanity check: table dimensions, then the first rows.
print(df_feat.shape)
df_feat.head(n=5)

(21375, 13)


Unnamed: 0,librosa_mfcc_2,librosa_mfcc_3,librosa_mfcc_4,librosa_mfcc_5,librosa_mfcc_6,librosa_mfcc_7,librosa_mfcc_8,librosa_mfcc_9,librosa_mfcc_10,librosa_mfcc_11,librosa_mfcc_12,birds,filename
0,6.57,-29.89,7.22,-0.891,1.132,-1.086,0.7397,1.166,0.495,-0.876,0.2184,aldfly,XC134874.mp3
1,21.92,-13.02,5.324,5.566,-0.261,-3.242,1.225,1.003,1.748,-1.192,-0.1931,aldfly,XC135454.mp3
2,16.45,-14.47,8.08,4.57,-0.888,-3.943,2.58,0.589,0.9697,-0.3743,-0.742,aldfly,XC135455.mp3
3,19.39,-12.805,6.594,6.418,-0.856,-4.598,1.675,1.841,1.866,-1.056,-0.8057,aldfly,XC135456.mp3
4,30.81,-3.867,0.5264,0.6313,-0.657,-3.547,1.342,0.1382,1.067,-0.0874,0.597,aldfly,XC135457.mp3


## create X, y

In [14]:
# Target: integer class id for each row, looked up from the bird code.
# Indexing BIRD_CODE directly raises KeyError on an unknown code instead of
# silently producing a missing label.
y = df_feat['birds'].apply(lambda x: BIRD_CODE[x]).values

# Features: everything except the label and the file name.
# The feature CSV also carries an 'Unnamed: 0' column, which is the row index
# written out when the table was saved. It is not a feature, and because the
# rows are ordered by species it must not be fed to the model, so any such
# index column is dropped as well.
non_feature_cols = ['birds', 'filename']
index_cols = [col for col in df_feat.columns if str(col).startswith('Unnamed:')]
X = df_feat.drop(columns=non_feature_cols + index_cols)

In [6]:
# Eyeball the encoded labels ...
print(y)
# ... and the first rows of the feature matrix (rendered as the cell output).
X.head(n=5)

[  0   0   0 ... 263 263 263]


Unnamed: 0,librosa_mfcc_2,librosa_mfcc_3,librosa_mfcc_4,librosa_mfcc_5,librosa_mfcc_6,librosa_mfcc_7,librosa_mfcc_8,librosa_mfcc_9,librosa_mfcc_10,librosa_mfcc_11,librosa_mfcc_12
0,6.57,-29.89,7.22,-0.891,1.132,-1.086,0.7397,1.166,0.495,-0.876,0.2184
1,21.92,-13.02,5.324,5.566,-0.261,-3.242,1.225,1.003,1.748,-1.192,-0.1931
2,16.45,-14.47,8.08,4.57,-0.888,-3.943,2.58,0.589,0.9697,-0.3743,-0.742
3,19.39,-12.805,6.594,6.418,-0.856,-4.598,1.675,1.841,1.866,-1.056,-0.8057
4,30.81,-3.867,0.5264,0.6313,-0.657,-3.547,1.342,0.1382,1.067,-0.0874,0.597


## Train

In [7]:
# Single seed shared by the fold splitter and the random forest.
seed = 7

# Keyword arguments forwarded to RandomForestClassifier for every fold.
model_params = dict(
    n_estimators=100,
    random_state=seed,
    n_jobs=5,
)

In [8]:
%%time
kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=seed)
models = []
scores_train = []
scores_valid = []
oof_pred = np.zeros(len(X))
for fold_i, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f'===== fold {fold_i+1}/{N_FOLD}  {time.ctime()} =====')
    X_train, X_valid = X.values[train_idx, :], X.values[valid_idx, :]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    # init
    model = RandomForestClassifier(**model_params)
    
    # fit
    model.fit(X_train, y_train)
    models.append(model)
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    oof_pred[valid_idx] = y_valid_pred
    
    # evaluate
    acc_train = accuracy_score(y_train, y_train_pred)
    acc_valid = accuracy_score(y_valid, y_valid_pred)
    scores_train.append(acc_train)
    scores_valid.append(acc_valid)
    
    print(f':: score(train): {acc_train :.5f}, score(valid) {acc_valid :.5f} ::\n')
    
scores_train = np.array(scores_train)
scores_valid = np.array(scores_valid)
score_oof = accuracy_score(y, oof_pred)
print(f'===== finish {time.ctime} =====')
print(f':: score(train): mean={scores_train.mean():.5f}, std={scores_train.std():.5f} ::')
print(f':: score(valid): mean={scores_valid.mean():.5f}, std={scores_valid.std():.5f} ::')
print(f':: score(oof): {score_oof:.5f} ::')

===== fold 1/5  Sun Aug  2 19:41:57 2020 =====
:: score(train): 0.99971, score(valid) 0.12117 ::

===== fold 2/5  Sun Aug  2 19:42:06 2020 =====
:: score(train): 0.99982, score(valid) 0.12327 ::

===== fold 3/5  Sun Aug  2 19:42:14 2020 =====
:: score(train): 0.99977, score(valid) 0.12678 ::

===== fold 4/5  Sun Aug  2 19:42:23 2020 =====
:: score(train): 0.99977, score(valid) 0.12678 ::

===== fold 5/5  Sun Aug  2 19:42:31 2020 =====
:: score(train): 0.99965, score(valid) 0.12655 ::

===== finish <built-in function ctime> =====
:: score(train): mean=0.99974, std=0.00006 ::
:: score(valid): mean=0.12491, std=0.00230 ::
:: score(oof): 0.12491 ::
CPU times: user 3min 7s, sys: 12.9 s, total: 3min 20s
Wall time: 42.7 s


# Save model
- 使った特徴量名(librosa_mfcc)も一緒に保存

In [8]:
# Directory the models and metadata are written to:
# <DIR_MODEL>/<notebook id>_<model class name>/ (the trailing slash is relied on
# by the path f-strings in the following cells).
# NOTE: `model` is the estimator left over from the training cell.
save_dir = f'{DIR_MODEL}/{NB}_{model.__class__.__name__}/'

# Create the directory (and any missing parents). exist_ok=True makes the call
# a no-op when the directory already exists, without the race window of a
# separate os.path.exists() check.
file_dir = os.path.dirname(save_dir)
os.makedirs(file_dir, exist_ok=True)

<br>

save info

In [22]:
# Metadata stored next to the models: which feature table was used and the
# exact feature columns the models were trained on.
save_path_info = f'{save_dir}info.joblib'
info_dict = {
    'featsets': FEATSETS,
    'feat_names': X.columns,
}

# joblib.dump returns the list of written files, shown as the cell output.
joblib.dump(info_dict, save_path_info)

['./../data_ignore/model/006_RandomForestClassifier/info.joblib']

<br>

save models

In [23]:
# Persist each fold's model as model_<fold number>.joblib (1-based) under
# save_dir, using the maximum compression level.
for i, model in enumerate(progress_bar(models)):
    save_path = os.path.join(save_dir, f'model_{i+1}.joblib')
    joblib.dump(model, filename=save_path, compress=9)

# Check Model

In [16]:
# Rebuild the save directory path from a fresh estimator instance, so this
# check does not need the `model` left behind by the training cells.
model = RandomForestClassifier()
save_dir = f'{DIR_MODEL}/{NB}_{type(model).__name__}/'

In [9]:
# Reload every saved model from save_dir, in sorted file-name order.
# Files whose name does not contain 'model' (e.g. info.joblib) are skipped.
models = []
path_list = sorted(glob.glob(f'{save_dir}*'))
for path in progress_bar(path_list):
    basename = os.path.basename(path)
    if 'model' not in basename:
        continue
    models.append(joblib.load(path))