In [6]:
import os
import sys
import time
import logging
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile
import pickle
from collections import OrderedDict
from keras.layers import ZeroPadding2D, Conv2D, MaxPooling2D, Reshape

from librosa.feature import melspectrogram
import librosa

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras
from keras import regularizers
from keras.models import load_model, Sequential
from keras.utils.np_utils import to_categorical
from keras import optimizers, losses, activations, models
from keras.utils.vis_utils import plot_model
from keras.layers import Convolution2D, Dense, Flatten, Dropout, MaxPooling2D, BatchNormalization, Merge, ELU, GRU
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

from config import TRAIN_PATH, TEST_PATH, INPUT_PATH, OUTPUT_PATH, LABELS
from preprocess_utils import list_wavs_fname, pad_audio, log_specgram, mel_specgram, label_transform
from log_util import LogUtil
log = LogUtil.get_logger(os.path.join(os.getcwd(),'CNN_new'), level=logging.INFO, console_mode=True)

reuse old logger (id:140259387955800, log_path:/home/chi/workspace/kaggle_speech_recognition_challenge/babel_speech/kaggle_speech_recognition_challenge/CNN_new), level:20


In [17]:
MODEL_FILE = 'mel_crnn.h5'
AUDIO_MAX_LEN= 16000
AUDIO_SR = 16000

In [10]:
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
# sess = tf.Session(config=tf.ConfigProto(
#   allow_soft_placement=True, log_device_placement=True))

# 모델 생성

In [19]:
def preprocess(labels, fnames, feature_func, replace=False):
    start_time = time.time()
    data_file = os.path.join(TRAIN_PATH, 'Xy_train_'+MODEL_FILE.split('_')[0]+'.pkl') # # Xy_train_(피쳐추출알고리즘).pkl
    log.info(data_file)
    if (not replace) and os.path.exists(data_file):
        X, y = pickle.load(open(data_file, 'rb'))
    else:
        ## load & Preprocess Data
        y = []
        X = []
        for i, (label, fname) in enumerate(zip(labels, fnames)):
            samples, sample_rate = librosa.load(os.path.join(TRAIN_PATH, label, fname), sr=AUDIO_SR)
            if len(samples) > AUDIO_MAX_LEN:
                pass
            else:
                samples = pad_audio(samples)
                specgram = feature_func(samples, sample_rate)
                y.append(label)
                X.append(specgram)
        X = np.array(X)
        X = X.reshape(list(X.shape)+[1])
        y = to_categorical(label_transform(y))
        pickle.dump((X, y), open(data_file, 'wb'), protocol=4)

    ## Data Split
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=0) # 9:1로 train, valid 셋 나눔.    
    log.info('Shape of X_train : {}'.format(X_train.shape))
    log.info('Preprocessing Complete - {:.2f} sec'.format(time.time()-start_time))
    return X_train, X_valid, y_train, y_valid

In [20]:
def eval_model(X_valid, y_valid, model):
    preds_proba = model.predict(X_valid, batch_size=256, verbose=1)
    preds = [LABELS[i] for i in np.argmax(preds_proba, axis=1)]
    actuals = [LABELS[i] for i in np.argmax(y_valid, axis=1)]
    log.info('* 정확도 : %.5f' % (np.sum(np.array(actuals) == np.array(preds)) / float(len(actuals))))
    preds = pd.Categorical(preds, categories=LABELS)
    actuals = pd.Categorical(actuals, categories=LABELS)
    log.info('\n -------------------------- \n')
    log.info(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
    log.info('\n -------------------------- \n')
    log.info(classification_report(actuals, preds))

In [21]:
## MEL CRNN MODEL
def get_model(input_shape, replace=False):
    if (not replace) and os.path.exists(os.path.join(OUTPUT_PATH, MODEL_FILE)):
        log.info('Load Model')
        model = load_model(os.path.join(OUTPUT_PATH, MODEL_FILE))
    else:
        # CNN Layers
        model = Sequential()
        model.add(BatchNormalization(input_shape=input_shape))

        model.add(Convolution2D(32, kernel_size=3, activation='elu'))
        model.add(MaxPooling2D(pool_size=(3,3), strides=(3,3)))
        model.add(Dropout(0.2))

        model.add(Convolution2D(32, kernel_size=3, activation='elu'))
        model.add(MaxPooling2D(pool_size=(3,2), strides=(3,2)))
        model.add(Dropout(0.2))

        model.add(Convolution2D(64, kernel_size=4, activation='elu'))
        model.add(MaxPooling2D(pool_size=(3,1), strides=(3,1)))
        model.add(Dropout(0.2))

        model.add(Convolution2D(128, kernel_size=4, activation='elu'))
        model.add(MaxPooling2D(pool_size=(2,1), strides=(2,1)))
        model.add(Dropout(0.2))

        model.add(Reshape(target_shape=(37, 128)))

        # GRU Layers

        model.add(GRU(128, return_sequences=True))
        model.add(BatchNormalization())
        model.add(GRU(128, return_sequences=False))
        model.add(BatchNormalization())

        model.add(Dense(len(LABELS), activation='softmax'))

        opt = optimizers.Adam(lr=0.001)
        model.compile(optimizer=opt, loss=losses.binary_crossentropy, metrics=['categorical_accuracy'])

    log.info('\n')
    log.info('\tMODEL : {}'.format(MODEL_FILE))
    model.summary(print_fn=log.info)
    return model

In [22]:
labels, fnames = list_wavs_fname(TRAIN_PATH)

../../input/train/audio/


In [23]:
X_train, X_valid, y_train, y_valid = preprocess(labels, fnames, mel_specgram, replace=True)

../../input/train/audio/Xy_train_mel.pkl


MemoryError: 

In [24]:
model = get_model(input_shape=X_train.shape[1:])

NameError: name 'X_train' is not defined

In [None]:
def get_callbacks(filepath, patience=2):
#     es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
#     return [es, msave]
    return [msave]
callbacks = get_callbacks(filepath=os.path.join(OUTPUT_PATH, MODEL_FILE), patience=5)

In [None]:
# model = load_model(os.path.join(OUTPUT_PATH, MODEL_FILE))
history = model.fit(x=X_train, y=y_train, batch_size=128, validation_data=(X_valid, y_valid), callbacks=callbacks, \
                    class_weight=class_weight, epochs=50, shuffle=True, verbose=1)
log.info(pd.DataFrame(history.history).round(decimals=3))
log.info('- Model save : {}'.format(os.path.join(OUTPUT_PATH, MODEL_FILE)))
model.save(os.path.join(OUTPUT_PATH, MODEL_FILE))

In [30]:
eval_model(X_valid, y_valid, model)

* 정확도 : 0.97222

 -------------------------- 

preds    silence  unknown  down   go  left   no  off   on  right  stop   up  \
actuals                                                                       
silence       80        0     0    0     0    0    0    0      0     0    2   
unknown        0     1970     4    9     2   14    1    6      0     1   13   
down           0        1   117    0     0    3    0    0      0     0    0   
go             0        0     1  122     0    3    0    1      0     0    0   
left           0        2     0    0   132    0    0    0      0     0    1   
no             0        0     0    0     1  106    0    0      0     1    0   
off            0        1     0    0     0    0  109    0      0     0    3   
on             0        2     0    0     0    0    2  113      1     0    2   
right          0        3     0    0     0    1    0    0    107     0    1   
stop           0        1     0    0     0    0    0    0      0   136    0   
up   

# 제출파일 생성

In [33]:
def preprocess_sub(submission_fpaths, replace=False):
    start_time = time.time()
    data_file = os.path.join(INPUT_PATH, 'X_test_'+MODEL_FILE.split('_')[0]+'.pkl') # X_test_(피쳐추출알고리즘).pkl
    if (not replace) and os.path.exists(data_file):
        X_test = pickle.load(open(data_file, 'rb'))
    else:
        ## load & Preprocess Data
        X_test = []
        for fpath in submission_fpaths:
            samples, sample_rate = librosa.load(fpath, sr=AUDIO_SR)
            specgram = log_specgram(samples)
            X_test.append(specgram)
        X_test = np.array(X_test)
        X_test = X_test.reshape(tuple(list(X_test.shape) + [1]))   
        pickle.dump(X_test, open(data_file, 'wb'), protocol=4)
    log.info('Shape of X_test : {}'.format(X_test.shape))
    log.info('Preprocessing Submission files Complete - {:.2f} sec'.format(time.time()-start_time))
    return X_test

In [34]:
%%time
submission_fpaths = sorted(glob(os.path.join(TEST_PATH, r'*wav')))
X_test = preprocess_sub(submission_fpaths)

Shape of X_test : (158538, 121, 267, 1)
Preprocessing Submission files Complete - 911.89 sec
CPU times: user 9min 19s, sys: 5min 43s, total: 15min 3s
Wall time: 15min 12s


In [36]:
preds_proba = model.predict(X_test, batch_size=256, verbose=1)
pickle.dump(preds_proba, open(os.path.join(OUTPUT_PATH, 'sub_' + MODEL_FILE.split('.')[0] + '_proba.pkl'), 'wb'))
preds = [[L.replace('_', '') for L in LABELS][i] for i in np.argmax(preds_proba, axis=1)]



In [37]:
d = OrderedDict()
d['fname'] = submission_fpaths
d['label'] = preds
for l in LABELS:
    d['{}'.format(l)]=preds_proba[:,LABELS.index(l)]
    
df = pd.DataFrame(d) 
df['fname'] = df['fname'].apply(lambda p: p.split('/')[-1]) 

log.info(df['label'].value_counts())

df[['fname','label']].to_csv(os.path.join(OUTPUT_PATH, 'sub_' + MODEL_FILE.split('.')[0] + '.csv'), index=False) 
log.info('Save File : {}'.format(os.path.join(OUTPUT_PATH, 'sub_' + MODEL_FILE.split('.')[0] + '.csv')))

unknown    90685
no          8679
up          7313
on          6511
silence     6340
left        6163
off         5624
stop        5503
down        5483
right       5449
go          5417
yes         5371
Name: label, dtype: int64
Save File : ./output/sub_spect_rcnn2.csv


In [39]:
from scipy.io import wavfile
import IPython.display as ipd
from scipy.stats import entropy
%matplotlib inline

In [40]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [47]:
i=87
label = 'no'

temp_fname = df[df['label']==label]['fname'].tolist()[i]
idx = df[df['label']==label]['fname'].index[i]
file_path = os.path.join(TEST_PATH, temp_fname)
t = pd.DataFrame({'p':preds_proba[idx].round(decimals=2), 'l':LABELS})
print(t)
print('\nentropy : ', entropy(t['p']))
print('\nvoice_entropy : ', entropy(t['p'][1:]))
sample_rate, samples = wavfile.read(file_path)
ipd.Audio(samples, rate=sample_rate)

          l     p
0   silence  0.00
1   unknown  0.09
2      down  0.00
3        go  0.00
4      left  0.00
5        no  0.91
6       off  0.00
7        on  0.00
8     right  0.00
9      stop  0.00
10       up  0.00
11      yes  0.00

entropy :  0.3025378

voice_entropy :  0.3025378


In [1]:
%%time
label_n = []
for i, row in df.iterrows():
    if row['label'] in ['down','go','left','no','off','on', 'right', 'stop', 'up', 'yes']:
        ent = entropy(row[LABELS].tolist())
        if ent > 1.5:
            correct_l = 'silence'
        elif ent > 0.4:
            correct_l = 'unknown'
        else: 
            correct_l = row['label']
    else:
        correct_l = row['label']
    label_n.append(correct_l)

NameError: name 'df' is not defined

In [None]:
df['label_n'] = label_n
df['label_n'].value_counts()

In [None]:
df_n = df.copy()
df_n['label']=df_n['label_n']
df_n[['fname','label']].to_csv(os.path.join(OUTPUT_PATH, 'sub_' + MODEL_FILE.split('.')[0] + '_cor.csv'), index=False) 