## 4. Model Refinement

- 모델 강화학습(딥러닝을 사용해 urban sounds 분류하기)
  * CNN 모델을 생성해서 딥러닝

In [1]:
# 앞서 2번에서 전처리 완료된 데이터 저장한 것을 불러오기

%store -r x_train 
%store -r x_test 
%store -r y_train 
%store -r y_test 
%store -r yy 
%store -r le

In [2]:
# 특징 추출 개선(refinement)
# 이전에 추출한 특징(feature) MFCC 벡터는 오디오파일마다 크기 다름
# CNN(Convolutional Neural Network) 만들기 위해서는 모두 동일한 크기로 만들어야 함(벡터를 0으로 패딩)
import numpy as np
import librosa
max_pad_len = 174

def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None 
     
    return mfccs

In [3]:
# Load various imports 
import pandas as pd
import os
import librosa

# Set the path to the full UrbanSound dataset 
fulldatasetpath = './UrbanSound8K/audio'

metadata = pd.read_csv('./UrbanSound8K/metadata/UrbanSound8K.csv')

features = []

# Iterate through each sound file and extract the features 
for index, row in metadata.iterrows():  
    file_name = os.path.join(os.path.abspath(fulldatasetpath),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    
    class_label = row["class_name"]
    data = extract_features(file_name)
    
    features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files')

Finished feature extraction from  8732  files


In [4]:
print(featuresdf)

                                                feature       class_label
0     [[-306.77255, -177.59209, -99.13616, -65.97198...          dog_bark
1     [[-457.69534, -451.0248, -450.68613, -445.0000...  children_playing
2     [[-468.0367, -467.42264, -481.04654, -486.5948...  children_playing
3     [[-422.42215, -411.9085, -409.46243, -409.0892...  children_playing
4     [[-438.10162, -434.47787, -443.32837, -442.664...  children_playing
...                                                 ...               ...
8727  [[-397.82446, -400.45578, -407.50354, -408.952...          car_horn
8728  [[-451.81265, -451.41983, -450.67892, -445.635...          car_horn
8729  [[-301.06348, -298.25397, -305.0326, -303.8614...          car_horn
8730  [[-373.6307, -369.44986, -366.48, -364.9094, -...          car_horn
8731  [[-309.34647, -305.3132, -308.23593, -308.1856...          car_horn

[8732 rows x 2 columns]


In [5]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y)) 

# split the dataset 
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

Using TensorFlow backend.


In [6]:
print(x_train)
print(x_train.shape)

[[[-8.37522125e+01 -7.18939133e+01 -7.44299469e+01 ... -9.78421860e+01
   -9.42385483e+01  0.00000000e+00]
  [ 1.14666840e+02  1.19127029e+02  1.25082397e+02 ...  1.12450966e+02
    1.06469528e+02  0.00000000e+00]
  [-7.31703949e+01 -8.15858688e+01 -8.74292984e+01 ... -1.00178772e+02
   -1.00562012e+02  0.00000000e+00]
  ...
  [ 6.21281624e-01 -1.46681368e-02  9.38629746e-01 ...  5.45317411e-01
    3.08528042e+00  0.00000000e+00]
  [ 3.85332060e+00  1.10046196e+00  1.51432705e+00 ...  4.19227362e+00
    4.53711987e+00  0.00000000e+00]
  [ 3.23432970e+00 -2.41597605e+00 -1.52170730e+00 ... -1.35237694e+01
   -1.04434299e+01  0.00000000e+00]]

 [[-2.08063232e+02 -2.07646057e+02 -2.09871704e+02 ... -1.68888702e+02
   -1.64623489e+02  0.00000000e+00]
  [ 1.33095703e+02  1.34623871e+02  1.34608337e+02 ...  1.33464355e+02
    1.22665115e+02  0.00000000e+00]
  [-1.22376442e+01 -2.17055168e+01 -2.98256149e+01 ... -2.23849869e+01
   -3.11693134e+01  0.00000000e+00]
  ...
  [-6.87027740e+00 -6.5

In [7]:
# CNN(Convolutional Neural Network) 모델 구조로 수정(재생성)
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

num_rows = 40
num_columns = 174
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

# Construct model 
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax'))

In [8]:
# 모델 컴파일하기(이전 모델과 동일한 옵션)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [9]:
model.summary() # 모델 보여주기

# pre-training 학습 정확도 계산
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 39, 173, 16)       80        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 19, 86, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 86, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 18, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 9, 42, 32)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 9, 42, 32)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 8, 41, 64)        

In [10]:
# Training
# 해당 모델에 대해 epoch 72회 실시(72번 반복) 
# CNN 모델의 특성상 이전 basic 모델과 다르게 시간이 굉장히 오래 걸림
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

#num_epochs = 12
#num_batch_size = 128

num_epochs = 72
num_batch_size = 256

# 학습(Training)하는 동안 체크포인트 저장하기
# 훈련 중간과 마지막에 자동으로 저장하도록 옵션 설정(모델 재사용성 up!)
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5', 
                               verbose=1, save_best_only=True)
start = datetime.now()

# 모델 학습(Training)
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Train on 6985 samples, validate on 1747 samples
Epoch 1/72

Epoch 00001: val_loss improved from inf to 2.26111, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/72

Epoch 00002: val_loss improved from 2.26111 to 1.94114, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 3/72

Epoch 00003: val_loss improved from 1.94114 to 1.65379, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 4/72

Epoch 00004: val_loss improved from 1.65379 to 1.53527, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 5/72

Epoch 00005: val_loss improved from 1.53527 to 1.48129, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 6/72

Epoch 00006: val_loss improved from 1.48129 to 1.41424, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 7/72

Epoch 00007: val_loss improved from 1.41424 to 1.34677, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 8/72

Epoch 00008: val_loss improved from 1.34677 to 1.27280, saving model 


Epoch 00069: val_loss did not improve from 0.36098
Epoch 70/72

Epoch 00070: val_loss improved from 0.36098 to 0.36019, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 71/72

Epoch 00071: val_loss improved from 0.36019 to 0.34649, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 72/72

Epoch 00072: val_loss did not improve from 0.34649
Training completed in time:  1:13:47.073305


In [11]:
# 모델 테스트하기
# Train dataset
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

# Test dataset
score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.9367215633392334
Testing Accuracy:  0.8923869729042053


In [17]:
print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(6985, 40, 174, 1)
(6985, 10)
(1747, 40, 174, 1)
(1747, 10)


In [18]:
# 예측하기(함수 생성)
def print_prediction(file_name):
    prediction_feature = extract_features(file_name) 
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = model.predict_classes(prediction_feature)
    predicted_class = le.inverse_transform(predicted_vector) 
    print("The predicted class is:", predicted_class[0], '\n') 

    predicted_proba_vector = model.predict_proba(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

In [19]:
# Class: Air Conditioner

filename = './UrbanSound8K/audio/fold5/100852-0-0-0.wav' 
print_prediction(filename)

The predicted class is: air_conditioner 

air_conditioner 		 :  0.95868599414825439453125000000000
car_horn 		 :  0.00002513961590011604130268096924
children_playing 		 :  0.02091950178146362304687500000000
dog_bark 		 :  0.00067896617110818624496459960938
drilling 		 :  0.00735024036839604377746582031250
engine_idling 		 :  0.00765555538237094879150390625000
gun_shot 		 :  0.00015567925584036856889724731445
jackhammer 		 :  0.00091366545530036091804504394531
siren 		 :  0.00042527535697445273399353027344
street_music 		 :  0.00318997912108898162841796875000


In [20]:
# Class: Siren

filename = './UrbanSound8K/audio/fold9/159748-8-2-1.wav' 
print_prediction(filename)

The predicted class is: siren 

air_conditioner 		 :  0.00000000000000000014355622595630
car_horn 		 :  0.00000000000000003857009854820024
children_playing 		 :  0.00000000040563058467668611228873
dog_bark 		 :  0.00000067011859528065542690455914
drilling 		 :  0.00000000001219694657772496526604
engine_idling 		 :  0.00000000031732638738901641772827
gun_shot 		 :  0.00000000009801592176783557874842
jackhammer 		 :  0.00000000000001572945095842329805
siren 		 :  0.99999773502349853515625000000000
street_music 		 :  0.00000151486824506719131022691727


In [21]:
# Class: Children_palying

filename = './children.wav' 
print_prediction(filename)

Error encountered while parsing file:  ./children.wav


AttributeError: 'NoneType' object has no attribute 'reshape'