Modified from https://github.com/aqibsaeed/Urban-Sound-Classification/blob/master/Convolutional%20Neural%20Network.ipynb

Data: the UrbanSound8K dataset, https://urbansounddataset.weebly.com/urbansound8k.html

# 데이터 준비

In [None]:
!wget -O UrbanSound9K.tar.gz https://goo.gl/8hY5ER

--2022-01-25 20:07:11--  https://goo.gl/8hY5ER
Resolving goo.gl (goo.gl)... 173.194.211.101, 173.194.211.138, 173.194.211.100, ...
Connecting to goo.gl (goo.gl)|173.194.211.101|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz [following]
--2022-01-25 20:07:11--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘UrbanSound9K.tar.gz’


2022-01-25 20:20:33 (7.18 MB/s) - ‘UrbanSound9K.tar.gz’ saved [6023741708/6023741708]



In [None]:
!tar xvfz UrbanSound9K.tar.gz

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
UrbanSound8K/audio/fold4/17480-2-0-6.wav
UrbanSound8K/audio/fold4/17480-2-0-9.wav
UrbanSound8K/audio/fold4/175904-2-0-11.wav
UrbanSound8K/audio/fold4/175904-2-0-24.wav
UrbanSound8K/audio/fold4/176003-1-0-0.wav
UrbanSound8K/audio/fold4/176638-5-0-0.wav
UrbanSound8K/audio/fold4/177756-2-0-10.wav
UrbanSound8K/audio/fold4/177756-2-0-4.wav
UrbanSound8K/audio/fold4/177756-2-0-5.wav
UrbanSound8K/audio/fold4/177756-2-0-7.wav
UrbanSound8K/audio/fold4/179862-1-0-0.wav
UrbanSound8K/audio/fold4/180977-3-1-1.wav
UrbanSound8K/audio/fold4/180977-3-1-5.wav
UrbanSound8K/audio/fold4/183989-3-1-21.wav
UrbanSound8K/audio/fold4/183989-3-1-23.wav
UrbanSound8K/audio/fold4/185709-0-0-0.wav
UrbanSound8K/audio/fold4/185709-0-0-1.wav
UrbanSound8K/audio/fold4/185709-0-0-6.wav
UrbanSound8K/audio/fold4/185709-0-0-7.wav
UrbanSound8K/audio/fold4/185909-2-0-102.wav
UrbanSound8K/audio/fold4/185909-2-0-13.wav
UrbanSound8K/audio/fold4/185909-2-0-17.wav
UrbanSound8K/audio/

In [None]:
import pandas as pd

# Read the metadata table shipped with the dataset (one row per audio slice:
# file name, fold, classID, class name, ...) and show it as the cell output.
meta = pd.read_csv(filepath_or_buffer='./UrbanSound8K/metadata/UrbanSound8K.csv')
meta

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [None]:
import IPython.display as ipd

# Render an inline audio player for each of the two example clips.
for example_path in ('./UrbanSound8K/audio/fold1/108041-9-0-5.wav',
                     './UrbanSound8K/audio/fold5/100852-0-0-19.wav'):
    display(ipd.Audio(example_path))

In [None]:
# Everything this cell needs is imported here: librosa is otherwise only
# imported further down the notebook, so on a fresh kernel the load call below
# would fail with a NameError.
import IPython.display
import librosa
import matplotlib.pyplot as plt

# Decode one example clip; librosa.load returns the samples and the sampling rate.
sound_clip, sampling_ratio = librosa.load('./UrbanSound8K/audio/fold1/108041-9-0-5.wav')

print(type(sound_clip))
print(sound_clip.shape)
print(sound_clip[:10])
print("sampling_ratio =",sampling_ratio)

# Last expression of the cell: an inline player for the decoded samples.
IPython.display.Audio(data=sound_clip, rate=sampling_ratio)

<class 'numpy.ndarray'>
(88200,)
[-0.03148247 -0.06967403 -0.02807479  0.07812583  0.10158778  0.08008352
  0.09629877  0.1579831   0.25382036  0.27910018]
sampling_ratio = 22050


In [None]:
# Two stacked panels: the raw waveform on top, its spectrogram underneath.
fig, (wave_ax, spec_ax) = plt.subplots(nrows=2, ncols=1, figsize=(14, 6))
wave_ax.plot(sound_clip)
spec_ax.specgram(sound_clip, Fs=sampling_ratio)

plt.show()

# 데이터 로딩

In [None]:
### Load necessary libraries ###
import glob
import os
import librosa
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras

In [None]:
BANDS = 60
FRAMES = 41
# Development cap on the number of clips processed per fold; pass
# max_files=None to extract_features to process every file.
MAX_FILES_PER_FOLD = 10

# refer http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
def extract_features(parent_dir,sub_dir,file_ext="*.wav",
                     bands=BANDS,frames=FRAMES,max_files=MAX_FILES_PER_FOLD):
    """Turn every audio file of one fold into stacked log-mel / delta segments.

    Each clip is cut into half-overlapping windows of ``512 * (frames - 1)``
    samples. Every full-length window becomes a ``(bands, frames, 2)`` array:
    channel 0 is the log-scaled mel spectrogram, channel 1 its delta.

    Args:
        parent_dir: Directory that contains the fold sub-directories.
        sub_dir: Name of the fold directory to process.
        file_ext: Glob pattern selecting the audio files.
        bands: Number of mel bands per spectrogram.
        frames: Number of spectrogram frames per segment; also fixes the
            window length in samples.
        max_files: Stop after this many clips have produced at least one
            segment. ``None`` processes the whole fold.

    Returns:
        ``(features, labels)``: two parallel lists with one entry per clip.
        ``features[i]`` has shape ``(n_segments, bands, frames, 2)`` and
        ``labels[i]`` is a list of ``n_segments`` identical integer class IDs.
    """
    def _windows(data, window_size):
        # Yield (start, end) sample indices of windows that overlap by 50 %.
        start = 0
        while start < len(data):
            yield int(start), int(start + window_size)
            start += (window_size // 2)

    window_size = 512 * (frames - 1)
    features, labels = [], []
    # Sorted so that the processed files (and the max_files subset) do not
    # depend on the file-system listing order.
    for fn in sorted(glob.glob(os.path.join(parent_dir, sub_dir, file_ext))):
        if max_files is not None and len(features) >= max_files:
            break
        # File names look like '17913-4-2-0.wav'; the second dash-separated
        # field is the classID (compare slice_file_name / classID in the
        # metadata table). The fold directory name is NOT the class label.
        label = int(os.path.basename(fn).split('-')[1])
        sound_clip, sampling_ratio = librosa.load(fn)

        segment_log_specgrams = []
        for (start,end) in _windows(sound_clip,window_size):
            signal = sound_clip[start:end]
            if len(signal) != window_size:
                continue  # trailing window shorter than a full segment
            # `y` must be passed by keyword: recent librosa releases reject
            # positional audio input.
            melspec = librosa.feature.melspectrogram(y=signal,n_mels=bands)
            # melspectrogram returns a power spectrogram by default, so the
            # matching dB conversion is power_to_db.
            segment_log_specgrams.append(librosa.power_to_db(melspec))

        if not segment_log_specgrams:
            continue  # clip shorter than one window: nothing to keep

        # Keep each spectrogram in its native (bands, frames) layout.
        # Flattening the transpose and reshaping back to (bands, frames) would
        # interleave bands and time steps.
        log_specgrams = np.stack(segment_log_specgrams)
        # delta() differentiates along the last axis of each spectrogram.
        deltas = np.stack([librosa.feature.delta(spec)
                           for spec in segment_log_specgrams])
        segment_features = np.stack((log_specgrams, deltas), axis=-1)

        features.append(segment_features)
        labels.append([label] * len(segment_features))
    return features, labels

In [None]:
# Pre-process and extract feature from the data
parent_dir = 'UrbanSound8K/audio/'
save_dir = "UrbanSound8K/processed/"
os.makedirs(save_dir, exist_ok=True)

# The fold names are the immediate sub-directories of parent_dir. They are
# sorted so the processing order is reproducible, and a missing dataset is
# reported explicitly instead of leaving `folds` undefined.
_, fold_dirs, _ = next(os.walk(parent_dir), (parent_dir, [], []))
folds = sorted(fold_dirs)
if not folds:
    raise FileNotFoundError(f"no fold directories found under {parent_dir!r}")
# folds = ['fold2', 'fold10', 'fold4', 'fold6', 'fold8', 'fold1', 'fold5', 'fold3', 'fold9', 'fold7']


def _as_object_array(items):
    """Pack per-file sequences of differing length into a 1-D object array.

    Handing a ragged list straight to np.savez relies on NumPy's implicit
    ragged-array conversion, which is deprecated and rejected by recent
    releases.
    """
    packed = np.empty(len(items), dtype=object)
    for idx, item in enumerate(items):
        packed[idx] = np.asarray(item)
    return packed


for fold in folds:
    print(f"data loading {fold}")
    features, labels = extract_features(parent_dir,fold)
    print(f"loaded {len(features)}")
    # Object arrays are pickled inside the archive, so reading them back
    # requires np.load(..., allow_pickle=True).
    np.savez(f"{save_dir}{fold}",
             features=_as_object_array(features),
             labels=_as_object_array(labels))

data loading fold1


  return array(a, dtype, copy=False, order=order, subok=True)


data loading fold2
data loading fold3
data loading fold4
data loading fold5
data loading fold6
data loading fold7
data loading fold8
data loading fold9
data loading fold10


In [None]:
load_dir = save_dir
x, y = [], []
for fold in folds:
    print(f"loading {load_dir}{fold}.npz")
    # allow_pickle=True can execute arbitrary code while unpickling: only load
    # archives written by this notebook. The context manager closes the
    # underlying file once the arrays have been read.
    with np.load(f"{load_dir}{fold}.npz", allow_pickle=True) as data:
        x.extend(data['features'])
        y.extend(data['labels'])

# Merge the per-file chunks into one array of segments and one of labels.
x = np.concatenate(x)
y = np.concatenate(y)

print(x.shape)
print(y.shape)

from sklearn.model_selection import train_test_split

# Fixed seed so that the train/test partition, and therefore the reported
# scores, can be reproduced across runs.
SPLIT_SEED = 42
# NOTE(review): this splits individual segments at random, so overlapping
# windows of the same clip can land in both the training and the test set.
# Holding out whole folds would avoid that leakage; confirm the intended
# evaluation protocol.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.1, random_state=SPLIT_SEED)

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

loading UrbanSound8K/processed/fold2.npz
loading UrbanSound8K/processed/fold10.npz
loading UrbanSound8K/processed/fold4.npz
loading UrbanSound8K/processed/fold6.npz
loading UrbanSound8K/processed/fold8.npz
loading UrbanSound8K/processed/fold1.npz
loading UrbanSound8K/processed/fold5.npz
loading UrbanSound8K/processed/fold3.npz
loading UrbanSound8K/processed/fold9.npz
loading UrbanSound8K/processed/fold7.npz
(635, 60, 41, 2)
(635,)
(571, 60, 41, 2)
(571,)
(64, 60, 41, 2)
(64,)


# 모델 정의

In [None]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, BatchNormalization, Activation, GlobalMaxPooling2D

DATA_SHAPE = x[0].shape
# UrbanSound8K labels its clips with ten sound classes. The size of the output
# layer must follow the label space, not the number of fold directories found
# on disk (the two only coincide when all ten folds are present).
NUM_CLASSES = 10

def build_network(input_shape=None, num_classes=NUM_CLASSES):
    """Build and compile the convolutional classifier.

    Four Conv2D -> BatchNormalization -> ReLU stages with 24, 32, 64 and 128
    filters. The first three stages end in 2x2 max pooling. They are followed
    by global max pooling, a 128-unit dense layer and a softmax output.

    Args:
        input_shape: Shape of one input sample. Defaults to ``DATA_SHAPE``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Sequential`` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    pool_size = (2, 2)
    kernel_size = (3, 3)
    conv_filters = (24, 32, 64, 128)
    if input_shape is None:
        input_shape = DATA_SHAPE

    # Drop state left over from previously built models.
    keras.backend.clear_session()

    model = keras.models.Sequential()
    for stage, filters in enumerate(conv_filters):
        # Only the first layer of a Sequential model declares the input shape.
        first_layer_args = {"input_shape": input_shape} if stage == 0 else {}
        model.add(Conv2D(filters, kernel_size,
                         padding="same", **first_layer_args))
        model.add(BatchNormalization())
        model.add(Activation("relu"))
        # The last convolutional stage feeds the global pooling layer directly.
        if stage < len(conv_filters) - 1:
            model.add(MaxPooling2D(pool_size=pool_size))

    model.add(GlobalMaxPooling2D())
    model.add(Dense(128, activation="relu"))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    return model

# 모델 학습

In [None]:

# Build a fresh network and fit it on the training split; the fit call sets
# aside 10 % of the training data for validation.
model = build_network()
hist = model.fit(
    train_x,
    train_y,
    validation_split=0.1,
    batch_size=32,
    epochs=2,
)



In [None]:
# Training vs. validation loss per epoch. The legend and axis labels make the
# two curves distinguishable and let the figure stand on its own.
fig, ax = plt.subplots()
ax.plot(hist.history['loss'], label='training loss')
ax.plot(hist.history['val_loss'], label='validation loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Loss per epoch')
ax.legend()
plt.show()

In [None]:

# Score the trained model on the test split. The two values are unpacked as
# the loss followed by the accuracy metric.
test_scores = model.evaluate(test_x, test_y)
loss, acc = test_scores
for caption, score in zip(("loss=", "acc="), (loss, acc)):
    print(caption, score)
