In [88]:
import pandas as pd
import numpy as np
import os
import librosa
import librosa.display

from src.wavhelp import WavHelper

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


In [128]:
# Load the UrbanSound8K metadata CSV shipped with the sample dataset folder.
metadata = pd.read_csv('UrbanSoundDatasetSample/metadata/UrbanSound8K.csv')

In [129]:
# Preview the first rows of the metadata table, then report its (rows, columns) shape.
display(metadata.head())
print(metadata.shape)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class_name
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


(8732, 8)


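Only a few of the audio files are needed for this example, so use `os.listdir` to list the files present in the sample folder and keep only the matching metadata rows.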

In [130]:
# Restrict the metadata to the slices whose audio file is present in the sample folder.
sample_audio_dir = 'UrbanSoundDatasetSample/audio'
existing_files = os.listdir(sample_audio_dir)

# Boolean mask: True where the row's file name was found on disk.
is_present = metadata['slice_file_name'].isin(existing_files)
metasub = metadata.loc[is_present].reset_index(drop=True)
metasub

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class_name
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn
3,100852-0-0-0.wav,100852,0.0,4.0,1,5,0,air_conditioner
4,101848-9-0-0.wav,101848,0.0,4.0,1,7,9,street_music
5,102305-6-0-0.wav,102305,0.0,2.61161,1,1,6,gun_shot
6,102853-8-0-0.wav,102853,0.0,4.0,2,7,8,siren
7,102857-5-0-0.wav,102857,0.0,4.0,1,10,5,engine_idling
8,103074-7-0-0.wav,103074,3.341279,7.341279,1,1,7,jackhammer
9,103199-4-0-0.wav,103199,0.0,4.0,1,3,4,drilling


In [131]:
# Number of sample files available for each sound class.
print(metasub['class_name'].value_counts())

street_music        1
gun_shot            1
jackhammer          1
dog_bark            1
car_horn            1
air_conditioner     1
drilling            1
engine_idling       1
children_playing    1
siren               1
Name: class_name, dtype: int64


In [132]:
## NOTE: the paths below match the flat layout of the sample dataset.
## For the entire dataset the path construction will have to be amended, e.g.:
# file_name = os.path.join(os.path.abspath('/UrbanSound8K/audio/'),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))

wavfilehelper = WavHelper()

# Ask the helper for the properties of every sample file, in metadata order.
audiodata = [
    wavfilehelper.get_file_props(
        os.path.join('UrbanSoundDatasetSample/audio', str(slice_name))
    )
    for slice_name in metasub['slice_file_name']
]

# One row per file; the helper's output is labelled with these three column names.
audiodf = pd.DataFrame(audiodata, columns=['num_channels', 'sample_rate', 'bit_depth'])

In [133]:
# Show the per-file audio properties (channels, sample rate, bit depth).
audiodf

Unnamed: 0,num_channels,sample_rate,bit_depth
0,2,44100,16
1,2,44100,16
2,2,44100,16
3,2,44100,16
4,2,44100,16
5,2,44100,16
6,2,44100,16
7,2,44100,16
8,2,48000,24
9,2,44100,16


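The files are primarily 44.1 kHz with 16-bit depth, but there is some variation. `librosa.load` accounts for this: by default it resamples every file to a common sample rate, mixes it down to mono, and returns floating-point samples.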

In [134]:
# Share of files per channel count.
print(audiodf['num_channels'].value_counts(normalize=True))

2    1.0
Name: num_channels, dtype: float64


In [135]:
# Share of files per sample rate.
print(audiodf['sample_rate'].value_counts(normalize=True))

44100    0.9
48000    0.1
Name: sample_rate, dtype: float64


In [136]:
# Share of files per bit depth.
print(audiodf['bit_depth'].value_counts(normalize=True))

16    0.9
24    0.1
Name: bit_depth, dtype: float64


In [137]:
# Number of time frames (columns) every feature matrix is padded or truncated to.
max_pad_len = 174


def _fit_to_width(vec, width):
    """Return `vec` with exactly `width` columns: zero-pad on the right, or truncate."""
    n_frames = vec.shape[1]
    if n_frames >= width:
        # Longer clips used to make np.pad fail on a negative pad width; truncate instead.
        return vec[:, :width]
    return np.pad(vec, pad_width=((0, 0), (0, width - n_frames)), mode='constant')


def extract_features(fname, features='mfccs'):
    """Load an audio file and return a fixed-width feature matrix.

    Parameters
    ----------
    fname : str
        Path of the audio file to load with ``librosa.load``.
    features : {'mfccs', 'mels'}
        'mfccs' computes 40 MFCCs; 'mels' computes a 60-band mel spectrogram.

    Returns
    -------
    numpy.ndarray or None
        Matrix of shape (n_features, max_pad_len), zero-padded or truncated
        along the time axis. None if the file could not be loaded or processed.

    Raises
    ------
    ValueError
        If `features` is not one of the supported kinds. Previously this surfaced
        as a misleading "Error while parsing file" message and a None result.
    """
    if features not in ('mfccs', 'mels'):
        raise ValueError(
            "features must be 'mfccs' or 'mels', got {!r}".format(features)
        )

    try:
        # librosa.load resamples to its default target rate, so the faster
        # 'kaiser_fast' resampler is used instead of the higher-quality default.
        signal, rate = librosa.load(fname, res_type='kaiser_fast')
        if features == 'mfccs':
            vec = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
        else:
            vec = librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=60)
            # Optional log scaling, left disabled as in the original experiment:
            # vec = librosa.power_to_db(vec)
        vec = _fit_to_width(vec, max_pad_len)

    except Exception as e:
        # Best-effort: one unreadable file must not abort a whole extraction run,
        # but report the cause so the failure can be diagnosed.
        print("Error while parsing file: ", fname)
        print("  reason:", repr(e))
        return None

    return vec

In [304]:
datasetpath = 'UrbanSoundDatasetSample/audio/'

features = []
failed_files = []  # files for which extract_features returned None

# Build one [feature matrix, class label, fold] record per metadata row.
# NOTE: the feature kind ('mels') must match `vec_type` used when pickling below.
for index, row in metasub.iterrows():

    file_name = os.path.join(datasetpath, str(row["slice_file_name"]))
    label = row['class_name']
    fold = row["fold"]
    vector = extract_features(file_name, 'mels')

    if vector is None:
        # A None entry would turn the stacked feature array into a ragged
        # object array and break the later reshape, so skip the file instead.
        failed_files.append(file_name)
        continue

    features.append([vector, label, fold])

featuresdf = pd.DataFrame(features, columns=['feature', 'class_label', 'fold'])

if failed_files:
    print("Skipped", len(failed_files), "file(s) that could not be parsed:", failed_files)

In [305]:
# Sanity check: shape of the first extracted feature matrix.
features[0][0].shape

(60, 174)

In [306]:
# Show the assembled feature table and its (rows, columns) shape.
display(featuresdf)
print(featuresdf.shape)

Unnamed: 0,feature,class_label,fold
0,"[[0.00015100525, 0.00014170613, 0.00013727874,...",dog_bark,5
1,"[[0.020249717, 0.035944887, 0.021877488, 0.026...",children_playing,5
2,"[[0.0010503497, 0.007112698, 0.011476202, 0.00...",car_horn,10
3,"[[2.9233718e-06, 0.0705473, 1.8070649, 12.4267...",air_conditioner,5
4,"[[0.3527452, 0.4210949, 0.31309783, 0.36565572...",street_music,7
5,"[[47.233067, 55.38184, 42.464436, 41.443974, 2...",gun_shot,1
6,"[[0.13007168, 0.13769388, 0.10224321, 0.199294...",siren,7
7,"[[151.7741, 128.39484, 84.23963, 67.139145, 10...",engine_idling,10
8,"[[0.97118616, 0.5512772, 0.3870931, 0.650693, ...",jackhammer,1
9,"[[2.3284926e-07, 8.215059e-06, 4.199208e-05, 0...",drilling,3


(10, 3)


In [307]:
# Stack the per-file feature matrices and the fold ids into numpy arrays.
X = np.array(featuresdf['feature'].tolist())
folds = np.array(featuresdf['fold'].tolist())

# Integer-encode the class names, then one-hot encode the result.
le = LabelEncoder()
class_names = np.array(featuresdf['class_label'].tolist())
y = to_categorical(le.fit_transform(class_names))

# X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state = 42)


In [308]:

### store the preprocessed data for use in the next notebook

# %store X_train 
# %store X_test 
# %store y_train 
# %store y_test 
# %store y_cat 
# %store le

In [309]:
### or... we can pickle that:
import pickle

vec_type = 'mels'  # feature kind used during extraction; becomes part of the file name
processed_data = (X, y, folds)
data_path = os.path.join('pickles', 'sample_urbansound_'+ vec_type + '.p')

# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(data_path), exist_ok=True)

with open(data_path, 'wb') as handle:
    pickle.dump(processed_data, handle, protocol=2)

In [310]:
# Reload the (X, y, folds) tuple written by the previous cell.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open(data_path, 'rb') as handle:
    data = pickle.load(handle)

X, y, folds = data

In [311]:
from sklearn.model_selection import LeaveOneGroupOut

In [312]:
# Inspect the one-hot encoded label matrix (one row per sample).
y

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]], dtype=float32)

In [314]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D


def get_conv_model(input_shape=None, n_labels=None, show_summary=True):
    """Build and compile the 2-D convolutional classifier.

    The network is four Conv2D(kernel 2, relu) -> MaxPooling2D(2) -> Dropout(0.2)
    stages with 16, 32, 64 and 128 filters, followed by global average pooling
    and a softmax Dense layer.

    Parameters
    ----------
    input_shape : tuple of int, optional
        (rows, columns, channels) of one input sample. Defaults to the
        module-level (num_rows, num_columns, num_channels), which must then be
        defined before the function is called.
    n_labels : int, optional
        Number of output classes. Defaults to the module-level `num_labels`.
    show_summary : bool, optional
        Print `model.summary()` when True (the default, matching the original
        behaviour). Pass False to avoid one summary per fold in a training loop.

    Returns
    -------
    A compiled Keras `Sequential` model (categorical cross-entropy, Adam,
    accuracy metric).
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if input_shape is None:
        input_shape = (num_rows, num_columns, num_channels)
    if n_labels is None:
        n_labels = num_labels

    model = Sequential()
    for stage, n_filters in enumerate((16, 32, 64, 128)):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if stage == 0 else {}
        model.add(Conv2D(filters=n_filters, kernel_size=2, activation='relu', **first_layer_kwargs))
        model.add(MaxPooling2D(pool_size=2))
        model.add(Dropout(0.2))
    model.add(GlobalAveragePooling2D())

    model.add(Dense(n_labels, activation='softmax'))

    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    if show_summary:
        model.summary()
    return model

# Derive the input dimensions from the data instead of hard-coding 60 x 174, so the
# cell also works for other feature kinds (e.g. 40-row MFCCs) and stays correct
# when re-run on an already reshaped X.
num_rows, num_columns = X.shape[1], X.shape[2]
num_channels = 1  # each feature matrix is treated as a single-channel image
X = X.reshape(X.shape[0], num_rows, num_columns, num_channels)
num_labels = y.shape[1]  # width of the one-hot label matrix


In [315]:
from tensorflow.keras import backend as K

# Leave-one-fold-out cross-validation: each dataset fold is held out once.
logo = LeaveOneGroupOut()
# print("n_splits=", logo.get_n_splits(X,y,folds))

# Per-fold training curves, keyed by the held-out fold id. `history` alone would
# only ever hold the last fold because it is overwritten on every iteration.
fold_histories = {}

for train_idx, test_idx in logo.split(X, y, folds):
#     print("train_idx:", train_idx, "test_idx:", test_idx)
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    held_out_fold = folds[test_idx][0]

    # Release the state of the previous fold's model before building a new one;
    # otherwise Keras keeps accumulating graph state across iterations.
    K.clear_session()

    model = get_conv_model()
    history = model.fit(X_train, y_train, epochs=4, validation_data=(X_test, y_test))
    fold_histories[held_out_fold] = history.history
    print('fold', held_out_fold)

Model: "sequential_76"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_268 (Conv2D)          (None, 59, 173, 16)       80        
_________________________________________________________________
max_pooling2d_260 (MaxPoolin (None, 29, 86, 16)        0         
_________________________________________________________________
dropout_260 (Dropout)        (None, 29, 86, 16)        0         
_________________________________________________________________
conv2d_269 (Conv2D)          (None, 28, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_261 (MaxPoolin (None, 14, 42, 32)        0         
_________________________________________________________________
dropout_261 (Dropout)        (None, 14, 42, 32)        0         
_________________________________________________________________
conv2d_270 (Conv2D)          (None, 13, 41, 64)      

Epoch 3/4
Epoch 4/4
fold 5
Model: "sequential_79"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_280 (Conv2D)          (None, 59, 173, 16)       80        
_________________________________________________________________
max_pooling2d_272 (MaxPoolin (None, 29, 86, 16)        0         
_________________________________________________________________
dropout_272 (Dropout)        (None, 29, 86, 16)        0         
_________________________________________________________________
conv2d_281 (Conv2D)          (None, 28, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_273 (MaxPoolin (None, 14, 42, 32)        0         
_________________________________________________________________
dropout_273 (Dropout)        (None, 14, 42, 32)        0         
_________________________________________________________________
conv2d_282 (Conv2D)       

In [316]:
# Training/validation metrics of the most recently fitted model only:
# `history` is reassigned on every iteration of the fold loop.
history.history

{'loss': [2.20090389251709,
  1.8922537565231323,
  1.8116307258605957,
  1.594461441040039],
 'accuracy': [0.25, 0.125, 0.125, 0.5],
 'val_loss': [2.580855131149292,
  2.751909017562866,
  2.942147970199585,
  3.1782569885253906],
 'val_accuracy': [0.0, 0.0, 0.0, 0.0]}

In [317]:
# Best validation accuracy reached across the epochs of the last fitted model.
max(history.history['val_accuracy'])

0.0

In [318]:
# Per-epoch validation accuracy of the last fitted model.
history.history['val_accuracy']

[0.0, 0.0, 0.0, 0.0]

In [None]:
metadata.