In [1]:
import os
import soundfile as sf
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.models import Sequential,load_model
from keras.layers.core import Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint
import h5py
from pydub import AudioSegment
import os
import datetime
from scipy.io.wavfile import write
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 1. Data Processing

#### Convert Stereo to Mono

In [126]:
def stereo_to_mono(input_dir, output_dir):
    """Down-mix every ``.wav`` file in ``input_dir`` to one channel.

    Each converted file is written to ``output_dir`` under its original
    file name.

    Parameters
    ----------
    input_dir : str
        Directory scanned (non-recursively) for names ending in ``.wav``.
    output_dir : str
        Destination directory; created when it does not exist. A trailing
        path separator is optional.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for filename in os.listdir(input_dir):
        if not filename.endswith(".wav"):
            continue
        sound = AudioSegment.from_wav(os.path.join(input_dir, filename))
        sound = sound.set_channels(1)
        # splitext() strips only the final extension, so "take.01.wav" keeps
        # its full stem instead of collapsing to "take".
        stem = os.path.splitext(filename)[0]
        # os.path.join() makes the result independent of whether the caller
        # passed output_dir with a trailing separator.
        sound.export(os.path.join(output_dir, stem + ".wav"), format="wav")

In [127]:
# Convert the .wav files found in data/ to mono and write them to data_monowavs/.
stereo_to_mono('data/', 'data_monowavs/')

#### Compress the wavs into a smaller format

Copy the 'dataProcessing.py' file into the monowavs folder, then run it there to compress the wav files.  
(bitrate = sample rate × number of channels × bits per sample)

#### Feature Extraction

In [118]:
#Feature extraction
def feature_extraction(x, fs, frame_length_s=0.04, segment_length_s=0.5):
    """Cut the STFT magnitude of ``x`` into fixed-size segments.

    Parameters
    ----------
    x : array_like
        Audio samples. Assumes a 1-D (mono) signal -- TODO confirm; a
        multi-channel array would give a 3-D STFT that the reshape below
        does not handle.
    fs : int or float
        Sampling rate of ``x`` in Hz.
    frame_length_s : float, optional
        Target STFT window length in seconds (default 0.04).
    segment_length_s : float, optional
        Target segment length in seconds (default 0.5).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_segments, n_freqs, segment_length, 1)``; the
        last axis is the channel axis. ``n_segments`` is 0 when the signal
        yields fewer than ``segment_length`` STFT frames.
    """
    # Window length in samples: next power of two >= fs * frame_length_s.
    frame_length = int(2**np.ceil(np.log2(fs*frame_length_s)))
    # 50 % overlap between consecutive windows.
    hop_length = frame_length//2

    # Compute STFT
    _, _, X = signal.stft(x, nfft=frame_length, noverlap=hop_length, fs=fs, nperseg=frame_length)
    # Keep only the first X.shape[0]//2 - 1 frequency rows of the STFT.
    number_frequencies, number_time_frames = X.shape[0]//2 - 1, X.shape[1]
    X = np.abs(X[0:number_frequencies, :])

    # Segment length in STFT frames (not samples): next power of two >= the
    # ratio of the two durations. The ratio ignores the window overlap.
    segment_length = int(2**np.ceil(np.log2(segment_length_s/frame_length_s)))

    # Keep the largest number of frames that fills whole segments. Computing
    # the stop index explicitly (instead of slicing with a negative remainder)
    # avoids `X[:, :-0]`, which is empty and would discard every frame when
    # the frame count is already an exact multiple of segment_length.
    usable_frames = number_time_frames - number_time_frames % segment_length
    trimmed_X = X[:, :usable_frames]

    # Segmentation (number of freqs x number of frames x number of segments x 1).
    # Fortran order makes consecutive frames fill one segment before the next.
    features = trimmed_X.reshape((number_frequencies, segment_length, -1, 1), order='F')
    # Reorder to (number of segments x number of freqs x number of frames x 1).
    return features.transpose((2, 0, 1, 3))

#### Read all files and extract training and label data

In [175]:
# Folder holding the wav files that the feature-loading loop reads.
data_folder = 'data_monowavs/'

In [181]:
input_features = []
groundtruth_features = []
# Inputs and targets are paired purely by position in these two lists, so the
# iteration order must be deterministic. os.listdir() returns entries in
# arbitrary order; sorting keeps the two lists aligned from run to run.
# NOTE(review): this presumes each "down" file sorts in the same relative
# position as its ground-truth counterpart -- confirm against the file naming.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".wav"):
        continue
    x, fs = sf.read(os.path.join(data_folder, filename))
    features = feature_extraction(x, fs)
    # Files with "down" in their name are the model inputs; all others are targets.
    if "down" in filename:
        input_features.append(features)
    else:
        groundtruth_features.append(features)
# Stack the per-file segment arrays along the segment axis.
input_features = np.vstack(input_features)
groundtruth_features = np.vstack(groundtruth_features)
assert input_features.shape == groundtruth_features.shape, (
    "input/ground-truth feature stacks differ: {} vs {}".format(
        input_features.shape, groundtruth_features.shape))

In [183]:
# Sanity check: (segments, frequency rows, frames per segment, channels) of the targets.
groundtruth_features.shape

(958, 511, 16, 1)

In [185]:
# Sanity check: (segments, frequency rows, frames per segment, channels) of the inputs.
input_features.shape

(958, 511, 16, 1)

In [190]:
# Hold out 40 % of the segments for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is per segment, so segments cut from the same
# recording can land on both sides -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    input_features,
    groundtruth_features,
    test_size=0.4,
    random_state=0,
)

#### Save features into .h5 files

In [207]:
def save_features(X_train, X_test, y_train, y_test, path='data.hdf5'):
    """Write the four train/test arrays to one HDF5 file.

    The file is opened in ``'w'`` mode and receives one dataset per array,
    named ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array_like
        Arrays passed as ``data=`` to ``create_dataset``.
    path : str, optional
        Destination file. Defaults to ``'data.hdf5'``, the location used
        before this parameter existed.
    """
    named_arrays = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    with h5py.File(path, 'w') as f:
        for name, array in named_arrays:
            f.create_dataset(name, data=array)

In [212]:
# Persist the train/test split so later runs can reload it without re-extracting features.
save_features(X_train,X_test,y_train,y_test)

## 2. Model

In [209]:
def get_model(features_shape):
    """Build, compile and summarise the three-layer convolutional model.

    Parameters
    ----------
    features_shape : sequence of int
        Shape of the feature array; entries 1 and 2 are used as the
        (frequencies, frames-per-segment) input size. A single channel is
        appended.

    Returns
    -------
    The compiled ``Sequential`` model. Every layer uses "same" padding and
    the last layer has one filter, so the output has the same spatial size
    and channel count as the input.
    """
    n_freqs, n_frames = features_shape[1], features_shape[2]
    # Options shared by all three convolutions.
    conv_opts = dict(activation="relu", padding="same")

    model = Sequential()
    for layer in (
        Conv2D(32, (5, 5), input_shape=(n_freqs, n_frames, 1), **conv_opts),
        Conv2D(64, (5, 5), **conv_opts),
        Conv2D(1, (10, 10), **conv_opts),
    ):
        model.add(layer)

    # Mean absolute error serves as both the training loss and the reported metric.
    model.compile(
        optimizer=Adam(lr=0.0003),
        loss='mean_absolute_error',
        metrics=['mean_absolute_error'],
    )
    model.summary()
    return model

In [210]:
# Build the network sized to the training features; get_model also prints the layer summary.
model = get_model(X_train.shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 511, 16, 32)       832       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 511, 16, 64)       51264     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 511, 16, 1)        6401      
Total params: 58,497
Trainable params: 58,497
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 100 epochs in shuffled mini-batches of 32, using the held-out
# split as validation data.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    shuffle=True,
    validation_data=(X_test, y_test),
)

In [None]:
# Save the trained model under a timestamped name. The ".h5" extension matches
# the HDF5 format that Keras writes (the previous ".txt" was misleading), and
# the timestamp avoids ":" and spaces, which are invalid or awkward in file
# names on some platforms (notably Windows).
model.save('test-{date:%Y-%m-%d_%H-%M-%S}.h5'.format(date=datetime.datetime.now()))