In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from google.colab import drive
import librosa
import h5py
# Load the TensorBoard notebook extension
%load_ext tensorboard
%tensorflow_version 2.x
import datetime, os
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Read the clean targets and noisy mixtures fully into memory while the file is open.
with h5py.File('/content/drive/My Drive/Colab Notebooks/ThesisData/Multichannel/fullmixture_5k_2channels_large_phase.hdf5', 'r') as hf:
    # Index the datasets instead of using hf.get(): get() returns None for a
    # missing key, which np.array() would silently turn into a 0-d object array
    # and only fail much later. Indexing raises KeyError right here instead.
    # `[()]` reads the whole dataset into a NumPy array.
    clear_train = hf['clear_timit_train'][()]
    mixture_train = hf['mixture_timit_train'][()]

In [4]:
# Global z-score normalisation of the noisy mixtures (one mean/std over the whole array).
noisyMean, noisyStd = np.mean(mixture_train), np.std(mixture_train)
print(noisyMean)
print(noisyStd)
mixture_train = (mixture_train - noisyMean) / noisyStd

# Same treatment for the clean targets, using their own statistics.
cleanMean, cleanStd = np.mean(clear_train), np.std(clear_train)
print(cleanMean)
print(cleanStd)
clear_train = (clear_train - cleanMean) / cleanStd

0.059830967664290226
1.8274894293505253
0.030857038
1.7579846


### Channels Last Approach
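We will reshape the data into the following channels-last structure:

- Network input (noisy mixture): `(SAMPLE_SIZE, 129, 8, CHANNEL_NUMBER)`
- Network target (clean signal): `(SAMPLE_SIZE, 129, 1, 1)`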


In [5]:
# Channels-last layout for Keras: add a trailing channel axis to the mixtures and
# two singleton axes (width, channel) to the clean targets.
mixture_train_shaped = mixture_train.reshape(mixture_train.shape[:3] + (1,))
clear_train_shaped = clear_train.reshape(clear_train.shape[:2] + (1, 1))
print(mixture_train_shaped.shape)
print(clear_train_shaped.shape)

(297871, 129, 8, 1)
(297871, 129, 1, 1)


In [6]:
from tensorflow.keras.layers import Conv2D, Input, LeakyReLU, Flatten, Dense, Reshape, Conv2DTranspose, BatchNormalization, Activation
from tensorflow.keras import Model, Sequential


def _conv_bn_act(tensor, filters, kernel_size, activation=LeakyReLU):
    """Apply Conv2D -> BatchNormalization -> activation to `tensor`.

    Every convolution in this network uses strides=(1, 100) and 'same' padding;
    `activation` is a layer class that is instantiated with default arguments.
    """
    tensor = Conv2D(filters, kernel_size, strides=(1, 100), padding='same')(tensor)
    tensor = BatchNormalization()(tensor)
    return activation()(tensor)


# Input: 129 frequency bins x 8 frames x 1 channel.
input_img = Input(shape=(129,8,1))

# The first convolution spans all 8 frames, collapsing the width axis to 1.
x = _conv_bn_act(input_img, 18, (9, 8))

# Five repetitions of a 30 -> 8 -> 18 -> 30 filter stack; the last stage of each
# repetition uses a plain ReLU instead of LeakyReLU.
for block_index in range(5):
    x = _conv_bn_act(x, 30, (5, 1))
    x = _conv_bn_act(x, 8, (9, 1))
    x = _conv_bn_act(x, 18, (9, 1))
    x = _conv_bn_act(x, 30, (5, 1), activation=layers.ReLU)

# Single-filter output convolution: one value per frequency bin.
decoded = Conv2D(1, (129, 1), strides=(1, 100), padding='same')(x)
model = Model(inputs=input_img, outputs=decoded)

optimizer = keras.optimizers.Adam(
    learning_rate=0.0015,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    amsgrad=True,
    name='Adam',
)

model.compile(optimizer=optimizer, loss='mean_squared_error')
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 129, 8, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 129, 1, 18)        1314      
_________________________________________________________________
batch_normalization (BatchNo (None, 129, 1, 18)        72        
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 129, 1, 18)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 129, 1, 30)        2730      
_________________________________________________________________
batch_normalization_1 (Batch (None, 129, 1, 30)        120       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 129, 1, 30)        0     

In [0]:
# Draw the layer graph of `model`, annotated with tensor shapes, at a reduced dpi.
tf.keras.utils.plot_model(model, show_shapes=True, dpi=64)

In [0]:
# Warm-start from a previously saved checkpoint. This is the same file the
# ModelCheckpoint callback in the training cell writes, so it only exists after
# an earlier training run -- presumably this cell is skipped on a first run.
model.load_weights('/content/drive/My Drive/Colab Notebooks/ThesisData/Multichannel/Model/2channel_phase_v1_ckpt.h5')

In [0]:
from datetime import datetime
print(mixture_train_shaped.shape)
print(clear_train_shaped.shape)
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='/content/drive/My Drive/Colab Notebooks/ThesisData/Multichannel/Model/2channel_phase_v1_ckpt.h5', 
                                                         monitor='val_loss', save_best_only=True)

training_history = model.fit(mixture_train_shaped, clear_train_shaped,
         validation_split=0.2,
         epochs=75,
         batch_size=128, 
         shuffle=True,
         verbose=1,
         callbacks=[checkpoint_callback,tensorboard_callback],
        )
%tensorboard --logdir logs/scalars
model.save('/content/drive/My Drive/Colab Notebooks/ThesisData/Multichannel/Model/2channel_phase_v1.h5') 

(297871, 129, 8, 1)
(297871, 129, 1, 1)
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
  21/1862 [..............................] - ETA: 28s - loss: 0.9751

In [0]:
# Open the TensorBoard dashboard on the directory the training cell logs into.
%tensorboard --logdir logs/scalars

### Loading an Existing Model to Continue Training

### Code Evaluation
Testing the success of the algorithm with the dataset.

In [0]:
# Recreate the exact same model, including its weights and the optimizer
# NOTE(review): this rebinds `model`, replacing the network built and trained
# above. The file loaded here ('4channel_5k_v3.h5') is not the one the training
# cell saves ('2channel_phase_v1.h5') -- confirm this is the intended model.
model = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/ThesisData/Multichannel/Model/4channel_5k_v3.h5')

# Show the model architecture
# model.summary()

## Testing the Model
Here I will be continuing to improve the model

In [0]:
# Module-level STFT settings. convert_to_audio() and stft_generator() re-declare
# the same STFT values locally, so these copies are not read by those helpers.
counter = 0  # not read by the helpers in this cell; stft_generator() uses its own local counter
FrameSize = 256  # samples per frame: 32 ms at the helpers' default fs=8000 (512 would give 32 ms at 16 kHz)
Overlap = round(0.75 * FrameSize)  # NOTE: the helpers pass this value to librosa as hop_length
FFTSize = FrameSize  # FFT window size=FRAMESIZE
FrequencyBins = FrameSize // 2 + 1  # stft_matrix:np.ndarray [shape=(1 + n_fft/2, t)]
NumSegments = 8  # consecutive STFT frames stacked into one network input
# Normalisation statistics that convert_to_audio() reads as globals when undoing
# the z-score. These hard-coded values are replaced later in the notebook, where
# the same four names are recomputed from the test file.
noisyMean = 0.082
noisyStd  = 0.8
cleanMean = 0.236
cleanStd  = 1.126


def convert_to_audio(outname,stft_magnitude,stft_phase,noisyAudio,fs=8000,):
    """Rebuild a waveform from a normalised magnitude spectrogram and write it as WAV.

    The magnitude is de-normalised with the module-level statistics
    (`noisyStd`/`noisyMean` or `cleanStd`/`cleanMean`), combined with
    `stft_phase` into a complex STFT, inverted with `librosa.istft` and written
    with `scipy.io.wavfile.write`.

    Args:
        outname: Path of the WAV file to write.
        stft_magnitude: Z-score-normalised STFT magnitudes. Must broadcast
            against `stft_phase`; callers in this notebook pass
            (frequency_bins, frames) arrays.
        stft_phase: STFT phase in radians, same layout as `stft_magnitude`.
        noisyAudio: True to undo the normalisation with the noisy statistics,
            False to use the clean statistics.
        fs: Sample rate written into the WAV header.

    Returns:
        None. The result is the file written to `outname`.
    """
    # Reference statistics kept from an earlier experiment ("Normal Case"):
    # noisyMean = 0.0143
    # noisyStd  = 0.0805
    # cleanStd  = 0.286
    # cleanMean = 1.1955631

    # STFT geometry; it has to match the analysis settings used in stft_generator().
    FrameSize = 256  # 512 under 16KHz time=32ms(normally 20~30ms )
    Overlap   = round(0.75 * FrameSize)  # handed to librosa as hop_length
    FFTSize   = FrameSize  # FFT window size=FRAMESIZE

    # Undo the z-score with the statistics matching the kind of signal.
    if noisyAudio:
        stft_magnitude_nonNormal = noisyStd*stft_magnitude+noisyMean
    else:
        stft_magnitude_nonNormal = cleanStd*stft_magnitude+cleanMean
    final_STFT = stft_magnitude_nonNormal*np.exp(1j*stft_phase)

    # Audio Converter
    # window='hamming' lets librosa build the periodic Hamming window itself, which
    # is the same window as scipy.signal.hamming(FrameSize, sym=False). That
    # function is no longer available from the scipy.signal namespace in recent
    # SciPy releases, and `scipy` was not imported when this cell is defined.
    converted_audio = librosa.istft(final_STFT, hop_length=Overlap, win_length=FFTSize,
                                    window='hamming')
    wavfile.write(outname, rate=fs, data=converted_audio)

def stft_generator(filename,noisyAudio,fs=8000):
    """Load an audio file and return its STFT magnitude features and phase.

    Args:
        filename: Path of the audio file, loaded with `librosa.load(mono=False)`.
        noisyAudio: False for a clean reference file, True for a noisy mixture
            (selects the output layout, see Returns).
        fs: Sample rate the file is resampled to on load.

    Returns:
        A dict with two entries:

        * clean file (`noisyAudio` False): 'stft' is the transposed magnitude
          of the STFT and 'phase' its angle in radians.
        * noisy file (`noisyAudio` True): 'stft' has shape
          (frames, FrequencyBins, NumSegments, channelNumber); entry `i` holds
          frame `i` together with the 7 frames before it, the start being
          padded with a copy of the first frames. 'phase' is the phase of the
          channel that was processed.

    NOTE(review): only the first channel of the file is processed
    (`[0:1]`), although `channelNumber` slots are allocated; the remaining
    channel slots therefore stay all-zero. Confirm whether this is intended.
    """
    FrameSize = 256  # 512 under 16KHz time=32ms(normally 20~30ms )
    Overlap = round(0.75 * FrameSize)  # handed to librosa as hop_length
    FFTSize = FrameSize  # FFT window size=FRAMESIZE
    FrequencyBins = FrameSize // 2 + 1  # stft_matrix:np.ndarray [shape=(1 + n_fft/2, t)]
    NumSegments = 8
    channelNumber = 2

    # Reading the audio data
    audio_data, _ = librosa.load(filename, mono=False, sr=fs)  # if sr=None to read raw sample_rate

    # window='hamming' makes librosa build the periodic Hamming window, i.e. the
    # same window as scipy.signal.hamming(FrameSize, sym=False), without relying
    # on a function that recent SciPy releases no longer expose in scipy.signal.
    if not noisyAudio:
        clean_stft = librosa.stft(audio_data, n_fft=FrameSize, hop_length=Overlap,
                                  win_length=FFTSize, window='hamming')
        # Phase
        audio_phase = np.angle(clean_stft)
        # Magnitude matrix
        stft_segments = np.transpose(np.abs(clean_stft))
        return {'stft': stft_segments, 'phase': audio_phase}

    # np.atleast_2d lets a mono file (1-D array) be treated as a single channel;
    # without it, iterating the slice below would yield scalar samples.
    channels = np.atleast_2d(audio_data)[0:1]
    for counter, audio in enumerate(channels):
        audio = np.nan_to_num(audio)
        noisySTFT = librosa.stft(audio, n_fft=FrameSize, hop_length=Overlap,
                                 win_length=FFTSize, window='hamming')
        audio_phase = np.angle(noisySTFT)
        # Magnitude matrix
        noisySTFT = np.abs(noisySTFT)
        # Prepend the first NumSegments-1 frames so every original frame gets a
        # full NumSegments-wide context window.
        new_noisy_STFT = np.concatenate((noisySTFT[:,0:NumSegments-1], noisySTFT), axis=1)
        num_windows = new_noisy_STFT.shape[1] - NumSegments + 1
        # Allocate the output once, when the first channel's frame count is known.
        if counter == 0:
            stft_segments = np.zeros((num_windows, FrequencyBins, NumSegments, channelNumber))

        for index in range(num_windows):
            stft_segments[index,:,:,counter] = new_noisy_STFT[:, index:index+NumSegments]
    return {'stft': stft_segments, 'phase': audio_phase}

In [0]:
# Creating the magnitude and phase spectrums of noisy signal
from scipy.io import wavfile
import scipy
data_folder = '/content/drive/My Drive/Colab Notebooks/ThesisData/Multichannel/Data/4Channel/'
stft_segments = stft_generator(data_folder + 'mixture_1550_n5.wav',True)
clean_stft_segments = stft_generator(data_folder + 'timit1550.wav',False)
absSTFT = stft_segments['stft']
noisyPhase = stft_segments['phase']
cleanPhase = clean_stft_segments['phase']
clean_stft_segments = clean_stft_segments['stft']

# Mean-STD Calculations - Noisy
noisyMean     = np.mean(absSTFT);
noisyStd      = np.std(absSTFT);
absSTFT_normalized = (absSTFT - noisyMean)/noisyStd

# Mean-STD Calculations - Clean
cleanMean     = np.mean(clean_stft_segments);
cleanStd      = np.std(clean_stft_segments);
clean_absSTFT_normalized = (clean_stft_segments - cleanMean)/cleanStd
print(absSTFT.shape)
print(clean_stft_segments.shape)

(142, 129, 8, 2)
(142, 129)


In [0]:
# Bring both arrays into the 4-D layout the model works with, then run inference.
noisySTFT = absSTFT_normalized.reshape(absSTFT_normalized.shape[:3] + (2,))
cleanSTFT = clean_absSTFT_normalized.reshape(clean_absSTFT_normalized.shape[:2] + (1, 1))
print(noisySTFT.shape)
print(cleanSTFT.shape)

cleanedup_audio = model.predict(noisySTFT)
print(cleanedup_audio.shape)

(142, 129, 8, 2)
(142, 129, 1, 1)
(142, 129, 1, 1)


In [0]:
print('\n# Evaluate on test data')
# The recorded output of this cell is a single scalar (the loss); no accuracy
# value is returned, so the label must not claim one.
results = model.evaluate(noisySTFT, cleanSTFT,verbose=1)
print('test loss:', results)


# Evaluate on test data
test loss, test acc: 0.5344913601875305


In [0]:
# Drop the singleton axes again so prediction and reference are 2-D arrays.
final_cleanedup_audio = cleanedup_audio.reshape(noisySTFT.shape[:2])
print(final_cleanedup_audio.shape)
print(noisyPhase.shape)

final_cleanSTFT = cleanSTFT.reshape(clean_stft_segments.shape[:2])
print(final_cleanSTFT.shape)
print(cleanPhase.shape)

(142, 129)
(129, 142)
(142, 129)
(129, 142)


### Write to the file
The cell below converts the cleaned-up spectrogram back into a waveform and writes it to Google Drive as a WAV file.


In [0]:
# Resynthesise the network output. The prediction is transposed so that it has
# the same (129, frames) layout as the phase array printed above.
# NOTE(review): this call combines the prediction with `cleanPhase` (the phase
# of the clean reference) and passes noisyAudio=True, i.e. it de-normalises with
# noisyStd/noisyMean. `noisyPhase` is computed but not used -- confirm both
# choices are intentional.
convert_to_audio(data_folder + 'cleaned_1550_v1_2k.wav', np.transpose(final_cleanedup_audio),cleanPhase,True)
# Disabled: resynthesise the clean reference itself for comparison.
# convert_to_audio('/content/drive/My Drive/Colab Notebooks/ThesisData/Basic_NoDelay/TestData/4Channel/perfect0_8K.wav',
#                  np.transpose(final_cleanSTFT),cleanPhase,False)

# PLAYGROUND FOR GROUND TRUTH

In [0]:
# Mean squared error between the network output and the normalised clean reference.
mse = np.mean(np.square(final_cleanedup_audio - final_cleanSTFT))
print(mse)

# Same measure for another file, analysed with the clean-file code path and
# normalised with the current noisy statistics.
stft_segments = stft_generator('/content/drive/My Drive/Colab Notebooks/ThesisData/Basic_NoDelay/TestData/timit2145_n1.wav', False)
absSTFT, noisyPhase = stft_segments['stft'], stft_segments['phase']
absSTFT_normalized = (absSTFT - noisyMean) / noisyStd
print(absSTFT_normalized.shape)

mse = np.mean(np.square(absSTFT_normalized - final_cleanSTFT))
print(mse)

In [0]:
# Creating the magnitude and phase spectrums of noisy signal
from scipy.io import wavfile
import scipy
stft_segments = stft_generator('/content/drive/My Drive/Colab Notebooks/ThesisData/Basic_NoDelay/TestData/timit2145.wav',True)
clean_stft_segments = stft_generator('/content/drive/My Drive/Colab Notebooks/ThesisData/Basic_NoDelay/TestData/timit2145.wav',False)
absSTFT = stft_segments['stft']
noisyPhase = stft_segments['phase']
absSTFT_normalized = (absSTFT - cleanMean)/cleanStd
cleanPhase = clean_stft_segments['phase']
clean_stft_segments = clean_stft_segments['stft']
clean_absSTFT_normalized = (clean_stft_segments - cleanMean)/cleanStd
print(absSTFT.shape)
print(clean_stft_segments.shape)

(129, 8, 325)
(129, 325)


### Demo Code


In [0]:
# For each of the 8 stacked segments, print the summed difference between that
# segment's magnitudes and the clean magnitudes.
clean_flat = np.ravel(clean_stft_segments)
for i in range(8):
  segment_flat = np.ravel(absSTFT[:, i, :])
  print(sum(segment_flat - clean_flat))

# NOTE(review): `new_model` is not defined in any cell visible in this notebook.
cleanedup_audio = new_model.predict(noisySTFT)
print(absSTFT.shape)
print(clean_stft_segments.shape)


-5.521531762433824
-4.648348442810402
-3.690907558631352
-2.8582976624525145
-2.064867725527165
-1.2853483124349623
-0.4730873316739235
0.0
(129, 8, 325)
(129, 325)


In [0]:
# Reshape experiment: each array is reshaped to a 4-D layout, flattened, and
# reshaped back to the shape of its normalised source. All three steps use
# NumPy's default C order, so the element order never changes: after this cell
# `noisySTFT` / `cleanSTFT` hold the same values, in the same layout, as
# `absSTFT_normalized` / `clean_absSTFT_normalized`.
noisySTFT = np.reshape(absSTFT_normalized, (absSTFT_normalized.shape[2],absSTFT_normalized.shape[0], absSTFT_normalized.shape[1], 1));
cleanSTFT = np.reshape(clean_absSTFT_normalized, (clean_absSTFT_normalized.shape[1], clean_absSTFT_normalized.shape[0], 1, 1));
noisySTFT = np.ravel(noisySTFT)
print(noisySTFT.shape)
noisySTFT = np.reshape(noisySTFT, (absSTFT_normalized.shape[0],absSTFT_normalized.shape[1], absSTFT_normalized.shape[2]));
print(noisySTFT.shape)
cleanSTFT = np.ravel(cleanSTFT)
print(cleanSTFT.shape)
cleanSTFT = np.reshape(cleanSTFT, (clean_absSTFT_normalized.shape[0], clean_absSTFT_normalized.shape[1]));
print(cleanSTFT.shape)

# Same per-segment check as in the demo cell, now on the normalised values.
for i in range(8):
  print(sum(np.ravel(noisySTFT[:,i,:])-np.ravel(cleanSTFT[:,:])))
# Disabled: inference and shape checks with `new_model`.
# cleanedup_audio = new_model.predict(noisySTFT)
# print(noisySTFT.shape)
# print(cleanSTFT.shape)
# print(cleanedup_audio.shape)
# print(noisyPhase.shape)

(335400,)
(129, 8, 325)
(41925,)
(129, 325)
-6.570148158180036
-5.531115976335604
-4.391822693334616
-3.4010702847510186
-2.456939516357768
-1.529361379701967
-0.5628228761593751
0.00012072501257150217


**Evaluation in numbers**


In [0]:
print('\n# Evaluate on test data')
# NOTE(review): `new_model` is not defined in any cell visible in this notebook.
# The recorded output of this cell is a single scalar (the loss); no accuracy
# value is returned, so the label must not claim one.
results = new_model.evaluate(noisySTFT, cleanSTFT,verbose=1)
print('test loss:', results)


# Evaluate on test data
test loss, test acc: 0.707741042226553


In [0]:
# Compare a small 2x2 patch and the shapes of the transposed round-trip array
# against the normalised clean spectrogram.
final_audio = np.squeeze(cleanSTFT).T
print(final_audio[1:3, 1:3])
print(clean_absSTFT_normalized[1:3, 1:3])
print(final_audio.shape)
print(clean_absSTFT_normalized.shape)
print(cleanPhase.shape)

[[-0.28905934 -0.26405573]
 [-0.26985237 -0.3025639 ]]
[[-0.29344794 -0.2982164 ]
 [-0.3020195  -0.30220264]]
(129, 325)
(129, 325)
(129, 325)


In [0]:
# Resynthesise the clean reference itself: the normalised clean magnitudes are
# de-normalised with cleanStd/cleanMean (noisyAudio=False) and combined with the
# clean phase, then written to Drive.
convert_to_audio('/content/drive/My Drive/Colab Notebooks/ThesisData/Basic_NoDelay/TestData/timit2145furka_cleanedv2.wav',
                 clean_absSTFT_normalized,cleanPhase,False)