# Synthesis

### Examine the Dataset

In [1]:
import librosa.display
import librosa.util
import matplotlib.pyplot as plt
from IPython.display import Audio
import os
import scipy
import numpy as np

In [2]:
raw_dir = "./data/raw/edinburgh-noisy-speech-db/"
log_trainset = "log_trainset_28spk.txt"
audio_files = []
# list files
f = open(raw_dir + log_trainset, "r")
for x in f:
    audio_files.append(x.split()[0] + ".wav")
f.close()

In [71]:
clean_audio_dir = "./data/raw/edinburgh-noisy-speech-db/clean_trainset_28spk_wav/"
audio_file = audio_files[69]
clean_audio_f = clean_audio_dir + audio_file
Audio(clean_audio_f)

In [73]:
y_noise, sr = librosa.load(clean_audio_f)
# you can take any distribution from https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.random.html
noise_amp = 0.15*np.random.uniform()*np.amax(y_noise)
y_noise = y_noise.astype('float64') + noise_amp * np.random.normal(size=y_noise.shape[0])
Audio(y_noise, rate=sr)

### Short-Time Fourier Transform

First, convert samples into STFT

In [74]:
window_length = 256;
win = scipy.signal.hann(window_length,"periodic");
hop_length = round(0.25 * window_length);
fft_length = window_length;

In [75]:
# downsampling to 8k
input_fs = sr;
fs = 8e3;

y_noise = librosa.resample(y_noise, target_sr = fs, orig_sr = sr)
sr = fs

In [76]:
Audio(y_noise, rate = sr)

In [77]:
n = len(y_noise)
y_pad_noise = librosa.util.fix_length(y_noise, n + fft_length // 2)
D_noise = librosa.stft(y_pad_noise.astype(np.float32),
    n_fft = fft_length,
    win_length = window_length,
    window = win,
    hop_length = hop_length)
magnitude, phase = librosa.magphase(D_noise)

In [78]:
num_features  = 129;
num_segments  = 8;

predictors = []

# Create array of targets and predictors
for segment_index in range(magnitude.shape[1] - num_segments + 1):
    predictors.append(magnitude[:, segment_index:segment_index + num_segments])

### Load Model

In [79]:
import torch
import os
import sys
# TODO: fix model saving to save model as recommended
# https://pytorch.org/docs/master/notes/serialization.html#recommended-approach-for-saving-a-model
# then loading the model should work properly
from model.baseline_model import FullyConnectedBaseline as network

model_to_test = "Baseline_FullyConnected/0506_135304"

model = network(n_features = 129, n_segments = 8)
model_path = "./saved/" + model_to_test + "/model_best.pth"
model.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict'])

In [80]:
predictors = np.array(predictors)
predictors.shape

(324, 129, 8)

In [81]:
sample = torch.from_numpy(predictors)
sample.shape

torch.Size([324, 129, 8])

In [82]:
sample = sample.view(sample.shape[0], -1)
sample.shape

torch.Size([324, 1032])

In [83]:
y_pred = model(sample)
y_pred = y_pred.detach().numpy().transpose()

In [84]:
phase[:,num_segments - 1:].shape

(129, 324)

In [85]:
y_pred.shape

(129, 324)

Then, invert STFT back to audio format

In [86]:
D_rec = y_pred * phase[:,num_segments - 1:] 

In [87]:
audio_rec = librosa.istft(D_rec,
    length=n,
    win_length = window_length,
    window = win,
    hop_length = hop_length)

In [88]:
Audio(audio_rec, rate = sr)

## Denoised signal

In [89]:
from utils.snr_calc import *

In [90]:
magnitude, phase = librosa.magphase(y_pred)
sr=8e3
print(magnitude.shape)

(129, 324)


In [91]:
print(f"SNR_db with function: {10*np.log10(DER_SNR(spectral_flux(magnitude, sr)))}")
print(y_pred.shape)

SNR_db with function: 32.35684421959705
(129, 324)


In [92]:
signaltonoise(magnitude)

23.038296699523926

## Noisy signal

In [93]:
D_noise = librosa.stft(y_pad_noise.astype(np.float32),
    n_fft = fft_length,
    win_length = window_length,
    window = win,
    hop_length = hop_length)
magnitude, phase = librosa.magphase(D_noise)
sr=8e3
print(magnitude.shape)

(129, 331)


In [94]:
print(f"SNR_db with function: {10*np.log10(DER_SNR(spectral_flux(magnitude, sr)))}")

SNR_db with function: 29.71228850676269


In [95]:
def signaltonoise(a, axis=0, ddof=0):
    a = np.asanyarray(a)
    m = a.mean(axis)
    sd = a.std(axis=axis, ddof=ddof)
    return 10*np.log10(np.sum(m/sd))

In [96]:
signaltonoise(magnitude)

23.09335231781006

## Clean signal

In [97]:
y_clean, sr = librosa.load(clean_audio_f)
y_clean = librosa.resample(y_clean, target_sr = fs, orig_sr = sr)

n = len(y_clean)
y_pad_clean = librosa.util.fix_length(y_clean, n + fft_length // 2)
D_clean = librosa.stft(y_pad_clean.astype(np.float32),
    n_fft = fft_length,
    win_length = window_length,
    window = win,
    hop_length = hop_length)
magnitude, phase = librosa.magphase(D_clean)

In [98]:
print(f"SNR_db with function: {10*np.log10(DER_SNR(spectral_flux(magnitude, sr)))}")

SNR_db with function: 25.759288649333175


In [99]:
signaltonoise(magnitude)

20.729126930236816