# Synthesis

### Examine the Dataset

In [1]:
import librosa.display
import librosa.util
import matplotlib.pyplot as plt
from IPython.display import Audio
import os
import scipy
import numpy as np

In [None]:
raw_dir = "./data/raw/edinburgh-noisy-speech-db/"
log_trainset = "log_trainset_28spk.txt"
audio_files = []
# list files
f = open(raw_dir + log_trainset, "r")
for x in f:
    audio_files.append(x.split()[0] + ".wav")
f.close()

In [None]:
clean_audio_dir = "./data/raw/edinburgh-noisy-speech-db/clean_trainset_28spk_wav/"
audio_file = audio_files[200]
clean_audio_f = clean_audio_dir + audio_file
Audio(clean_audio_f)

In [None]:
y_noise, sr = librosa.load(clean_audio_f)
# you can take any distribution from https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.random.html
noise_amp = 0.15*np.random.uniform()*np.amax(y_noise)
y_noise = y_noise.astype('float64') + noise_amp * np.random.normal(size=y_noise.shape[0])
Audio(y_noise, rate=sr)

### Short-Time Fourier Transform

First, convert samples into STFT

In [8]:
window_length = 1024;
win = scipy.signal.hann(window_length,"periodic");
hop_length = round(0.25 * window_length);
fft_length = window_length;

In [9]:
# downsampling to 8k
input_fs = sr;
fs = 8e3;

y_noise = librosa.resample(y_noise, target_sr = fs, orig_sr = sr)
sr = fs

In [10]:
Audio(y_noise, rate = sr)

In [11]:
n = len(y_noise)
y_pad_noise = librosa.util.fix_length(y_noise, n + fft_length // 2)
D_noise = librosa.stft(y_pad_noise.astype(np.float32),
    n_fft = fft_length,
    win_length = window_length,
    window = win,
    hop_length = hop_length)
magnitude, phase = librosa.magphase(D_noise)

In [20]:
num_features  = 129;
num_segments  = 8;

predictors = []

# Create array of targets and predictors
for segment_index in range(magnitude.shape[1] - num_segments + 1):
    predictors.append(magnitude[:, segment_index:segment_index + num_segments])

### Load Model

In [29]:
import torch
import os
import sys
# TODO: fix model saving to save model as recommended
# https://pytorch.org/docs/master/notes/serialization.html#recommended-approach-for-saving-a-model
# then loading the model should work properly
from model.baseline_model import FullyConnectedBaseline as network

model_to_test = "Baseline_FullyConnected/0506_114819"

model = network(n_features = 1024, n_segments = 8)
model_path = "./saved/" + model_to_test + "/model_best.pth"
model.load_state_dict(torch.load(model_path)['state_dict'])

RuntimeError: Error(s) in loading state_dict for FullyConnectedBaseline:
	size mismatch for fc1.weight: copying a param with shape torch.Size([1024, 1032]) from checkpoint, the shape in current model is torch.Size([1024, 8192]).
	size mismatch for fc3.weight: copying a param with shape torch.Size([129, 1024]) from checkpoint, the shape in current model is torch.Size([1024, 1024]).
	size mismatch for fc3.bias: copying a param with shape torch.Size([129]) from checkpoint, the shape in current model is torch.Size([1024]).

In [22]:
predictors = np.array(predictors)
predictors.shape

(94, 513, 8)

In [23]:
sample = torch.from_numpy(predictors)
sample.shape

torch.Size([94, 513, 8])

In [24]:
sample = sample.view(sample.shape[0], -1)
sample.shape

torch.Size([94, 4104])

In [25]:
y_pred = model(sample)
y_pred = y_pred.detach().numpy().transpose()

RuntimeError: size mismatch, m1: [94 x 4104], m2: [1032 x 1024] at /Users/soumith/b101_2/2019_02_08/wheel_build_dirs/wheel_3.6/pytorch/aten/src/TH/generic/THTensorMath.cpp:940

In [18]:
phase[:,num_segments - 1:].shape

(513, 94)

In [19]:
y_pred.shape

NameError: name 'y_pred' is not defined

Then, invert STFT back to audio format

In [22]:
D_rec = y_pred * phase[:,num_segments - 1:] 

In [23]:
audio_rec = librosa.istft(D_rec,
    length=n,
    win_length = window_length,
    window = win,
    hop_length = hop_length)

In [24]:
Audio(audio_rec, rate = sr)