# Wave to Spectrogram

* 11.9초 단위로 자른 wav file을 STFT(Short-time Fourier Transform)를 이용하여 (time, frequency) domain으로 바꿈
* 바꾼 그림을 각각 그림 파일 (jpg)로 저장
* Singing Voice Separation with Deep U-Net Convolutional Networks 논문 데이터 만드는 방법

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import imageio

import librosa
import librosa.display
import musdb

from IPython import display

In [None]:
# Create the musdb database handle rooted at the local dataset directory
mus = musdb.DB(root_dir='./datasets/musdb18/')

In [None]:
# Fetch every track of the 'train' subset, then report the container type
# and the number of tracks (one per line)
tracks = mus.load_mus_tracks(subsets=['train'])
print(type(tracks), len(tracks), sep='\n')

In [None]:
# List the names of the target sources available for the first track
tracks[0].targets.keys()

In [None]:
# Pick the track that the exploratory cells below operate on and show its name
index = 0
print(tracks[index].name)

In [None]:
# original track - mixture
# The array is transposed before playback (presumably to (channels, samples) — confirm)
# and the player is told the audio is sampled at 44100 Hz.
display.Audio(tracks[index].audio.T, rate=44100)

In [None]:
# Shape of the transposed mixture array
print(tracks[index].audio.T.shape)

In [None]:
# Uncomment one of the lines below to listen to an individual stem
# display.Audio(tracks[index].targets['vocals'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['drums'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['bass'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['other'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['accompaniment'].audio.T, rate=44100)

## Plot a short excerpt

In [None]:
# Separate the first `second` seconds of the mixture into left and right
# channels and peak-normalize each channel.
second = 10
num_samples = 44100 * second  # the 44100 Hz sample rate is hard-coded here
stereo = tracks[index].audio.T
# Work on copies so the in-place division below never writes into the array
# returned by track.audio.
left_wave = stereo[0][:num_samples].copy()
right_wave = stereo[1][:num_samples].copy()
for channel in (left_wave, right_wave):
  # Vectorized peak search; the builtin max() would iterate sample by sample.
  peak = np.abs(channel).max()
  if peak > 0:  # an all-zero excerpt would otherwise turn into NaN
    channel /= peak

In [None]:
# Draw each channel's waveform in its own wide figure, then render both
for channel in (left_wave, right_wave):
  plt.figure(figsize=[18, 3])
  plt.plot(channel)
plt.show()

In [None]:
# Shape of the left-channel excerpt
print(left_wave.shape)

In [None]:
# Plot the first column of the STFT (presumably frame 0 — confirm the axis order).
# NOTE: left_stft is created in the STFT cell that follows; run that cell first.
# The STFT values are complex, so plot their magnitude explicitly instead of
# letting the plot silently drop the imaginary part.
plt.plot(np.abs(left_stft[:, 0]))

In [None]:
# Short-time Fourier Transform of the left channel
# n_fft: number of samples used for each FFT
# hop_length: number of samples between successive frames (the stride)
left_stft = librosa.stft(y=left_wave, n_fft=2048, hop_length=512)
# Report the resulting array shape and the element type, one per line
print(left_stft.shape, type(left_stft[0, 0]), sep='\n')

In [None]:
# Show the magnitude spectrogram twice: first as linear amplitude, then in dB
# relative to its maximum
left_abs = np.abs(left_stft)
for image in (left_abs, librosa.amplitude_to_db(left_abs, ref=np.max)):
  librosa.display.specshow(image)
  plt.colorbar()
  plt.show()

## Normalized spectrogram (dB scale, mapped to [0, 1])

In [None]:
# Lower bound in dB used by _normalize: this level maps to 0 and 0 dB maps to 1
min_level_db = -100
# Offset in dB that spectrogram() subtracts before normalizing
ref_level_db = 20

In [None]:
def spectrogram(y):
  """Return the normalized dB-magnitude spectrogram of waveform `y`.

  The STFT magnitude is converted to decibels, shifted down by
  `ref_level_db`, and then passed through `_normalize`.
  """
  magnitude = np.abs(_stft(y))
  level_db = _amp_to_db(magnitude) - ref_level_db
  return _normalize(level_db)

In [None]:
def _stft(y, n_fft=1024, hop_length=768, win_length=1024):
  """Compute the short-time Fourier transform of `y` with librosa.

  The defaults reproduce the configuration used for dataset generation.
  Pass n_fft=2048, hop_length=512, win_length=2048 for the alternative
  configuration.

  Args:
    y: audio signal, handed to `librosa.stft` unchanged.
    n_fft: FFT size used for each frame.
    hop_length: number of samples between consecutive frames.
    win_length: window length in samples.

  Returns:
    The result of `librosa.stft` for the given settings.
  """
  return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)

In [None]:
def _amp_to_db(x):
  """Convert amplitude values to decibels (20 * log10).

  Amplitudes below 1e-5 are raised to 1e-5 first, so the result never drops
  below about -100 dB and log10 is never evaluated at zero.
  """
  floored = np.maximum(1e-5, x)
  return 20 * np.log10(floored)

In [None]:
def _normalize(S):
  """Map dB values linearly so `min_level_db` becomes 0 and 0 dB becomes 1.

  Values outside that range are clipped to [0, 1].
  """
  scaled = (S - min_level_db) / -min_level_db
  return np.clip(scaled, 0, 1)

In [None]:
# Normalized spectrogram of the left-channel excerpt, and its shape
left_spec = spectrogram(left_wave)
print(left_spec.shape)

In [None]:
# Display the normalized spectrogram in a wide figure with a colorbar
plt.figure(figsize=(16, 4))
librosa.display.specshow(left_spec)
plt.colorbar()
plt.show()

In [None]:
# Data split: cut every track into ~11.9 s segments. For each segment and each
# stereo channel, write one JPG that holds the spectrograms of all sources
# side by side (mixture, vocals, drums, bass, other, accompaniment).
spectrogram_datadir = './datasets/spectrogram_jpg/train'
# Make sure the output directory exists before any image is written.
os.makedirs(spectrogram_datadir, exist_ok=True)
second = 11.90625 # split in about 11.9 seconds
interval = int(second * 44100)  # segment length in samples (44100 Hz hard-coded); loop-invariant
target_names = ['vocals', 'drums', 'bass', 'other', 'accompaniment']
for i, track in enumerate(tracks):
  # Number of complete segments that fit into this track
  number = int(track.audio.T.shape[1] / second / 44100)
  for j in range(number):
    print("# {} track: {}th part".format(i, j))
    segment = slice(j * interval, (j + 1) * interval)
    # Mixture first, then every target source, all cut to the same segment
    sources = [track.audio.T[:, segment]]
    sources += [track.targets[name].audio.T[:, segment] for name in target_names]

    left_specs = []
    right_specs = []
    for wave in sources:
      left_wave = wave[0]
      right_wave = wave[1]
      # resampling
      left_wave_8192 = librosa.resample(left_wave, orig_sr=44100, target_sr=8192)
      # Fix: the right channel was previously resampled from left_wave, which
      # made every "right" image a duplicate of the left one.
      right_wave_8192 = librosa.resample(right_wave, orig_sr=44100, target_sr=8192)

      left_spec = spectrogram(left_wave_8192) # expected (513, 128) shape
      right_spec = spectrogram(right_wave_8192) # expected (513, 128) shape
      print(left_spec.shape)
      left_specs.append(left_spec)
      right_specs.append(right_spec)

    # Stack the per-source spectrograms horizontally, in `sources` order
    left_concat = np.concatenate(left_specs, axis=1)
    right_concat = np.concatenate(right_specs, axis=1)

    display.clear_output(wait=True)
    filename_l = 'track{}.part{}.left.jpg'.format(i, j)
    filename_r = 'track{}.part{}.right.jpg'.format(i, j)
    print(filename_l, filename_r)

    # Keep only the first 512 rows of each image.
    # NOTE(review): the arrays are floats in [0, 1]; how imageio converts them
    # to 8-bit may depend on the installed version — confirm.
    imageio.imwrite(os.path.join(spectrogram_datadir, filename_l), left_concat[:512])
    imageio.imwrite(os.path.join(spectrogram_datadir, filename_r), right_concat[:512])