# Convert waveforms to spectrograms stored in TFRecord format

* Downsampling: 44100 Hz -> 8192 Hz
* track 단위로 stft를 수행 (train 기준 총 100개의 파일 단위로 fft)
* track 마다 시간이 많이 달라 (20sec ~ 10min for train) track을 일정 구간으로 자름
  * 실제로는 track을 자르지 않고 stft 한 결과인 spectrogram을 time 축으로 자름
* spectrogram의 input_data shape이 (512, 128) 이 되도록 맞춤
* 데이터 하나는 (512, 192) shape의 spectrogram 임
  * 이러면 노래 마지막 부분이 잘리는데 잘리는 부분은 바로 전 part에 merge
  * 그래서 마지막 부분 time_step: 192 <= time_step < 384
* Singing Voice Separation with Deep U-Net Convolutional Networks 논문 데이터 만드는 방법

In [None]:
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import librosa
import librosa.display
import musdb

from IPython import display

import tensorflow as tf

In [None]:
# Open the musdb dataset rooted at ./datasets/musdb18/.
mus = musdb.DB(root_dir='./datasets/musdb18/')

In [None]:
# Load the tracks of the split selected by `split_name` ('train' or 'test').
# `split_name` and `tracks` are notebook globals that later cells read.
split_name = 'train'
assert split_name in ['train', 'test']

tracks = mus.load_mus_tracks(subsets=[split_name])
# Sanity check: container type and number of loaded tracks.
print(type(tracks))
print(len(tracks))

In [None]:
# check for total time (minute, sec) info
# for i, track in enumerate(tracks):
#   sec = track.audio.T.shape[1]/44100
#   minute = int(sec / 60)
#   sec = sec - minute * 60
#   print("{}th: {} min {:.2f} sec".format(i, minute, sec))

In [None]:
# Show the target names available for the first track.
tracks[0].targets.keys()

In [None]:
# Position in `tracks` of the track inspected by the following cells.
index = 0
print(tracks[index].name)

### Listen to the track

In [None]:
# original track - mixture
# original track - mixture
# The audio array is transposed before playback; the rate is hard-coded to 44100.
display.Audio(tracks[index].audio.T, rate=44100)

In [None]:
# Shape of the transposed audio array of the selected track.
print(tracks[index].audio.T.shape)

In [None]:
# if you want to listen in each stem source then uncomment them
# display.Audio(tracks[index].targets['vocals'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['drums'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['bass'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['other'].audio.T, rate=44100)
# display.Audio(tracks[index].targets['accompaniment'].audio.T, rate=44100)

## Plot a short excerpt

In [None]:
# Separate to left and right channels
second = 10
left_wave = tracks[index].audio.T[0][:44100 * second]
left_wave /= max(abs(left_wave))
right_wave = tracks[index].audio.T[1][:44100 * second]
right_wave /= max(abs(right_wave))

In [None]:
# Plot the each channel
plt.figure(figsize=[18, 3])
plt.plot(left_wave)

plt.figure(figsize=[18, 3])
plt.plot(right_wave)
plt.show()

In [None]:
# Number of samples in the left-channel excerpt.
print(left_wave.shape)

In [None]:
plt.plot(left_stft[:,0])

In [None]:
# Short-time Fourier Transform
# n_fft: number of samples used to calculate fft
# hop_length: like concept of stride
left_stft = librosa.core.stft(left_wave, n_fft=2048, hop_length=512)
print(left_stft.shape)
print(type(left_stft[0, 0]))

In [None]:
# Magnitude of the STFT, shown first on a linear scale and then in dB
# relative to its maximum.
left_abs = np.abs(left_stft)
left_abs_db = librosa.amplitude_to_db(left_abs, ref=np.max)
for spec_image in (left_abs, left_abs_db):
  librosa.display.specshow(spec_image)
  plt.colorbar()
  plt.show()

## Normalized spectrogram (probably the standard method)

In [None]:
# Lower end of the dB range: `_normalize` maps min_level_db to 0 and 0 dB to 1.
min_level_db = -100
# Offset in dB that `spectrogram` subtracts before normalizing.
ref_level_db = 20

In [None]:
def spectrogram(y):
  """Returns the normalized dB-magnitude spectrogram of waveform `y`.

  The STFT magnitude is converted to dB, shifted down by `ref_level_db`
  and then mapped into [0, 1] by `_normalize`.

  Args:
    y: The waveform passed to `_stft`.

  Returns:
    The normalized spectrogram array.
  """
  magnitude = np.abs(_stft(y))
  level_db = _amp_to_db(magnitude) - ref_level_db
  return _normalize(level_db)

In [None]:
def _stft(y):
  """Computes the STFT of `y` with a 1024-sample FFT/window and a hop of 512.

  Args:
    y: The waveform to transform.

  Returns:
    The result of `librosa.stft`.
  """
  # Earlier setting kept for reference: n_fft=2048, hop_length=512, win_length=2048
  stft_params = dict(n_fft=1024, hop_length=512, win_length=1024)
  return librosa.stft(y=y, **stft_params)

In [None]:
def _amp_to_db(x):
  return 20 * np.log10(np.maximum(1e-5, x))

In [None]:
def _normalize(S):
  """Rescales dB values so that `min_level_db` maps to 0 and 0 dB maps to 1.

  Values outside that range are clipped to [0, 1].

  Args:
    S: Level(s) in dB.

  Returns:
    The rescaled, clipped value(s).
  """
  rescaled = (S - min_level_db) / -min_level_db
  return np.clip(rescaled, 0, 1)

In [None]:
# Normalized spectrogram of the left-channel excerpt.
left_spec = spectrogram(left_wave)
print(left_spec.shape)

In [None]:
# Display the normalized left-channel spectrogram with a colorbar.
plt.figure(figsize=(16, 4))
librosa.display.specshow(left_spec)
plt.colorbar()
plt.show()

## Convert to tfrecords format

In [None]:
def int64_feature(values):
  """Builds a TF-Feature holding int64 values.

  Args:
    values: A single value, or a tuple/list of values. Anything that is not
      a tuple or list is wrapped in a one-element list.

  Returns:
    A `tf.train.Feature` with its `int64_list` set.
  """
  items = values if isinstance(values, (tuple, list)) else [values]
  int_list = tf.train.Int64List(value=items)
  return tf.train.Feature(int64_list=int_list)


def bytes_feature(values):
  """Builds a TF-Feature holding a single bytes value.

  Args:
    values: One bytes string; it is always wrapped in a one-element list.

  Returns:
    A `tf.train.Feature` with its `bytes_list` set.
  """
  payload = tf.train.BytesList(value=[values])
  return tf.train.Feature(bytes_list=payload)


def float_feature(values):
  """Builds a TF-Feature holding float values.

  Args:
    values: A single value, or a tuple/list of values. Anything that is not
      a tuple or list is wrapped in a one-element list.

  Returns:
    A `tf.train.Feature` with its `float_list` set.
  """
  items = values if isinstance(values, (tuple, list)) else [values]
  float_list = tf.train.FloatList(value=items)
  return tf.train.Feature(float_list=float_list)

In [None]:
def _get_dataset_filename(dataset_dir, split_name, shard_id, num_shards):
  output_filename = 'spectrogram_%s_%05d-of-%05d.tfrecord' % (
      split_name, shard_id, num_shards)
  return os.path.join(dataset_dir, output_filename)

In [None]:
def _write_spectrogram_example(tfrecord_writer, spec_raw, channel):
  """Serializes one spectrogram segment and writes it to a TFRecord file.

  Args:
    tfrecord_writer: An open TFRecord writer.
    spec_raw: The spectrogram segment (numpy array). Its first dimension is
      stored as 'frequency_bin' and its second as 'time_step'.
    channel: Channel tag as bytes (b'left' or b'right').
  """
  features = tf.train.Features(feature={
      # tobytes() replaces the deprecated ndarray.tostring() alias.
      'spec_raw': bytes_feature(spec_raw.tobytes()),
      'frequency_bin': int64_feature(spec_raw.shape[0]),
      'time_step': int64_feature(spec_raw.shape[1]),
      'channel': bytes_feature(channel),
  })
  example = tf.train.Example(features=features)
  tfrecord_writer.write(example.SerializeToString())


def convert_dataset(split_name, dataset_dir, N, num_shards,
                    time_step_for_example=192):
  """Converts the spectrogram of given tracks to a TFRecord dataset.

  Reads the first N entries of the module-level `tracks` list. For every
  track, the mixture and its five targets (vocals, drums, bass, other,
  accompaniment) are resampled from 44100 to 8192, converted with
  `spectrogram` and stacked along a third axis, separately for the left and
  the right channel. Each stacked spectrogram is cut along the time axis into
  segments of `time_step_for_example` frames. The last full segment is
  merged with the leftover frames, so the final example of a channel holds
  between `time_step_for_example` and `2 * time_step_for_example - 1` frames.
  A track with fewer than `time_step_for_example` frames is written as one
  shorter example. Only the first 512 frequency rows are kept.

  Args:
    split_name: The name of the dataset, either 'train' or 'test'.
    dataset_dir: The directory where the converted datasets are stored; the
      shards are written to `<dataset_dir>/<split_name>`, which is created
      if it does not exist.
    N: number of total examples # train: 100, test: 50
    num_shards: number of shards
    time_step_for_example: number of time frames per example.
  """
  assert split_name in ['train', 'test']

  # data split
  spectrogram_datadir = os.path.join(dataset_dir, split_name)
  print(spectrogram_datadir)
  # The TFRecord writer does not create missing directories.
  if not os.path.isdir(spectrogram_datadir):
    os.makedirs(spectrogram_datadir)

  # Ceiling division: truncating here would leave the last N % num_shards
  # tracks unassigned whenever N is not a multiple of num_shards.
  num_per_shard = (N + num_shards - 1) // num_shards

  frequency_bins = 512
  channel_names = [b'left', b'right']
  target_names = ['vocals', 'drums', 'bass', 'other', 'accompaniment']

  for shard_id in range(num_shards):
    output_filename = _get_dataset_filename(
        spectrogram_datadir, split_name, shard_id, num_shards)
    print('Writing', output_filename)

    with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer:
      start_ndx = shard_id * num_per_shard
      end_ndx = min((shard_id + 1) * num_per_shard, N)

      for i in range(start_ndx, end_ndx):
        sys.stdout.write('\r>> Converting spectrogram %d/%d shard %d\n' % (
            i+1, N, shard_id))
        sys.stdout.flush()

        track = tracks[i]
        # Mixture first, then the targets, each as a (channel, sample) array.
        sources = [track.audio.T]
        sources.extend(track.targets[name].audio.T for name in target_names)

        left_specs = []
        right_specs = []
        for wave in sources:
          # resampling
          left_wave_8192 = librosa.resample(wave[0], orig_sr=44100, target_sr=8192)
          right_wave_8192 = librosa.resample(wave[1], orig_sr=44100, target_sr=8192)
          left_specs.append(spectrogram(left_wave_8192))
          right_specs.append(spectrogram(right_wave_8192))

        # One slice per source along the last axis.
        left_spec_concat = np.stack(left_specs, axis=2)
        right_spec_concat = np.stack(right_specs, axis=2)

        num_split = left_spec_concat.shape[1] // time_step_for_example
        print("{}th data; shape: {}, num_split: {}".format(i, left_spec_concat.shape, num_split))

        # All but the last full segment are written as they are. The last one
        # absorbs the leftover frames. max() covers num_split == 0, where the
        # whole spectrogram becomes a single example.
        num_plain_segments = max(num_split - 1, 0)
        tail_start = num_plain_segments * time_step_for_example

        for channel, spec_concat in zip(channel_names,
                                        [left_spec_concat, right_spec_concat]):
          for split_index in range(num_plain_segments):
            seg_start = split_index * time_step_for_example
            seg_end = seg_start + time_step_for_example
            print("{}th track; {}th split".format(i, split_index))
            _write_spectrogram_example(
                tfrecord_writer,
                spec_concat[:frequency_bins, seg_start:seg_end],
                channel)

          # merge between last split part and residual part
          print("{}th track; {}th split".format(i, num_plain_segments))
          _write_spectrogram_example(
              tfrecord_writer,
              spec_concat[:frequency_bins, tail_start:],
              channel)

In [None]:
spectrogram_data_dir = './datasets/spectrogram'
# Take the settings from the selected split, so the shard count and the
# number of converted tracks cannot drift from `split_name`.
NUM_SHARDS = 20 if split_name == 'train' else 10 # for train: 20, for test: 10
N = len(tracks) # for train: 100, for test: 50
convert_dataset(split_name, spectrogram_data_dir, N, NUM_SHARDS)