# DeepSpeech Distances 

### Environment setup

In [None]:
%load_ext autoreload
%autoreload 2
import sys, os
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import audio_distance

# Directory the notebook is run from; every other path is derived from it.
PATH = os.path.abspath('')

# Scratch directory and model checkpoint directory, both directly under PATH.
WORKING_PATH = os.path.join(PATH, 'working')
CHECKPOINT_PATH = os.path.join(PATH, 'checkpoint')

# Input data, laid out as {conversion_name}/{audio_type}/*.wav
DATA_PATH = os.path.join(PATH, "Audio/")

# One entry per item found directly under DATA_PATH.
CONVERSIONS = os.listdir(DATA_PATH)

if not os.path.exists(os.path.join(PATH, 'checkpoint', 'ds2_large')):
  CKPT = os.path.join(PATH, 'ds2_large.tar.gz')
  URL = "https://github.com/davegabe/DeepSpeechDistances/releases/download/Model/ds2_large.tar.gz"
  !wget {URL} -O {CKPT}
  !tar -C {CHECKPOINT_PATH} -xvf {CKPT}
  !rm {CKPT}
else:
  print('Found checkpoint directory, skipping download.')

### Prepare data
>Copy the data to a working directory and split the largest Target recordings until there are enough samples.

In [None]:
import scipy.io.wavfile as wav

# Copy data to working directory.
# The working directory is wiped and recreated first, so it always starts as a
# fresh copy of DATA_PATH; the files under DATA_PATH itself are only read here.
!rm -rf {WORKING_PATH}
!mkdir -p {WORKING_PATH}
!cp -r {DATA_PATH}/* {WORKING_PATH}

def get_audio_by_length(audio_path):
    """List the entries of ``audio_path`` ordered from largest to smallest.

    The size of each file on disk, in bytes, is used as the measure of length.

    Args:
        audio_path: Directory whose entries are listed.

    Returns:
        A list of ``(file_name, size_in_bytes)`` tuples, largest first.
    """
    sized_entries = [
        (entry, os.path.getsize(os.path.join(audio_path, entry)))
        for entry in os.listdir(audio_path)
    ]
    sized_entries.sort(key=lambda pair: pair[1], reverse=True)
    return sized_entries

# Minimum number of Target files every conversion must end up with.
MIN_TARGET_SAMPLES = 64

# For each conversion
for conversion in CONVERSIONS:
    # Only the "Target" audio type is expanded. When it holds fewer files than
    # MIN_TARGET_SAMPLES, the largest file is repeatedly cut in half until
    # enough files exist.
    audio_type = "Target"
    audio_path = os.path.join(WORKING_PATH, conversion, audio_type)
    audio_lengths = get_audio_by_length(audio_path)
    if not audio_lengths:
        # Nothing to split: fail with a clear message instead of an IndexError.
        raise ValueError('No audio files found in ' + audio_path)

    while len(audio_lengths) < MIN_TARGET_SAMPLES:
        # The list is sorted largest first, so entry 0 is the file to split.
        audio_name, _ = audio_lengths[0]
        source_file = os.path.join(audio_path, audio_name)
        sample_rate, samples = wav.read(source_file)

        # Split the samples into two halves (the second half gets the extra
        # sample when the count is odd).
        half = len(samples) // 2

        # Name the halves "<stem>_1.wav" / "<stem>_2.wav" rather than stacking
        # a second extension onto the original file name.
        stem = os.path.splitext(source_file)[0]
        wav.write(stem + '_1.wav', sample_rate, samples[:half])
        wav.write(stem + '_2.wav', sample_rate, samples[half:])

        # The original is replaced by its two halves.
        os.remove(source_file)

        # Re-list so the next iteration sees the new files and sizes.
        audio_lengths = get_audio_by_length(audio_path)


### Calculate FDSD and KDSD
>Create evaluator object, load reference samples and calculate distance from other samples.

In [None]:
# Audio types (sub-directory names) that are compared against the Target set.
EVAL_AUDIO_TYPES = ("SWS_o5", "Noise", "Buzz")

dist_names = ['FDSD', 'KDSD']


def print_results(values):
    """Print every distance on one line as ``NAME = first (second)``.

    Args:
        values: One two-item sequence per entry of ``dist_names``, in the same
            order; both items are printed with five decimals.
    """
    print('\n' + ', '.join(['%s = %.5f (%.5f)' % (n, v[0], v[1]) for n, v
                            in zip(dist_names, values)]))


# Results of every conversion, keyed by conversion name. Each value is a list
# holding the reference distances first, then one entry per evaluated type.
distances_by_conversion = {}

# For each conversion
for conversion in CONVERSIONS:
    audio_path = os.path.join(WORKING_PATH, conversion)
    AUDIO_TYPES = os.listdir(audio_path)
    reference_path = os.path.join(audio_path, "Target/*.wav")
    eval_paths = [os.path.join(audio_path, audio_type, "*.wav") for audio_type in AUDIO_TYPES]

    # NOTE(review): the data-preparation cell expands Target to 64 files while
    # required_sample_size is 35 here -- confirm this is intended.
    evaluator = audio_distance.AudioDistance(
        load_path=os.path.join(CHECKPOINT_PATH, 'ds2_large', 'model.ckpt-54800'),
        meta_path=os.path.join(CHECKPOINT_PATH, 'collection-stripped-meta.meta'),
        required_sample_size=35,
        num_splits=1,
        do_conditional_dsds=False
    )

    evaluator.load_real_data(reference_path)

    with tf.compat.v1.Session(config=evaluator.sess_config) as sess:
        print('Computing reference DeepSpeech distances.')
        values = evaluator.get_distance(sess=sess)
        print_results(values)
        distances = [values]

        for audio_type, eval_path in zip(AUDIO_TYPES, eval_paths):
            # Filter on the directory name: eval_path is a full glob pattern
            # and would never equal one of the bare type names.
            if audio_type not in EVAL_AUDIO_TYPES:
                continue
            print('\nComputing DeepSpeech distances for files in the directory:\n'
                  + os.path.dirname(eval_path))
            values = evaluator.get_distance(sess=sess, files=eval_path)
            print_results(values)
            distances.append(values)

    distances_by_conversion[conversion] = distances