In [4]:
%load_ext autoreload
%autoreload 2
import sys, os
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import audio_distance

PATH = './'
DATA_PATH= "../conversions_1800ep" # DATA_PATH is a path with following structure /{conversion_name}/{auido_type}/*wav
CONVERSIONS = os.listdir(DATA_PATH)
WORKING_PATH = os.path.join(PATH, 'working')
CHECKPOINT_PATH = os.path.join(PATH, 'checkpoint')
NUM_SPLITS = 3 # 3  # number of data splits to comute std of DSD
SAMPLES_PER_SPLIT = 500 # 500  # number of samples in a single DSD run. 
# We recommend at least 10k samples for evaluation to get reasonable estimates.
AUDIO_LENGTH = 2  # length of individual sample, in seconds
NUM_NOISE_LEVELS = 3  # number of different noise levels for samples to evaluate

if not os.path.exists(os.path.join(PATH, 'checkpoint', 'ds2_large')):
  CKPT = os.path.join(PATH, 'ds2_large.tar.gz')
  URL = "https://github.com/davegabe/DeepSpeechDistances/releases/download/Model/ds2_large.tar.gz"
  !wget {URL} -O {CKPT}
  !tar -C {CHECKPOINT_PATH} -xvf {CKPT}
  !rm {CKPT}
else:
  print('Found checkpoint directory, skipping download.')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Found checkpoint directory, skipping download.


Copy data and subsample what's needed

In [5]:
import scipy.io.wavfile as wav

# Copy data to working directory
!rm -rf {WORKING_PATH}
!mkdir -p {WORKING_PATH}
!cp -r {DATA_PATH}/* {WORKING_PATH}

def get_audio_by_length(audio_path):
    """Sorts data by length, longest first."""
    audio_lengths = []
    for audio in os.listdir(audio_path):
        audio_lengths.append((audio, os.path.getsize(os.path.join(audio_path, audio))))
    return sorted(audio_lengths, key=lambda x: x[1], reverse=True)

# For each conversion
for conversion in CONVERSIONS:
    AUDIO_TYPES = os.listdir(os.path.join(DATA_PATH, conversion))
    # There are 35 audio files for audio type, but we need 64 samples per each audio type so we need to subsample the audio files.
    
    # for audio_type in AUDIO_TYPES:
    audio_type = "Target"
    audio_path = os.path.join(WORKING_PATH, conversion, audio_type)
    audio_lengths = get_audio_by_length(audio_path)
    while len(audio_lengths) < 64:
        # Take first audio
        audio_name, audio_length = audio_lengths[0]
        audio = wav.read(os.path.join(audio_path, audio_name))

        # Split audio into 2
        audio1 = audio[1][:int(len(audio[1])/2)]
        audio2 = audio[1][int(len(audio[1])/2):]

        # Write audio to file
        wav.write(os.path.join(audio_path, audio_name + '_1.wav'), audio[0], audio1)
        wav.write(os.path.join(audio_path, audio_name + '_2.wav'), audio[0], audio2)

        # Remove original audio
        os.remove(os.path.join(audio_path, audio_name))

        # Get audio by length
        audio_lengths = get_audio_by_length(audio_path)  


Create evaluator object and load reference samples.

In [6]:
# For each conversion
for conversion in CONVERSIONS:
  audio_path = os.path.join(WORKING_PATH, conversion)
  AUDIO_TYPES = os.listdir(audio_path)
  reference_path = os.path.join(audio_path, "Target/*.wav")
  eval_paths = [ os.path.join(audio_path, audio_type, "*.wav") for audio_type in AUDIO_TYPES ]

  # print("Eval path: ", eval_paths)

  evaluator = audio_distance.AudioDistance(
      load_path=os.path.join(CHECKPOINT_PATH, 'ds2_large', 'model.ckpt-54800'),
      meta_path=os.path.join(CHECKPOINT_PATH, 'collection-stripped-meta.meta'),
      required_sample_size=35,    # NUM_SPLITS * SAMPLES_PER_SPLIT
      num_splits=1,   # NUM_SPLITS
      do_conditional_dsds=False
  )

  evaluator.load_real_data(reference_path)

  dist_names = ['FDSD','KDSD']
  def print_results(values):
    print('\n' + ', '.join(['%s = %.5f (%.5f)' % (n, v[0], v[1]) for n, v 
                            in zip(dist_names, values)]))

  distances = []
  with tf.compat.v1.Session(config=evaluator.sess_config) as sess:
    print('Computing reference DeepSpeech distances.')
    values = evaluator.get_distance(sess=sess)
    print_results(values)
    distances = [values]

    for eval_path in eval_paths:
      if eval_path in ["Target", "SWS", "SWS_praat", "SWS_o5_clean"]:
        continue
      print('\nComputing DeepSpeech distances for files in the directory:\n'
            + os.path.dirname(eval_path))
      values = evaluator.get_distance(sess=sess, files=eval_path)
      print_results(values)
      distances.append(values)

Computing reference DeepSpeech distances.
INFO:tensorflow:Restoring parameters from ./checkpoint/ds2_large/model.ckpt-54800
Checkpoint restored.


Extracting DeepSpeech features from reference samples: 100%|██████████| 64/64 [00:06<00:00, 10.50it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 6.098910808563232


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:02<00:00, 11.11it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 2.885369300842285
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.98692893981934
AudioDistance: finished evaluation.

FDSD = 9.02327 (0.00000), KDSD = -0.00002 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TF1/Buzz


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.35it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.985083818435669
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 85.79199266433716
AudioDistance: finished evaluation.

FDSD = 25.30785 (0.00000), KDSD = 0.03063 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TF1/Noise


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.98it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.353400468826294
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.77531909942627
AudioDistance: finished evaluation.

FDSD = 26.51990 (0.00000), KDSD = 0.03269 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TF1/Paper


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.42it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.911280393600464
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 86.9420416355133
AudioDistance: finished evaluation.

FDSD = 20.52126 (0.00000), KDSD = 0.02074 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TF1/SWS_o5


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.72it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.600004196166992
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 87.1363115310669
AudioDistance: finished evaluation.

FDSD = 27.45022 (0.00000), KDSD = 0.03513 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TF1/Target


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:02<00:00, 11.45it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 2.798356771469116
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 86.81512022018433
AudioDistance: finished evaluation.

FDSD = 9.02327 (0.00000), KDSD = -0.00002 (nan)
Computing reference DeepSpeech distances.
INFO:tensorflow:Restoring parameters from ./checkpoint/ds2_large/model.ckpt-54800
Checkpoint restored.


Extracting DeepSpeech features from reference samples: 100%|██████████| 64/64 [00:05<00:00, 11.08it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.779914379119873


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:02<00:00, 11.20it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 2.8613126277923584
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.58206081390381
AudioDistance: finished evaluation.

FDSD = 9.61608 (0.00000), KDSD = -0.00007 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TM1/Buzz


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:06<00:00,  5.22it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 6.127667188644409
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.17156744003296
AudioDistance: finished evaluation.

FDSD = 20.22243 (0.00000), KDSD = 0.02155 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TM1/Noise


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:06<00:00,  4.72it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 6.779855251312256
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 86.70368003845215
AudioDistance: finished evaluation.

FDSD = 21.04610 (0.00000), KDSD = 0.02247 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TM1/Paper


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.35it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.985130071640015
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 86.67691206932068
AudioDistance: finished evaluation.

FDSD = 18.13587 (0.00000), KDSD = 0.01755 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TM1/SWS_o5


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:06<00:00,  5.29it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 6.055692434310913
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 89.29941415786743
AudioDistance: finished evaluation.

FDSD = 22.64307 (0.00000), KDSD = 0.02621 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SF3_TM1/Target


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:02<00:00, 11.22it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 2.855961322784424
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 87.44781112670898
AudioDistance: finished evaluation.

FDSD = 9.61608 (0.00000), KDSD = -0.00007 (nan)
Computing reference DeepSpeech distances.
INFO:tensorflow:Restoring parameters from ./checkpoint/ds2_large/model.ckpt-54800
Checkpoint restored.


Extracting DeepSpeech features from reference samples: 100%|██████████| 64/64 [00:07<00:00,  9.01it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 7.105317115783691


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:03<00:00,  9.41it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 3.4035117626190186
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 86.77005362510681
AudioDistance: finished evaluation.

FDSD = 9.02327 (0.00000), KDSD = -0.00002 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TF1/Buzz


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:07<00:00,  4.31it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 7.4303553104400635
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.89656805992126
AudioDistance: finished evaluation.

FDSD = 26.13038 (0.00000), KDSD = 0.03169 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TF1/Noise


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:07<00:00,  4.49it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 7.126528024673462
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 87.96030688285828
AudioDistance: finished evaluation.

FDSD = 27.74623 (0.00000), KDSD = 0.03446 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TF1/Paper


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:07<00:00,  4.46it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 7.178345680236816
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 87.47462630271912
AudioDistance: finished evaluation.

FDSD = 23.85204 (0.00000), KDSD = 0.02770 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TF1/SWS_o5


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:06<00:00,  4.65it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 6.87777042388916
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.43586707115173
AudioDistance: finished evaluation.

FDSD = 34.67243 (0.00000), KDSD = 0.04802 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TF1/Target


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:03<00:00, 10.18it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 3.147843599319458
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 86.70801210403442
AudioDistance: finished evaluation.

FDSD = 9.02327 (0.00000), KDSD = -0.00002 (nan)
Computing reference DeepSpeech distances.
INFO:tensorflow:Restoring parameters from ./checkpoint/ds2_large/model.ckpt-54800
Checkpoint restored.


Extracting DeepSpeech features from reference samples: 100%|██████████| 64/64 [00:05<00:00, 11.01it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.817717790603638


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:02<00:00, 12.19it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 2.6294171810150146
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 87.6862096786499
AudioDistance: finished evaluation.

FDSD = 9.61608 (0.00000), KDSD = -0.00007 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TM1/Buzz


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.71it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.606923818588257
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 87.35962748527527
AudioDistance: finished evaluation.

FDSD = 18.96223 (0.00000), KDSD = 0.01823 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TM1/Noise


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.69it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.628041505813599
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 84.3254086971283
AudioDistance: finished evaluation.

FDSD = 20.18066 (0.00000), KDSD = 0.02032 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TM1/Paper


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.75it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.572564363479614
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 88.27598071098328
AudioDistance: finished evaluation.

FDSD = 17.83219 (0.00000), KDSD = 0.01583 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TM1/SWS_o5


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:05<00:00,  5.68it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 5.63887095451355
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 85.07477688789368
AudioDistance: finished evaluation.

FDSD = 22.49038 (0.00000), KDSD = 0.02492 (nan)

Computing DeepSpeech distances for files in the directory:
./working/SM3_TM1/Target


Extracting DeepSpeech features from samples to evaluate: 100%|██████████| 32/32 [00:02<00:00, 13.09it/s]


DeepSpeech2: finished evaluating features, total time%.1fs 2.447192430496216
AudioDistance: got features from both samples, computing metrics...
AudioDistance: computed metrics from features in %.1fs. 83.28373789787292
AudioDistance: finished evaluation.

FDSD = 9.61608 (0.00000), KDSD = -0.00007 (nan)
