In [1]:
import torch
import torchaudio
import torchtext
import librosa
import spacy
import random
import math
import time
import json
import sys
import ipdb
import os
import subprocess
import logging
import string
import re
import fnmatch
import io
import IPython
import unicodedata

import scipy.signal

import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from collections import Counter
from scipy import spatial
from tqdm import tqdm
from typing import Tuple
from torchaudio.datasets.utils import walk_files
# from __future__ import print_function
from tempfile import NamedTemporaryFile
from torch import Tensor
from torch.autograd import Variable
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torch.distributed import get_rank
from torch.distributed import get_world_size
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler

# from stanfordcorenlp import StanfordCoreNLP

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Check hours of recording (New testament)

In [2]:
!pwd

/home/aims/big_data/2020-AMMI-salomon/notebooks/MT-ASR


In [11]:
import subprocess

def get_audio_length(repository):
    
    seconds = []
    for count, filename in enumerate(os.listdir(repository)): 
        seconds.append(float(subprocess.check_output(['soxi -D \"%s\"' % repository+filename.strip()], shell=True)))
    
    hours = sum(seconds)/(60*60)
    
    # ipdb.set_trace()    
    
    return f"We have a total of '{hours}' hours in the repository '{repository}'"


get_audio_length("/home/aims/big_data/2020-AMMI-salomon/data/external/mass-dataset-mod/dataset/wav_verse/")

"We have a total of '18.781306666666644' hours in the repository '/home/aims/big_data/2020-AMMI-salomon/data/external/mass-dataset-mod/dataset/wav_verse/'"

In [None]:
def load_audio(path):
    sound, _ = torchaudio.load(path, normalization=True)
    sound = sound.numpy().T
    
#     print(len(sound.shape))
    
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels, average
    return sound


def get_audio_length(path):
    output = subprocess.check_output(
        ['soxi -D \"%s\"' % path.strip()], shell=True)
    return float(output)

def audio_with_sox(path, sample_rate, start_time, end_time):
    """
    crop and resample the recording with sox and loads it.
    """
    with NamedTemporaryFile(suffix=".wav") as tar_file:
        tar_filename = tar_file.name
        sox_params = "sox \"{}\" -r {} -c 1 -b 16 -e si {} trim {} ={} >/dev/null 2>&1".format(
            path, sample_rate,
            tar_filename, start_time,
            end_time)
        
        os.system(sox_params)
        y = load_audio(tar_filename)
        return y

def augment_audio_with_sox(path, sample_rate, tempo, gain):
    """
    Changes tempo and gain of the recording with sox and loads it.
    """
    with NamedTemporaryFile(suffix=".wav") as augmented_file:
        augmented_filename = augmented_file.name
        sox_augment_params = ["tempo", "{:.3f}".format(
            tempo), "gain", "{:.3f}".format(gain)]
        sox_params = "sox \"{}\" -r {} -c 1 -b 16 -e si {} {} >/dev/null 2>&1".format(
            path, sample_rate, augmented_filename, " ".join(sox_augment_params))
        os.system(sox_params)
        y = load_audio(augmented_filename)
        return y


def load_randomly_augmented_audio(path, sample_rate=16000, tempo_range=(0.85, 1.15), 
                                  gain_range=(-6, 8)):
    """
    Picks tempo and gain uniformly, applies it to the utterance by using sox utility.
    Returns the augmented utterance.
    """
    low_tempo, high_tempo = tempo_range
    tempo_value = np.random.uniform(low=low_tempo, high=high_tempo)
    low_gain, high_gain = gain_range
    gain_value = np.random.uniform(low=low_gain, high=high_gain)
    audio = augment_audio_with_sox(path=path, sample_rate=sample_rate,
                                   tempo=tempo_value, gain=gain_value)
    return audio

## Data Loader

In [None]:
windows = {'hamming': scipy.signal.hamming, 'hann': scipy.signal.hann, 
           'blackman': scipy.signal.blackman, 'bartlett': scipy.signal.bartlett}


class AudioParser(object):
    def parse_transcript(self, transcript_path):
        """
        :param transcript_path: Path where transcript is stored from the manifest file
        :return: Transcript in training/testing format
        """
        raise NotImplementedError

    def parse_audio(self, audio_path):
        """
        :param audio_path: Path where audio is stored from the manifest file
        :return: Audio in training/testing format
        """
        raise NotImplementedError


class SpectrogramParser(AudioParser):
    def __init__(self, audio_conf, normalize=False, augment=False):
        """
        Parses audio file into spectrogram with optional normalization and various augmentations
        :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
        :param normalize(default False):  Apply standard mean and deviation normalization to audio tensor
        :param augment(default False):  Apply random tempo and gain perturbations
        """
        super(SpectrogramParser, self).__init__()
        self.window_stride = audio_conf['window_stride']
        self.window_size = audio_conf['window_size']
        self.sample_rate = audio_conf['sample_rate']
        self.window = windows.get(audio_conf['window'], windows['hamming'])
        self.normalize = normalize
        self.augment = augment
        self.noiseInjector = NoiseInjection(audio_conf['noise_dir'], self.sample_rate,
                                            audio_conf['noise_levels']) if audio_conf.get(
            'noise_dir') is not None else None
        self.noise_prob = audio_conf.get('noise_prob')

    def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)

        if self.noiseInjector:
            logging.info("inject noise")
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)

        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)

        # Short-time Fourier transform (STFT)
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
                
        spect, phase = librosa.magphase(D)

        # S = log(S+1) = log1p(S)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)

        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect

    def parse_transcript(self, transcript_path):
        raise NotImplementedError


class SpectrogramDataset(Dataset, SpectrogramParser):
    def __init__(self, audio_conf, manifest_filepath_list, 
                 label2id, normalize=False, augment=False):
        """
        Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
        a comma. Each new line is a different sample. Example below:
        /path/to/audio.wav,/path/to/audio.txt
        ...
        :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
        :param manifest_filepath: Path to manifest csv as describe above
        :param labels: String containing all the possible characters to map to
        :param normalize (default False): Apply standard mean and deviation normalization to audio tensor
        :param augment (default False):  Apply random tempo and gain perturbations
        """
        self.max_size = 0
        self.ids_list = []
        for i in range(len(manifest_filepath_list)):
            manifest_filepath = manifest_filepath_list[i]
            with open(manifest_filepath) as f:
                ids = f.readlines()

            ids = [x.strip().split('\t') for x in ids]
            self.ids_list.append(ids)
            self.max_size = max(len(ids), self.max_size)

        self.manifest_filepath_list = manifest_filepath_list
        self.label2id = label2id
        super(SpectrogramDataset, self).__init__(
            audio_conf, normalize, augment)

    def __getitem__(self, index):
        random_id = random.randint(0, len(self.ids_list)-1)
        ids = self.ids_list[random_id]
        sample = ids[index % len(ids)]
        audio_path, transcript, translation = sample[0], sample[1], sample[2]
        
        # get the audio using Short-time Fourier transform (STFT)
        # librosa.stft up to "args.src_max_len"
        spect = self.parse_audio(audio_path)[:,:args.src_max_len] 
        
        transcript = self.parse_transcript(transcript)
        translation = self.parse_translation(translation)
        return spect, transcript, translation

    def parse_transcript(self, transcript_path):
#         with open(transcript_path, 'r', encoding='utf8') as transcript_file:
            # add start of sentense and end of sentence token
        transcript = args.SOS_CHAR + transcript_path.lower() +\
                        args.EOS_CHAR
            
        # return all index exept 0 (false), in this case
        # there will be no 0 in the list of index (due to filter)
#         transcript = list(
#             filter(None, [self.label2id.get(x) for x in list(transcript)]))
        return transcript
    
    def parse_translation(self, translation_path):
#         with open(transcript_path, 'r', encoding='utf8') as transcript_file:
            # add start of sentense and end of sentence token
        translation = args.SOS_CHAR + translation_path.lower() +\
                        args.EOS_CHAR
            
        # return all index exept 0 (false), in this case
        # there will be no 0 in the list of index (due to filter)
#         transcript = list(
#             filter(None, [self.label2id.get(x) for x in list(transcript)]))
        return translation

    def __len__(self):
        return self.max_size


class NoiseInjection(object):
    def __init__(self,
                 path=None,
                 sample_rate=16000,
                 noise_levels=(0, 0.5)):
        """
        Adds noise to an input signal with specific SNR. Higher the noise level, the more noise added.
        Modified code from https://github.com/willfrey/audio/blob/master/torchaudio/transforms.py
        """
        if not os.path.exists(path):
            print("Directory doesn't exist: {}".format(path))
            raise IOError
        self.paths = path is not None and librosa.util.find_files(path)
        self.sample_rate = sample_rate
        self.noise_levels = noise_levels

    def inject_noise(self, data):
        noise_path = np.random.choice(self.paths)
        noise_level = np.random.uniform(*self.noise_levels)
        return self.inject_noise_sample(data, noise_path, noise_level)

    def inject_noise_sample(self, data, noise_path, noise_level):
        noise_len = get_audio_length(noise_path)
        data_len = len(data) / self.sample_rate
        noise_start = np.random.rand() * (noise_len - data_len)
        noise_end = noise_start + data_len
        noise_dst = audio_with_sox(
            noise_path, self.sample_rate, noise_start, noise_end)
        assert len(data) == len(noise_dst)
        noise_energy = np.sqrt(noise_dst.dot(noise_dst) / noise_dst.size)
        data_energy = np.sqrt(data.dot(data) / data.size)
        data += noise_level * noise_dst * data_energy / noise_energy
        return data


def _collate_fn(batch):
    def func(p):
        return p[0].size(1)

    def func_tgt_1(p):
        return len(p[1])
    
    def func_tgt_2(p):
        return len(p[2])

    # descending sorted
    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)

    max_seq_len = max(batch, key=func)[0].size(1)
    freq_size = max(batch, key=func)[0].size(0)
    max_tgt_1_len = len(max(batch, key=func_tgt_1)[1])
    max_tgt_2_len = len(max(batch, key=func_tgt_2)[1])
    
    inputs = torch.zeros(len(batch), 1, freq_size, max_seq_len)
    input_sizes = torch.IntTensor(len(batch))
    input_percentages = torch.FloatTensor(len(batch))

    targets_1 = torch.zeros(len(batch), max_tgt_1_len).long()
    target_1_sizes = torch.IntTensor(len(batch))
    
    targets_2 = torch.zeros(len(batch), max_tgt_2_len).long()
    target_2_sizes = torch.IntTensor(len(batch))
    
    for x in range(len(batch)):
        sample = batch[x]
        input_data = sample[0]
        target_1 = sample[1]
        target_2 = sample[2]
        
        seq_length = input_data.size(1)
        input_sizes[x] = seq_length
        inputs[x][0].narrow(1, 0, seq_length).copy_(input_data)
        input_percentages[x] = seq_length / float(max_seq_len)
        target_1_sizes[x] = len(target_1)
        targets_1[x][:len(target_1)] = torch.IntTensor(target_1)
        
        target_2_sizes[x] = len(target_2)
        targets_2[x][:len(target_2)] = torch.IntTensor(target_2)

    return inputs, targets, input_percentages, input_sizes, target_sizes


class AudioDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        super(AudioDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = _collate_fn


class BucketingSampler(Sampler):
    def __init__(self, data_source, batch_size=1):
        """
        Samples batches assuming they are in order of size to batch similarly 
        sized samples together.
        """
        super(BucketingSampler, self).__init__(data_source)
        self.data_source = data_source
        ids = list(range(0, len(data_source)))
        self.bins = [ids[i:i + batch_size]
                     for i in range(0, len(ids), batch_size)]

    def __iter__(self):
        for ids in self.bins:
            np.random.shuffle(ids)
            yield ids

    def __len__(self):
        return len(self.bins)

    def shuffle(self, epoch):
        np.random.shuffle(self.bins)

In [100]:
import argparse

parser = argparse.ArgumentParser(description='ASR training')


# Train
parser.add_argument('--train-manifest-list', nargs='+', type=str)
parser.add_argument('--valid-manifest-list', nargs='+', type=str)
parser.add_argument('--test-manifest-list', nargs='+', type=str)
parser.add_argument('--lang-list', nargs='+', type=str)

parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--batch-size', default=20, type=int, help='Batch size for training') # 20
parser.add_argument('--num-workers', default=4, type=int, 
                    help='Number of workers used in data-loading')

parser.add_argument('--labels-path', default='labels.json', 
                    help='Contains all characters for transcription')
parser.add_argument('--label-smoothing', default=0.0, type=float, help='Label smoothing')

# Speech
parser.add_argument('--window-size', default=.02, type=float, 
                    help='Window size for spectrogram in seconds')
parser.add_argument('--window-stride', default=.01, type=float, 
                    help='Window stride for spectrogram in seconds')
parser.add_argument('--window', default='hamming', 
                    help='Window type for spectrogram generation')

parser.add_argument('--epochs', default=100, type=int, 
                    help='Number of training epochs') # 1000
parser.add_argument('--cuda', dest='cuda', action='store_true', 
                    help='Use cuda to train model')

parser.add_argument('--device-ids', default=None, nargs='+', type=int,
                    help='If using cuda, sets the GPU devices for the process')
parser.add_argument('--lr', '--learning-rate', default=3e-4, type=float, 
                    help='initial learning rate')

parser.add_argument('--save-every', default=5, type=int, 
                    help='Save model every certain number of epochs')
parser.add_argument('--save-folder', default='models/', 
                    help='Location to save epoch models')

parser.add_argument('--emb_trg_sharing', action='store_true', 
                    help='Share embedding weight source and target')
parser.add_argument('--feat_extractor', default='vgg_cnn', type=str, 
                    help='emb_cnn or vgg_cnn')

parser.add_argument('--verbose', action='store_true', 
                    help='Verbose')

parser.add_argument('--continue-from', default='', 
                    help='Continue from checkpoint model')
parser.add_argument('--augment', dest='augment', action='store_true', 
                    help='Use random tempo and gain perturbations.')
parser.add_argument('--noise-dir', default=None,
                    help='Directory to inject noise into audio. If default, noise Inject not added')
parser.add_argument('--noise-prob', default=0.4, 
                    help='Probability of noise being added per sample')
parser.add_argument('--noise-min', default=0.0,
                    help='Minimum noise level to sample from. (1.0 means all noise, not original signal)', type=float)
parser.add_argument('--noise-max', default=0.5,
                    help='Maximum noise levels to sample from. Maximum 1.0', type=float)

# Transformer
parser.add_argument('--num-layers', default=3, type=int, help='Number of layers')
parser.add_argument('--num-heads', default=5, type=int, help='Number of heads')
parser.add_argument('--dim-model', default=512, type=int, help='Model dimension')
parser.add_argument('--dim-key', default=64, type=int, help='Key dimension')
parser.add_argument('--dim-value', default=64, type=int, help='Value dimension')
parser.add_argument('--dim-input', default=161, type=int, help='Input dimension')
parser.add_argument('--dim-inner', default=1024, type=int, help='Inner dimension')
parser.add_argument('--dim-emb', default=512, type=int, help='Embedding dimension')

parser.add_argument('--src-max-len', default=4000, type=int, help='Source max length')
parser.add_argument('--tgt-max-len', default=1000, type=int, help='Target max length')

# Noam optimizer
parser.add_argument('--warmup', default=4000, type=int, help='Warmup')
parser.add_argument('--min-lr', default=1e-5, type=float, help='min lr')
parser.add_argument('--k-lr', default=1, type=float, help='factor lr')

# SGD optimizer
parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
parser.add_argument('--lr-anneal', default=1.1, type=float, help='lr anneal')

# Decoder search
parser.add_argument('--beam-search', action='store_true', help='Beam search')
parser.add_argument('--beam-width', default=3, type=int, help='Beam size')
parser.add_argument('--beam-nbest', default=5, type=int, help='Number of best sequences')
parser.add_argument('--lm-rescoring', action='store_true', help='Rescore using LM')
parser.add_argument('--lm-path', type=str, default="lm_model.pt", help="Path to LM model")
parser.add_argument('--lm-weight', default=0.1, type=float, help='LM weight')
parser.add_argument('--c-weight', default=0.1, type=float, help='Word count weight')
parser.add_argument('--prob-weight', default=1.0, type=float, help='Probability E2E weight')

# Loss
parser.add_argument('--loss', type=str, default='ce', help='ce or ctc')
parser.add_argument('--clip', action='store_true', help="clip")
parser.add_argument('--max-norm', default=400, type=float, help="max norm for clipping")

parser.add_argument('--dropout', default=0.1, type=float, help='Dropout')

# Parallelize model
parser.add_argument('--parallel', action='store_true', help='Parallelize the model')

# shuffle
parser.add_argument('--shuffle', action='store_true', help='Shuffle')

# PAD_CHAR, SOS_CHAR, EOS_CHAR
parser.add_argument('--PAD_CHAR', default="¶", type=str, help='PAD_CHAR')
parser.add_argument('--SOS_CHAR', default="§", type=str, help='SOS_CHAR')
parser.add_argument('--EOS_CHAR', default="¤", type=str, help='EOS_CHAR')
parser.add_argument('--PAD_TOKEN', default=0, type=int, help='PAD_TOKEN')
parser.add_argument('--SOS_TOKEN', default=1, type=int, help='SOS_TOKEN')
parser.add_argument('--EOS_TOKEN', default=2, type=int, help='EOS_TOKEN')


torch.manual_seed(123456)
torch.cuda.manual_seed_all(123456)

# https://github.com/spyder-ide/spyder/issues/3883
import sys
sys.argv=['']; del sys 

args = parser.parse_args()
USE_CUDA = args.cuda

# PAD_TOKEN = 0
# SOS_TOKEN = 1
# EOS_TOKEN = 2

# PAD_CHAR = "¶"
# SOS_CHAR = "§"
# EOS_CHAR = "¤"

args.train_manifest_list = ['data/manifests/libri_train_manifest.csv']
args.valid_manifest_list = ['data/manifests/libri_val_manifest.csv']
args.test_manifest_list = ['data/manifests/libri_test_manifest.csv']

args.batch_size = 6
args.labels_path = 'data/labels/labels.json'
args.lr = 1e-4
args.name = 'libri_drop0.1_cnn_batch12_6_vgg_layer_notebook'
args.save_folder = 'save/'
args.save_every = 10
args.feat_extractor = 'vgg_cnn'
args.dropout = 0.1
args.num_layers = 2
args.num_heads = 6
args.dim_model = 512
args.dim_key = 64
args.dim_value = 64
args.dim_input = 161
args.dim_inner = 2048
args.dim_emb = 512
args.shuffle = True
args.min_lr = 1e-6
args.k_lr = 1
args.target_dir = '../../../../big_data/end2end-asr-pytorch/LibriSpeech_dataset/'
args.sample_rate = 1600 #16000

args.window_size = 0.02
args.window_stride = 0.01 #0.01
args.window = 'hamming'
# args.noise_dir = 
args.noise_prob = 0.4
args.noise_min = 0.0
args.noise_max = 0.5
args.cuda = torch.cuda.is_available()
args.epochs = 1000
args.continue_from = 'save/libri_drop0.1_cnn_batch12_6_vgg_layer_notebook/best_model.th'

In [None]:
train_data = SpectrogramDataset(
    audio_conf, manifest_filepath_list=args.train_manifest_list,
    label2id=label2id, normalize=True, augment=args.augment)

In [97]:
# from torch.utils.data import Dataset
# from torchaudio.datasets.utils import (
#   download_url,
#   extract_archive,
#   walk_files,
# )

# URL = "train-clean-100"
# FOLDER_IN_ARCHIVE = "LibriSpeech"
# BASE_URL = "https://dl.fbaipublicfiles.com/librispeech_100h_mp3/"
# _CHECKSUMS = {
#   BASE_URL + "dev-clean.tar.gz":
#   "076916a8f9c61951c5d2e6efaa8d2188232fcf860eec8c074e46edf4fac9623e",
#   BASE_URL + "test-clean.tar.gz":
#   "3c171e2f1e377e4993c2dbe6bff3f01cd324c0ed462f4de6c78737402a7dbedd",
#   BASE_URL + "train-clean-100.tar.gz":
#   "7bfbefc680d25ba3a82798ce32c287ea0e82932af1b1f864fae71fb52d2f41f0",
# }

# "nameid_start_end" format

# 308660 number of audio file in train-segment

# big_data/2020-AMMI-salomon/data/external/TED_Speech_Translation/train.en-fr

def load_data_item(fileid: str, 
                   path: str, 
                   ext_audio: str, 
                   ext_txt: str) -> Tuple[Tensor, int, str, str, int, int, int]:
    
    temp, end = fileid.split("-")
    nameid, start = temp.split("_")
    
#     file_text = speaker_id + "_" + chapter_id + ext_txt
    file_text = os.path.join(path.split("train")[0], "train"+ext_txt)
    
    fileid_audio = nameid + "_" + start + "-" + end
    file_audio = fileid_audio + ext_audio
    file_audio = os.path.join(path, file_audio)
    # Load audio
    waveform, sample_rate = torchaudio.load(file_audio)
    
    # Load text
    with open(file_text) as ft:
        for line in ft:
            
            fileid_text, trascription, translation = line.strip().split("\t")
            fileid_text = fileid_text.split('.')[0]
            ipdb.set_trace()
            if fileid_audio == fileid_text:
                break
        else:
          # Transcription or/and Translation not found
          raise FileNotFoundError("Transcription or/and Translation not found for " + fileid_audio)
                
    return (
        waveform,
        sample_rate,
        trascription,
        translation,
        int(nameid),
        int(start),
        int(end)
        )


class TED_Dataset(Dataset):
    """
    Create a Dataset for TED dataset. Each item is a tuple of the form:
    waveform, sample_rate, trascription, translation, nameid, start, end
    """
    
    _ext_txt = ".en-fr"
    _ext_audio = ".wav"
    
    def __init__(self,
                 root: str,
                 folder: str) -> None:
        
        self._path = os.path.join(root, folder)
        
        walker = walk_files(
          self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
        )
        
        self._walker = list(walker)
        
    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        fileid = self._walker[n]
#         ipdb.set_trace()
        return load_data_item(fileid, self._path, self._ext_audio, self._ext_txt)
    
    def __len__(self) -> int:
        return len(self._walker)

In [98]:
train = TED_Dataset("../../data/external/TED_Speech_Translation/", "train-segment")
# train = TED_Dataset("big_data/2020-AMMI-salomon/data/external/TED_Speech_Translation/",
#                     "train-segment")



In [99]:
print(next(iter(train)))


> [0;32m<ipython-input-97-83eb5df18d46>[0m(50)[0;36mload_data_item[0;34m()[0m
[0;32m     49 [0;31m            [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 50 [0;31m            [0;32mif[0m [0mfileid_audio[0m [0;34m==[0m [0mfileid_text[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     51 [0;31m                [0;32mbreak[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  fileid_audio


'93268_0139110-0140457'


ipdb>  fileid_text


'100230_0000442-0022222'
--KeyboardInterrupt--


ipdb>  q


BdbQuit: 

In [None]:
a,  b = torchaudio.load('../../data/external/TED_Speech_Translation/train-segment/93268_0139110-0140457.wav')