# KaldiTorch notebook

#### Packages

In [10]:
import torch
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torchsummary import summary

from models import DWaveNet
from utils import AudioDataset, DenoiseDataset

import os
import math
import time

import numpy as np
import kaldi_io
import librosa
from tqdm.notebook import tqdm


In [2]:
# Instantiate the training and evaluation splits one at a time.
train_dataset = AudioDataset('train')
test_dataset = AudioDataset('test')

# Cell output: number of examples in each split.
len(train_dataset), len(test_dataset)

(82485, 8913)

In [3]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 8

# Only the training split is shuffled: shuffling the evaluation split does not
# change the averaged loss, it only makes the evaluation order non-reproducible.
train_data = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_data = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [4]:
# Build the DWaveNet model: one stack of ten layers, with 56 channels on the
# residual, gate and skip-out paths.
model = DWaveNet(
    in_channels=1,
    num_layers=10,
    num_stacks=1,
    residual_channels=56,
    gate_channels=56,
    skip_out_channels=56,
    last_channels=(512, 128),
)

In [5]:
# Print a layer-by-layer summary on the CPU; the input size is taken from the
# first element of the first training example.
summary(model, train_dataset[0][0].shape, device = 'cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1            [-1, 56, 16384]             168
            Conv1d-2            [-1, 56, 16384]           9,408
            Conv1d-3            [-1, 56, 16384]           1,568
            Conv1d-4            [-1, 56, 16384]           1,568
 ResidualConv1dGLU-5         [-1, 2, 56, 16384]               0
            Conv1d-6            [-1, 56, 16384]           9,408
            Conv1d-7            [-1, 56, 16384]           1,568
            Conv1d-8            [-1, 56, 16384]           1,568
 ResidualConv1dGLU-9         [-1, 2, 56, 16384]               0
           Conv1d-10            [-1, 56, 16384]           9,408
           Conv1d-11            [-1, 56, 16384]           1,568
           Conv1d-12            [-1, 56, 16384]           1,568
ResidualConv1dGLU-13         [-1, 2, 56, 16384]               0
           Conv1d-14            [-1, 56

In [30]:
# os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'

# device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
#model = nn.DataParallel(model)

In [8]:
# Move the model's parameters and buffers to the device selected above.
model = model.to(device)

In [9]:
# Two regression criteria (mean absolute error and mean squared error).
criterion1 = torch.nn.L1Loss()
criterion2 = torch.nn.MSELoss()

# Adam with its default hyper-parameters over every model parameter.
optimizer = torch.optim.Adam(model.parameters())

In [16]:
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path=None):
    """Train `model` and print the training/validation loss after every epoch.

    Args:
        n_epochs: Number of passes over ``loaders['train']``.
        loaders: Mapping with ``'train'`` and ``'valid'`` entries; each is an
            iterable of ``(data, target)`` batches that exposes ``.dataset``.
        model: Module to optimise; it is updated in place.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Callable ``criterion(output, target)`` returning a scalar loss.
        use_cuda: When true, every batch is moved to the GPU with ``.cuda()``.
        save_path: Optional checkpoint path. When given, ``model.state_dict()``
            is written there each time the validation loss does not increase.

    Returns:
        The trained model (the same object that was passed in).
    """
    valid_loss_min = float('inf')

    for epoch in range(1, n_epochs + 1):
        # Running sums of (batch loss * batch size). Dividing by the dataset
        # size below gives a per-sample average, assuming `criterion` returns
        # the mean over the batch.
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        # Wrapping the loader itself (not enumerate(...)) lets tqdm see its length.
        for data, target in tqdm(loaders['train']):
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * data.size(0)
            # Drop the references so the tensors can be freed before the next batch.
            del data, target, output, loss

        ######################
        # validate the model #
        ######################
        model.eval()
        # No gradients are needed for validation; without no_grad() every
        # forward pass would keep its autograd graph alive and waste memory.
        with torch.no_grad():
            for data, target in tqdm(loaders['valid']):
                if use_cuda:
                    data, target = data.cuda(), target.cuda()

                output = model(data)
                loss = criterion(output, target)

                valid_loss += loss.item() * data.size(0)
                del data, target, output, loss

        train_loss = train_loss / len(loaders['train'].dataset)
        valid_loss = valid_loss / len(loaders['valid'].dataset)
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss))

        # Keep the best weights seen so far, if a checkpoint path was requested.
        if valid_loss <= valid_loss_min:
            if save_path is not None:
                print("Validation loss decreased from {} ---------> {} Saving model...".format(
                    valid_loss_min, valid_loss))
                torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss

    return model


In [13]:
#log_dir = "log/"
def train(n_epochs, loaders, model, optimizer, criterions, use_cuda, batch_verbose):
    """Train `model` on the sum of two criteria and report a validation loss per epoch.

    Args:
        n_epochs: Number of passes over ``loaders['train']``.
        loaders: Mapping with ``'train'`` and ``'valid'`` DataLoaders.
        model: Module to optimise; it is updated in place.
        optimizer: Optimizer already bound to the model's parameters.
        criterions: Sequence whose first two entries are loss callables; the
            objective is ``criterions[0](out, y) + criterions[1](out, y)``.
        use_cuda: When true, each batch is moved to the device that holds the
            model's parameters (no notebook-level ``device`` global is needed).
        batch_verbose: Print the running loss every ``batch_verbose`` batches;
            ``0`` (or ``None``) disables per-batch logging.

    Returns:
        The trained model (the same object that was passed in).
    """
    # Resolve the target device once instead of reading a global variable.
    target_device = next(model.parameters()).device if use_cuda else None
    n_train_batches = len(loaders['train'])
    n_train_samples = len(loaders['train'].dataset)
    n_valid_samples = len(loaders['valid'].dataset)

    for epoch in range(1, n_epochs + 1):
        print('Epoch %d' % epoch)
        start = time.time()

        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):
            if use_cuda:
                data, target = data.to(target_device), target.to(target_device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterions[0](output, target) + criterions[1](output, target)
            loss.backward()
            optimizer.step()
            # The truthiness guard avoids a ZeroDivisionError when logging is disabled.
            if batch_verbose and batch_idx % batch_verbose == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), n_train_samples,
                    100. * batch_idx / n_train_batches, loss.item()))
            # Drop the references so the tensors can be freed before the next batch.
            del data, target, output, loss

        print('Train epoch {} completed in {:.3f} minutes'.format(epoch, (time.time()-start)/60))

        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for data, target in loaders['valid']:
                if use_cuda:
                    data, target = data.to(target_device), target.to(target_device)
                output = model(data)
                batch_loss = criterions[0](output, target) + criterions[1](output, target)
                # Weight each batch by its size so that dividing by the dataset
                # size yields a per-sample mean (assuming mean-reducing criteria).
                # Summing raw batch means would shrink the result by roughly
                # the batch size.
                test_loss += batch_loss.item() * data.size(0)
                del data, target, output, batch_loss

        test_loss /= n_valid_samples
        print('\nTest set: Average loss: {:.4f}\n'.format(test_loss))

        # TODO: checkpoint the model whenever the validation loss improves.
    return model

In [11]:
# Map the split names used by train() to their DataLoaders.
loaders = dict(train=train_data, valid=test_data)

In [14]:
# Train for three epochs on both criteria, logging every 10th batch.
train(
    n_epochs=3,
    loaders=loaders,
    model=model,
    optimizer=optimizer,
    criterions=[criterion1, criterion2],
    use_cuda=True,
    batch_verbose=10,
)

Epoch 1


RuntimeError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 11.17 GiB total capacity; 877.57 MiB already allocated; 6.19 MiB free; 906.00 MiB reserved in total by PyTorch)

<hr>

In [1]:
import os

In [2]:
# Point to the local Kaldi installation unless the environment already does,
# so the notebook also runs on machines where KALDI_ROOT is configured elsewhere.
# NOTE(review): presumably this must run before `import kaldi_io` — confirm.
os.environ.setdefault('KALDI_ROOT', '/home/faber6911/kaldi/')

In [3]:
import torchaudio
import torch
import numpy as np
import matplotlib.pyplot as plt
import kaldi_io
from scipy.io import wavfile
import librosa.display as display
from torch.utils.data import DataLoader
import pandas as pd
from tqdm.autonotebook import tqdm
from sklearn.preprocessing import LabelEncoder
import torchvision
import re

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


#### Classes and functions

In [4]:
# class FeaturesDataset(torch.utils.data.Dataset):
#     """Features extracted from Kaldi"""

#     def __init__(self, scp_file, transform = None):
#         """
#         Args:
#             scp_file (string): Path to the scp file.
#             transform (callable, optional): Optional transform to be applied
#                 on a sample.
#         """
#         le = LabelEncoder()
#         self.features = pd.read_csv(scp_file, sep = " ", header = None, names = ['long_label', 'path'])
#         self.features['label'] = self.features.long_label
#         for index, row in self.features.iterrows():
#             row.label = row.label.split('-')[0]
        
#         self.features['label_enc'] = le.fit_transform(self.features.label)
#         self.transform = transform

#     def __len__(self):
#         return len(self.features)

#     def __getitem__(self, idx):
#         if torch.is_tensor(idx):
#             idx = idx.tolist()

#         feature_path = self.features.iloc[idx, 1]
#         label = self.features.iloc[idx, 3]
#         feature = kaldi_io.read_mat(feature_path)

#         if self.transform:
#             feature = self.transform(feature)

#         return (feature, label)
    
#     def num_speakers(self):
#         return len(np.unique(np.asarray(self.features.label)))
    
#     def get_wav(self, idx):
#         absPath = '/export/corpora/VoxCeleb1/dev/wav/'
#         relPath = str(self.features.iloc[idx, 0])
#         relPath = re.sub('-', '/', relPath)
#         relPath = relPath + '.wav'
#         completePath = os.path.join(absPath, relPath)
#         fs, audio = wavfile.read(completePath)
#         return (fs, audio.astype('float'))
        
# def CountSpeaker(path, require_labels = False):
#     assert isinstance(path, str)
#     import torchaudio
    
#     labels = []
#     for label, _ in tqdm(torchaudio.kaldi_io.read_mat_scp(path)):
#         labels.append(label.split('-')[0])
    
#     labels = np.asarray(labels)
#     if require_labels:
#         print(len(np.unique(labels)))
#         return np.unique(labels)
#     else:
#         print("{} distinct speakers".format(len(np.unique(labels))))

        
# def LabelToPath(label, dataset):
#     assert dataset in ['train', 'test']
#     assert isinstance(label, str)
#     import os
#     import re
#     import errno
    
#     # VoxCeleb abs path
#     if dataset == 'train':
#         abs_path = '/export/corpora/VoxCeleb1/dev/wav'
#     else:
#         abs_path = '/export/corpora/VoxCeleb1/test/wav'
#     # substitute - with /
#     rel_path = re.sub('-', '/', label)
#     # add extension
#     rel_path = rel_path+'.wav'
#     complete_path = os.path.join(abs_path, rel_path)
#     if not os.path.exists(complete_path):
#         raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), complete_path)
    
#     return os.path.join(abs_path, rel_path)


# def DisplayAudio(audio, fs):
#     assert isinstance(fs, int)
#     assert isinstance(audio, np.ndarray)
#     plt.figure(figsize = (15, 10))
#     plt.subplot(2, 1, 1)
#     display.waveplot(audio, sr = fs)
#     plt.subplot(2, 1, 2)
#     plt.specgram(audio, Fs = fs)[3]

In [5]:
class SequenceDataset(torch.utils.data.Dataset):
    """PyTorch dataset for processing 'uncompressed' Kaldi feats.scp.

    Each item is a random, fixed-length run of consecutive rows taken from
    one utterance's feature matrix, paired with the utterance's speaker id.
    """
    def __init__(self, scp_file, utt2spkid_file, min_length):
        """Preprocess Kaldi feats.scp here and balance the training set.

        Args:
            scp_file: Text file with one ``<utt-id> <rxfile>`` pair per line.
            utt2spkid_file: Text file with one ``<utt-id> <integer-label>``
                pair per line.
            min_length: Initial chunk length, in matrix rows.

        Raises:
            KeyError: If an utterance in ``scp_file`` has no entry in
                ``utt2spkid_file``.
            ValueError: If ``utt2spkid_file`` contains no entries or a line
                does not have exactly two fields.
        """
        self.rxfiles, self.labels, self.utt2spkid = [], [], {}

        # balanced training: count how many utterances each label has
        id_count = {}
        with open(utt2spkid_file) as spk_map:
            for line in spk_map:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                utt, label = fields
                label = int(label)
                self.utt2spkid[utt] = label
                id_count[label] = id_count.get(label, 0) + 1
        # Half of the largest class size, rounded up.
        max_id_count = (max(id_count.values()) + 1) // 2

        with open(scp_file) as scp:
            for line in scp:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                utt, rxfile = fields
                label = self.utt2spkid[utt]
                # Small classes are oversampled by repeating their entries;
                # no entry is ever dropped.
                repetition = max(1, max_id_count // id_count[label])
                self.rxfiles.extend([rxfile] * repetition)
                self.labels.extend([label] * repetition)

        self.rxfiles = np.array(self.rxfiles)
        # np.int was removed from NumPy; use an explicit 64-bit integer dtype.
        self.labels = np.array(self.labels, dtype=np.int64)
        self.seq_len = min_length
        print("Totally "+str(len(self.rxfiles))+" samples with at most "+
            str(max_id_count)+" samples for one class")

    def __len__(self):
        """Return number of samples (after balancing).
        """
        return len(self.labels)

    def update(self, seq_len):
        """Update the self.seq_len. We call this in the main training loop
        once per training iteration.
        """
        self.seq_len = seq_len

    def __getitem__(self, index):
        """Generate samples.

        Returns:
            ``(chunk_mat, y)``: ``self.seq_len`` consecutive rows starting at
            a random offset of the matrix read from ``self.rxfiles[index]``,
            and the label as a 0-d NumPy array.
        """
        rxfile = self.rxfiles[index]
        full_mat = kaldi_io.read_mat(rxfile)
        assert len(full_mat) >= self.seq_len, \
            "{} has {} rows, fewer than seq_len={}".format(rxfile, len(full_mat), self.seq_len)
        # Random start offset such that the chunk fits inside the matrix.
        pin = np.random.randint(0, len(full_mat) - self.seq_len + 1)
        chunk_mat = full_mat[pin:pin+self.seq_len, :]
        y = np.array(self.labels[index])

        return chunk_mat, y


In [4]:
import librosa

In [29]:
class DenoiseDataset(torch.utils.data.Dataset):
    """PyTorch dataset for processing 'uncompressed' Kaldi feats.scp. It returns noisy audio
    with their corresponding clean audio file.
    """
    def __init__(self, scp_file, min_length, sample_rate=16000):
        """Preprocess Kaldi feats.scp here.

        Args:
            scp_file: Text file with one ``<clean-audio-path> <noisy-audio-path>``
                pair per line.
            min_length: Chunk duration in seconds; it is converted to a whole
                number of samples using ``sample_rate``.
            sample_rate: Sampling rate (Hz) used for the conversion above and
                when loading the clean audio. Defaults to 16000.
        """
        self.noisy_audios, self.clean_audios = [], []

        with open(scp_file) as scp:
            for line in scp:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                clean_audio_path, noisy_audio_path = fields
                self.noisy_audios.append(noisy_audio_path)
                self.clean_audios.append(clean_audio_path)

        self.noisy_audios = np.array(self.noisy_audios)
        self.clean_audios = np.array(self.clean_audios)
        self.sample_rate = sample_rate
        # Must be an int: a float (e.g. 4.0 * 16000) cannot be used as a slice bound.
        self.seq_len = int(round(min_length * sample_rate))
        print("Totally "+str(len(self.noisy_audios))+" samples")

    def __len__(self):
        """Return number of samples
        """
        return len(self.clean_audios)

    def update(self, seq_len):
        """Update the self.seq_len (in samples). We call this in the main
        training loop once per training iteration.
        """
        self.seq_len = int(seq_len)

    def __getitem__(self, index):
        """Generate samples.

        Returns:
            ``(chunk_mat, y)``: a random ``self.seq_len``-sample crop of the
            noisy signal and the crop of the clean signal taken at the same
            offset.
        """
        noisy_audio = self.noisy_audios[index]
        # NOTE(review): assumes the stored matrix has shape (n_samples, 1) — confirm.
        full_mat = kaldi_io.read_mat(noisy_audio).squeeze(1)
        assert len(full_mat) >= self.seq_len, \
            "{} has {} samples, fewer than seq_len={}".format(noisy_audio, len(full_mat), self.seq_len)
        # Random start offset such that the crop fits inside the noisy signal.
        pin = np.random.randint(0, len(full_mat) - self.seq_len + 1)
        chunk_mat = full_mat[pin:pin+self.seq_len]
        # The clean reference comes from self.clean_audios (there is no self.labels).
        # NOTE(review): assumes clean and noisy signals are sample-aligned and equally long.
        y, _ = librosa.load(str(self.clean_audios[index]), sr=self.sample_rate)
        y = y[pin:pin+self.seq_len]

        return chunk_mat, y


In [31]:
# Noisy/clean pairs cropped to 4-second chunks. DenoiseDataset is the class
# that takes (scp_file, min_length); SequenceDataset would additionally
# require utt2spkid_file and raise a TypeError here.
dataset = DenoiseDataset(scp_file='../data/test/noisy_audio.scp', min_length=4.0)

Totally 816 samples


In [17]:
import IPython.display as ipd

In [22]:
ipd.Audio(dataset[0][0], rate = 16000)

In [6]:
# Speaker-labelled feature chunks of 800 rows each.
dataset = SequenceDataset(
    scp_file="exp/processed/train_orig.scp",
    utt2spkid_file="exp/processed/utt2spkid",
    min_length=800,
)

Totally 195898 samples with at most 198 samples for one class


In [7]:
from model import NeuralSpeakerModel, AngleLoss

In [8]:
 model=NeuralSpeakerModel(model='resnet34', input_dim= 30, output_dim=1128, D=32, hidden_dim=512, \
            pooling='mean', network_type='lde', distance_type='sqr', asoftmax=True, m=2)

In [11]:
from torchsummary import summary

In [18]:
dataset[0][0].shape

(800, 30)

In [14]:
print(model)

NeuralSpeakerModel(
  (res): ResNet(
    (conv1): Conv2d(1, 16, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(16, 16,

In [26]:
summary(model, (800, 30), batch_size=128)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [128, 16, 800, 30]             784
       BatchNorm2d-2         [128, 16, 800, 30]              32
              ReLU-3         [128, 16, 800, 30]               0
            Conv2d-4         [128, 16, 800, 30]           2,304
       BatchNorm2d-5         [128, 16, 800, 30]              32
              ReLU-6         [128, 16, 800, 30]               0
            Conv2d-7         [128, 16, 800, 30]           2,304
       BatchNorm2d-8         [128, 16, 800, 30]              32
              ReLU-9         [128, 16, 800, 30]               0
       BasicBlock-10         [128, 16, 800, 30]               0
           Conv2d-11         [128, 16, 800, 30]           2,304
      BatchNorm2d-12         [128, 16, 800, 30]              32
             ReLU-13         [128, 16, 800, 30]               0
           Conv2d-14         [128, 16, 

In [20]:
# Count the scalar entries of every trainable parameter tensor.
model_params = 0
for p in model.parameters():
    if p.requires_grad:
        model_params += p.numel()
print('===> Model total parameter: {}'.format(model_params))

===> Model total parameter: 4015160


<hr>

## Check torchaudio.compliance.kaldi features against the Kaldi toolkit

In [25]:
# Add a leading dimension to the first item's array, giving a (1, n_samples) tensor.
# NOTE(review): the recorded output (1, 64000) implies `dataset` was the
# DenoiseDataset instance when this ran, not the SequenceDataset assigned in a
# cell above (whose items are (800, 30)) — re-create the right dataset before re-running.
torch.from_numpy(dataset[0][0]).unsqueeze(0).shape

torch.Size([1, 64000])

In [27]:
# Kaldi-compatible spectrogram of the same (1, n_samples) tensor, default options.
# NOTE(review): as in the previous cell, this expects `dataset` to hold raw audio
# (the DenoiseDataset instance) — confirm before re-running.
torchaudio.compliance.kaldi.spectrogram(torch.from_numpy(dataset[0][0]).unsqueeze(0)).shape

torch.Size([398, 257])

In [4]:
# Read only the first "<utt-id> <rxfile>" entry of the scp. The context manager
# closes the file even though the loop stops after one line.
with open('fbank/raw_fbank_train.1.scp') as scp:
    for line in scp:
        utt, path = line.rstrip().split()
        break

print("Utt: {}\nPath:{}".format(utt, path))

Utt: id10001-1zcIwhmdeo4-00001
Path:/home/faber6911/kaldi/egs/pytorch-kaldi-neural-speaker-embeddings/fbank/raw_fbank_train.1.ark:26


In [5]:
# read matrix and check dimensions
fbank_line = kaldi_io.read_mat(path)
print("Shape: {}\nMax: {}\nMin: {}".format(fbank_line.shape, np.max(fbank_line), np.min(fbank_line)))

Shape: (812, 30)
Max: 27.77927017211914
Min: 10.756635665893555


In [6]:
# from utt to path for import wav file
new_path = re.sub('-', '/', utt)
new_path = new_path+'.wav'
audio_path = os.path.join('/export/corpora/VoxCeleb1/dev/wav/', new_path)
print("Abs path for wav file:\n{}".format(audio_path))

Abs path for wav file:
/export/corpora/VoxCeleb1/dev/wav/id10001/1zcIwhmdeo4/00001.wav


In [7]:
# import wav file and check dims and duration
sr, waveform = wavfile.read(audio_path)
print("Shape: {}\nDuration: {} seconds".format(waveform.shape, waveform.shape[0]/sr))

Shape: (129921,)
Duration: 8.1200625 seconds


In [8]:
# from numpy to TorchTensor in order to use with torchaudio
waveform = torch.FloatTensor(waveform)
waveform = waveform.unsqueeze(0)
waveform.shape

torch.Size([1, 129921])

In [9]:
# fbank from torchaudio with same config
fbank_torch = torchaudio.compliance.kaldi.fbank(waveform,
                                                sample_frequency = sr,
                                                frame_length = 25,
                                                low_freq = 20,
                                                high_freq = 7600,
                                                num_mel_bins = 30,
                                                snip_edges = False)

In [10]:
print("fbank Kaldi toolkit:\n{}\n\nfbank torchaudio:\n{}".format(fbank_line, fbank_torch))

fbank Kaldi toolkit:
[[15.3331   17.081566 17.721813 ... 19.025589 19.246582 19.476   ]
 [14.083518 16.721992 17.103155 ... 19.966415 19.51774  19.911795]
 [14.944939 16.36242  16.39664  ... 20.000017 20.357042 21.019016]
 ...
 [13.966785 16.675093 17.25782  ... 17.336458 16.881641 18.509287]
 [14.258617 15.63033  17.50915  ... 18.303068 17.276648 17.981493]
 [14.880245 16.346786 18.475801 ... 17.882803 17.320536 18.509287]]

fbank torchaudio:
tensor([[15.3379, 17.0791, 17.7284,  ..., 19.0252, 19.2422, 19.4683],
        [14.1024, 16.7175, 17.1048,  ..., 19.9661, 19.5175, 19.9037],
        [14.9401, 16.3688, 16.4212,  ..., 19.9948, 20.3537, 21.0038],
        ...,
        [13.9402, 16.6806, 17.2543,  ..., 17.3316, 16.8983, 18.4902],
        [14.2535, 15.6409, 17.5064,  ..., 18.2910, 17.2929, 17.9776],
        [14.8852, 16.3422, 18.4661,  ..., 17.9035, 17.3213, 18.4965]])


In [11]:
print("//// torchaudio fbank -- kaldi toolkit fbank\nMax: {}, {}\nMin: {}, {}\nMean: {}, {}\nMedian: {}, {}".format(
                 torch.max(fbank_torch), np.max(fbank_line),
                 torch.min(fbank_torch), np.min(fbank_line),
                 torch.mean(fbank_torch), np.mean(fbank_line),
                 torch.median(fbank_torch), np.median(fbank_line)))

//// torchaudio fbank -- kaldi toolkit fbank
Max: 27.7794246673584, 27.77927017211914
Min: 10.755780220031738, 10.756635665893555
Mean: 19.649372100830078, 19.64935874938965
Median: 19.618640899658203, 19.622425079345703


The fbank features match up to small numerical differences, so the next step is to check the MFCCs.

In [12]:
# Read only the first "<utt-id> <rxfile>" entry of the scp. The context manager
# closes the file even though the loop stops after one line.
with open('mfcc/raw_mfcc_train_mfcc.1.scp') as scp:
    for line in scp:
        utt, path = line.rstrip().split()
        break

print("Utt: {}\nPath:{}".format(utt, path))

Utt: id10001-1zcIwhmdeo4-00001
Path:/home/faber6911/kaldi/egs/pytorch-kaldi-neural-speaker-embeddings/mfcc/raw_mfcc_train_mfcc.1.ark:26


In [13]:
# Load the Kaldi-computed MFCC matrix and report its shape and value range.
mfcc_line = kaldi_io.read_mat(path)
print("Shape: {}\nMax: {}\nMin: {}".format(mfcc_line.shape, mfcc_line.max(), mfcc_line.min()))

Shape: (812, 30)
Max: 59.16253662109375
Min: -76.8489990234375


The wav file is the same, so the audio is already available as a torch tensor (`waveform`).

In [14]:
# MFCC from torchaudio for the same waveform. use_energy=True is passed
# explicitly rather than relying on the torchaudio default.
# NOTE(review): the remaining option values presumably mirror the Kaldi mfcc
# config that produced raw_mfcc_train_mfcc — confirm against the recipe's conf file.
mfcc_torch = torchaudio.compliance.kaldi.mfcc(waveform = waveform,
                                              sample_frequency = sr,
                                             frame_length = 25,
                                             low_freq = 20,
                                             high_freq = 7600,
                                             num_mel_bins = 30,
                                             num_ceps = 30,
                                             snip_edges = False,
                                             use_energy=True)

The Kaldi toolkit computes MFCCs with `use_energy=True` by default, whereas `torchaudio.compliance.kaldi.mfcc` defaults to `use_energy=False`, so the parameter is set explicitly above.

In [15]:
print("mfcc Kaldi toolkit:\n{}\nmfcc torchaudio:\n{}".format(mfcc_line, mfcc_torch))

mfcc Kaldi toolkit:
[[ 20.106464   -11.908861     0.39882356 ...  -0.7912214    3.6647441
    2.9267256 ]
 [ 19.808418   -16.656345    -3.5222826  ...   3.485898     4.524192
    4.6507907 ]
 [ 20.021309   -17.447592    -1.2642107  ...   3.133277     5.6291966
    3.7312894 ]
 ...
 [ 21.30063     -3.422789   -24.859312   ...   1.1005099    3.1736312
   -2.7399864 ]
 [ 21.57147     -2.7626133  -24.859312   ...  -0.28873026  -2.1898746
   -2.7399864 ]
 [ 21.723818    -0.12190962 -23.017002   ...   4.1911397   -2.0781922
   -2.1426592 ]]
mfcc torchaudio:
tensor([[ 20.1075, -11.9517,   0.2819,  ...,  -0.8027,   3.6210,   2.9432],
        [ 19.7922, -16.6810,  -3.5334,  ...,   3.4430,   4.5605,   4.6365],
        [ 20.0252, -17.4113,  -1.2113,  ...,   3.1620,   5.5843,   3.7296],
        ...,
        [ 21.3083,  -3.3794, -24.9784,  ...,   1.0830,   3.2391,  -2.7685],
        [ 21.5762,  -2.8118, -24.9922,  ...,  -0.2775,  -2.2522,  -2.6962],
        [ 21.7162,  -0.1835, -22.9673,  ...,   4.

In [16]:
print("//// torchaudio mfcc -- kaldi toolkit mfcc\nMax: {}, {}\nMin: {}, {}\nMean: {}, {}\nMedian: {}, {}".format(
                 torch.max(mfcc_torch), np.max(mfcc_line),
                 torch.min(mfcc_torch), np.min(mfcc_line),
                 torch.mean(mfcc_torch), np.mean(mfcc_line),
                 torch.median(mfcc_torch), np.median(mfcc_line)))

//// torchaudio mfcc -- kaldi toolkit mfcc
Max: 59.11941146850586, 59.16253662109375
Min: -76.91753387451172, -76.8489990234375
Mean: -2.9523754119873047, -2.953380823135376
Median: -0.6154720187187195, -0.6105281114578247


## Writing .ark and .scp file

In [215]:
# Two example matrices keyed by the id they will be stored under: the
# torchaudio fbank (converted to NumPy) and the matrix read from Kaldi.
saving = dict(
    prova=fbank_torch.numpy(),
    prova2=fbank_line,
)

In [156]:
# Allow to write .ark file
with open('feats2.ark','wb') as f:
  for key,mat in saving.items():
    kaldi_io.write_mat(f, mat, key = key)

In [None]:
# work in progress, write .ark and .scp files all in one
ark_scp_output='ark:| copy-feats --compress=true ark:- ark,scp:data/feats2.ark,data/feats2.scp'
with kaldi_io.open_or_fd(ark_scp_output,'wb') as f:
    for key,mat in saving.items():
        kaldi_io.write_mat(f, mat, key = key)

In [180]:
# modules that allow to use writers and readers for kaldi obj
import kaldiio
from kaldiio import WriteHelper

In [126]:
# write .ark and .scp file all in one
with WriteHelper('ark,scp:file.ark,file.scp') as writer:
    for key, mat in saving.items():
        writer[key] = mat