In [1]:
%load_ext autoreload
%autoreload 2

## 1.4 Tags

In [5]:
from train import AudioTransformer
import torch
from dataset import RAHWhisperDataset, RAHWhisperTokenizer, RIRAug, NoiseAug
from utils import seed_everything
import editdistance
import re
seed_everything(42)
torch.set_default_dtype(torch.float64)
## IF not backend is detected
# import torchaudio

# # Check the current backend
# torchaudio.set_audio_backend("soundfile")
# torchaudio.list_audio_backends()

### 1.4.1 Training

In [2]:
tokenizer = RAHWhisperTokenizer(path='fechas1/fechas1_tag_train.csv')
trainset=RAHWhisperDataset('fechas1/fechas1_tag_train.csv', tokenizer, transform=[NoiseAug(), RIRAug()])
testset=RAHWhisperDataset('fechas1/fechas1_tag_test.csv', tokenizer)

In [4]:
model = AudioTransformer(vocab_size=len(tokenizer.idx2word.keys()), d_model=128, nb_layers=4, 
                         d_ff=256, n_heads=4, d_head=32, dropout=0.1, seq_len=500)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-4)

nb_epochs = 5
batch_size = 16
model.train()

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
for e in range(nb_epochs):
    avg_loss = 0
    for x, y in trainloader:
        x = x.to(device)
        y = y.to(device)
        opt.zero_grad()
        loss = model.loss(x, y)
        loss.backward()
        opt.step()
        avg_loss += loss.item()
    print('epoch %d/%d: avg_loss: %.2f' % (e,nb_epochs,avg_loss/len(trainloader)))
       
torch.save([model, opt], 'model_14.pt')
torch.save(tokenizer, 'tokenizer_14.pth')


epoch 0/5: avg_loss: 0.35
epoch 1/5: avg_loss: 0.16
epoch 2/5: avg_loss: 0.10
epoch 3/5: avg_loss: 0.07
epoch 4/5: avg_loss: 0.05


### 1.3.2 Test

In [6]:
def filter(texto):
    texto=tokenizer.decode(texto)
    # print(texto)
    valores = re.findall(r'\((.*?)\)', texto)
    valores= " ".join(valores)
    return valores

In [12]:
[model, opt] = torch.load('model_14.pt')
tokenizer = torch.load('tokenizer_14.pth')
testset=RAHWhisperDataset('fechas1/fechas1_tag_test.csv', tokenizer)

model.eval()
err = 0
num = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for i,(x, y) in enumerate(testset):    
    x = x.to(device)    
    y_pred = model.generate(x[None,...])
    # hyp = ' '.join([str(i) for i in y_pred[1:-1]])
    # print('hyp', hyp)
    hyp = filter(y_pred[1:-1])
    y = y.numpy().tolist()
    # find the first 3 <eos> in list y
    # y = y[:y.index(3)]
    # ref = ' '.join([str(i) for i in y[1:]])
    # print('ref', ref)
    ref = filter(y[1:y.index(3)])
    # print('(%d/%d)' % (i, len(testset)) )

    
    # edit distance
    err += editdistance.eval(hyp, ref)
    num += len(ref.split())
    
print(f'error rate {err/num:.2%},  ({err}/{num})')

  [model, opt] = torch.load('model_14.pt')
  tokenizer = torch.load('tokenizer_14.pth')


error rate 7.00%,  (70/1000)
