In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from custom_dataset import CustomDataset
from collate import collate_fn
from TdAtt import *
from torch import nn
import pickle
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import librosa
import sklearn
from sklearn import preprocessing
import math
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
from datetime import datetime
import time
import copy
from sklearn.utils import shuffle
import torch.nn.functional as F
import warnings

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda')

In [4]:
def get_pkl(file_path):
    with open(file_path, 'rb') as file:
        loaded_list = pickle.load(file)
    return loaded_list

In [5]:
#np_wav, str_list = shuffle(get_pkl('mfcc.pkl'), get_pkl('str_list_cut.pkl'), random_state=0)
np_wav = get_pkl('mfcc.pkl')
str_list = get_pkl('str_list_cut.pkl')

In [6]:
trainc = 3000
valc = 500

In [7]:
vnp_wav = np_wav[trainc:trainc+valc]
vstr_list = str_list[trainc:trainc+valc]
np_wav = np_wav[:trainc]
str_list = str_list[:trainc]

In [8]:
print(len(np_wav))
print(len(str_list))

3000
3000


In [9]:
class VectorizeChar:
    def __init__(self, max_len=50):
        self.vocab = (
            ["", "-", "#", "<", ">"]
            + [chr(i + 96) for i in range(1, 27)]
            + [" ", ".", ",", "?"]
        )
        self.max_len = max_len
        self.char_to_idx = {}
        for i, ch in enumerate(self.vocab):
            self.char_to_idx[ch] = i

    def __call__(self, text):
        text = text.lower()
        text = text[: self.max_len - 2]
        text = "<" + text + ">"
        pad_len = self.max_len - len(text)
        return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len

    def get_vocabulary(self):
        return self.vocab

In [10]:
vectorizer = VectorizeChar(400)
print("vocab size", len(vectorizer.get_vocabulary()))
print(vectorizer("hey way i got a new complaint"))

vocab size 35
[3, 12, 9, 29, 31, 27, 5, 29, 31, 13, 31, 11, 19, 24, 31, 5, 31, 18, 9, 27, 31, 7, 19, 17, 20, 16, 5, 13, 18, 24, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
vect_str_list = [vectorizer(txt) for txt in str_list]
vvect_str_list = [vectorizer(txt) for txt in vstr_list]

In [12]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=469):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]`` no
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [13]:
class SpeechFeatureEmbedding(nn.Module):
    def __init__(self, embedding_dim, output_dim, num_2d, num_heads, num_hid=64, layernorm_eps=1e-6):
        super(SpeechFeatureEmbedding, self).__init__()
        self.num_layers = num_2d
        self.conv1 = nn.Sequential(nn.Conv2d(1, num_hid, 3, padding=(0, 1), stride=(2, 1)),
                             nn.BatchNorm2d(num_hid),
                             nn.LeakyReLU(),
                             #nn.Conv2d(num_hid, num_hid, 3, padding=(0, 1), stride=(2, 1)),
                             #nn.BatchNorm2d(num_hid),
                             #nn.ReLU()
                            )
        self.tda = nn.ModuleList([TwoD_Attention_layer(in_channels=num_hid, 
                                                    num_head=num_heads,
                                                    emb_dim=embedding_dim,
                                                    layernorm_eps=layernorm_eps) for _ in range(self.num_layers)])
        self.lin = nn.Linear(embedding_dim * num_hid, output_dim)
        
    def forward(self, x):
        x = self.conv1(x)
        for i in range(self.num_layers):
            x = self.tda[i](x)
        x = self.lin(x.view(x.size(0), x.size(2), -1))
        #return torch.squeeze(x, 1)
        return x

In [14]:
def FullyConnected(embedding_dim, fully_connected_dim):
    return nn.Sequential(
        nn.Linear(embedding_dim, fully_connected_dim),
        nn.ReLU(),
        nn.Linear(fully_connected_dim, embedding_dim)
        )

In [15]:
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(EncoderLayer, self).__init__()
        self.mha = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)
        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)
        
    def forward(self, inputs):
        
        self_mha_output, _ = self.mha(inputs, inputs, inputs)
        
        skip_attention = self.norm1(inputs + self_mha_output)
        
        ffn_output = self.ffn(skip_attention)
        
        ffn_output = self.dropout_ffn(ffn_output)
        
        encoder_layer_out = self.norm2(skip_attention + ffn_output)
        
        return encoder_layer_out        

embedding_dim = d_model
max_len = ntoken (time)

возможно в конце linear слой для настройки ембедінг дім

In [16]:
class Encoder(nn.Module):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim,
               max_len, output_dim,  dropout_rate=0.1, layernorm_eps=1e-6):
        super(Encoder, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        
        self.pad_length = self.output_dim[0] - max_len
        
        self.pos_encoding = PositionalEncoding(self.embedding_dim, dropout_rate, max_len) # 3
        
        """self.enc_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps).to(device) 
                           for _ in range(self.num_layers)]"""
        self.enc_layers = nn.ModuleList([EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) for _ in range(self.num_layers)])

        self.linear = nn.Sequential(
            nn.Linear(embedding_dim, output_dim[-1]),
            nn.ReLU()
            )
        self.linear2 = nn.Sequential(
            nn.Linear(max_len, output_dim[-2])
            #nn.ReLU()
            )

    def pad_tensor(self, input_tensor):

        padded_tensor = torch.nn.functional.pad(input_tensor, (0, 0, 0, self.pad_length), value=0.)

        return padded_tensor        
        
    def forward(self, inputs):
        
        #x = inputs * math.sqrt(self.embedding_dim) #
        x = self.pos_encoding(inputs) #
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        
        #x = self.linear(x)
        #x = self.pad_tensor(x)
        x = self.linear2(x.permute(0, 2, 1)).permute(0, 2, 1)
        return x

внутрь декодера (не уровня) maxlen = seq_len

In [17]:
class TokenEmbedding(nn.Module):
    def __init__(self, num_vocab=35, maxlen=400, embedding_dim=64, dropout_rate=0.1):
        super(TokenEmbedding, self).__init__()
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(num_vocab, embedding_dim)
        self.pos_emb = PositionalEncoding(embedding_dim, dropout=0, max_len=maxlen) # d_model, dropout=0.1, max_len=469

    def forward(self, inputs):
        x = self.emb(inputs)
        #x = x * math.sqrt(self.embedding_dim)
        x = self.pos_emb(x)
        return x

In [18]:
class DecoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()
        
        self.mha1 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.mha2 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)
        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm3 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)
    
    def forward(self, inputs, enc_output): # look_ahead_mask !!!!
        
        seq_len = inputs.size(1)
        ahead_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1).to(device)
        #ahead_mask = self.create_look_ahead_mask2(seq_len)
        
        self_mha1_output, _ = self.mha1(inputs, inputs, inputs, attn_mask=ahead_mask) # look_ahead_mask !!!! batch
        Q1 = self.norm1(self_mha1_output + inputs)
        
        self_mha2_output, _ = self.mha2(query=Q1, key=enc_output, value=enc_output) # pad mask  ???
        skip_attention2 = self.norm2(self_mha2_output + Q1)
        
        ffn_output = self.ffn(skip_attention2)
        drop_output = self.dropout_ffn(ffn_output)
        skip3 = self.norm3(drop_output + skip_attention2)
        
        return skip3
        

In [19]:
class Decoder(nn.Module):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim,  #target_vocab_size, maximum_position_encoding,
                 num_vocab=35, maxlen=400, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Decoder, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.token_emb = TokenEmbedding(num_vocab=num_vocab, maxlen=maxlen, embedding_dim=embedding_dim) # num_vocab=34, maxlen=400, embedding_dim=64
        
        self.dec_layers = nn.ModuleList([DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) for _ in range(self.num_layers)])
        
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, inputs, enc_output):
        
        x = self.token_emb(inputs) # torch.Size([Batch, 400, 64])
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output) # torch.Size([Batch, 400, 64])
        
        return x

In [20]:
class Transformer(nn.Module):
    def __init__(self, num_layers_2d, num_layers_encoder, num_layers_decoder, embedding_dim_encoder, embedding_dim_decoder,
                 num_heads_2d, num_heads_encoder, num_heads_decoder, fully_connected_dim_encoder, fully_connected_dim_decoder,
                 target_vocab_size, max_len_enc, max_len_dec,
                 enc_output_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()
        
        self.sfe = SpeechFeatureEmbedding(embedding_dim_encoder, enc_output_dim[-1], num_layers_2d, num_heads_2d, 
                                          layernorm_eps=layernorm_eps) # torch.Size([1, 116, 20])
        
        self.encoder = Encoder(num_layers=num_layers_encoder,
                               embedding_dim=enc_output_dim[-1],
                               #embedding_dim=embedding_dim_encoder,
                               num_heads=num_heads_encoder,
                               fully_connected_dim=fully_connected_dim_encoder,
                               max_len=max_len_enc, # 116
                               output_dim=enc_output_dim, # (400, 64)
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps) # torch.Size([1, 400, 64])
        
        """encoder = Encoder(num_layers=2,
                            embedding_dim=20,
                            num_heads=10,
                            fully_connected_dim=100,
                            max_len=116,
                            output_dim=(400, 64),
                            dropout_rate=0)"""
        
        self.decoder = Decoder(num_layers=num_layers_decoder, 
                               embedding_dim=embedding_dim_decoder,
                               num_heads=num_heads_decoder,
                               fully_connected_dim=fully_connected_dim_decoder,
                               num_vocab=target_vocab_size, # num_vocab=35
                               maxlen=max_len_dec,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)
        """num_layers, embedding_dim, num_heads, fully_connected_dim,
                 num_vocab=34, maxlen=400, dropout_rate=0.1, layernorm_eps=1e-6"""
        
        self.linear = nn.Sequential(
            #nn.LazyLinear(target_vocab_size),
            nn.Linear(embedding_dim_decoder, target_vocab_size),
            #nn.Softmax(dim=-1) # 1
            )
        
    def forward(self, input_spect_t, output_vect_str):
        
        enc_input = self.sfe(input_spect_t) # torch.Size([1, 116, 20]) 1 = N batches
        
        enc_output = self.encoder(enc_input)
        
        dec_output = self.decoder(output_vect_str, enc_output)  # torch.Size([Batch, 400, 64])
        
        final_output = self.linear(dec_output)
        
        return final_output  # [Batch, 400, 35]
        

In [21]:
dataset = CustomDataset(np_wav, vect_str_list)
vdataset = CustomDataset(vnp_wav, vvect_str_list)
dataloader = DataLoader(dataset, batch_size=20, collate_fn=collate_fn, num_workers=2) # + num thread num_workers=6,
vdataloader = DataLoader(vdataset, batch_size=20, collate_fn=collate_fn, num_workers=2)

In [22]:
def train_step(model, loss_fn, opt, loader):
    loss_per_batches = 0
    elapsed = 0
    start_epoch2 = time.time()
    for i, data in enumerate(loader):

        start_epoch = time.time()
        features, labels = data
        features, labels = features.to(device), labels.to(device)
        opt.zero_grad()
        
        dec_input = labels[:, :-1]
        dec_target = labels[:, 1:]
        
        y_pred = model(features, dec_input)
        
        #one_hot = nn.functional.one_hot(labels, 35).type(torch.float)
        #indices = torch.nonzero(torch.eq(labels, 0))[0].item()
        #print(str(labels) + "y_pred")
        #print(labels.shape)
        
        loss = loss_fn(y_pred.view(-1, y_pred.size(-1)), dec_target.contiguous().view(-1))
        loss.backward()
        
        opt.step()
        
        loss_per_batches += loss
        
        end_epoch = time.time()
        elapsed += (end_epoch - start_epoch)
        
    print("train = " + str(elapsed))
    print("train + load = " + str(time.time() - start_epoch2))
    return loss_per_batches/(i+1)

In [23]:
def train(model, loss_fn, opt, train_loader, val_loader, save_treshold=5, epochs=10, model_name='model_name'):
        
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/' + model_name + '_{}'.format(timestamp))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min', patience=3, verbose=True)
    
    for epoch in range(epochs):
        start_epoch = time.time()
        print('EPOCH {}:'.format(epoch + 1))
        
        model.train()
        avg_loss = train_step(model, loss_fn, opt, train_loader)
        model.eval()
        
        vloss = 0
        counter = 0
        with torch.inference_mode():
            for i, vdata in enumerate(val_loader):
                vfeatures, vlabels = vdata
                vfeatures, vlabels = vfeatures.to(device), vlabels.to(device)
                dec_input = vlabels[:, :-1]
                dec_target = vlabels[:, 1:]

                y_pred = model(vfeatures, dec_input)

                vloss += loss_fn(y_pred.view(-1, y_pred.size(-1)), dec_target.contiguous().view(-1))
                counter = i

        avg_vloss = vloss / (counter + 1)
        
        scheduler.step(avg_loss)
        
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
        
        writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch + 1)
        
        if (epoch + 1) % save_treshold == 0:
            model_path = model_name +'_{}_{}'.format(timestamp, epoch)
            torch.save(model.state_dict(), model_path)
        end_epoch = time.time()
        elapsed = end_epoch - start_epoch
        print("Time per epoch {}s".format(elapsed))

In [24]:
model = Transformer(2, 6, 3, 20, 64, #10, 8, 127, 64 (512)
                32, 8, 8, 1024, 1024,
                35, 234, 399, # 116
                (399, 64),
                dropout_rate=0.3)  # (, 64)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
model.to(device)

Transformer(
  (sfe): SpeechFeatureEmbedding(
    (conv1): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 1), padding=(0, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (tda): ModuleList(
      (0-1): 2 x TwoD_Attention_layer(
        (convq): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
        (convk): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
        (convv): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
        (bnq): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bnk): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bnv): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (ln): LayerNorm((20,), eps=1e-06, el

In [25]:
summary(model)
pass

Layer (type:depth-idx)                        Param #
├─SpeechFeatureEmbedding: 1-1                 --
|    └─Sequential: 2-1                        --
|    |    └─Conv2d: 3-1                       640
|    |    └─BatchNorm2d: 3-2                  128
|    |    └─LeakyReLU: 3-3                    --
|    └─ModuleList: 2-2                        --
|    |    └─TwoD_Attention_layer: 3-4         166,664
|    |    └─TwoD_Attention_layer: 3-5         166,664
|    └─Linear: 2-3                            81,984
├─Encoder: 1-2                                --
|    └─PositionalEncoding: 2-4                --
|    |    └─Dropout: 3-6                      --
|    └─ModuleList: 2-5                        --
|    |    └─EncoderLayer: 3-7                 149,056
|    |    └─EncoderLayer: 3-8                 149,056
|    |    └─EncoderLayer: 3-9                 149,056
|    |    └─EncoderLayer: 3-10                149,056
|    |    └─EncoderLayer: 3-11                149,056
|    |    └─EncoderLaye

In [26]:
class Transformer2(nn.Module):
    def __init__(self, target_vocab_size, d_model, nhead, num_layers):
        super(Transformer2, self).__init__()

        #self.encoder = nn.Embedding(input_vocab_size, d_model)
        self.encoder = SpeechFeatureEmbedding(20, d_model) 
        self.decoder = nn.Embedding(target_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, dropout=0.2, max_len=400)

        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers[0],
                                          num_decoder_layers=num_layers[1], dropout=0.2)

        self.fc = nn.Linear(d_model, target_vocab_size)

    def forward(self, src, tgt):
        src_emb = self.encoder(src)
        tgt_emb = self.decoder(tgt)
        src_emb = self.pos_enc(src_emb)
        tgt_emb = self.pos_enc(tgt_emb)
        
        src_emb = src_emb.permute(1, 0, 2)  # Change shape from [batch_size, seq_len_src, embedding_dim] to [seq_len_src, batch_size, embedding_dim]
        tgt_emb = tgt_emb.permute(1, 0, 2)

        memory = self.transformer.encoder(src_emb)

        tgt_len = tgt_emb.size(0)
        tgt_mask = torch.triu(torch.ones(tgt_len, tgt_len), diagonal=1).bool().to(device)

        output = self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

        output = output.permute(1, 0, 2)  # Change shape back to [batch_size, seq_len_tgt, d_model]
        output = self.fc(output)

        return output

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    train(model, loss_fn, optimizer, dataloader, vdataloader, epochs=500, model_name=model.__class__.__name__)

EPOCH 1:
train = 107.54716348648071
train + load = 113.1325843334198
LOSS train 2.6018409729003906 valid 2.4398040771484375
Time per epoch 122.16564965248108s
EPOCH 2:
train = 103.28090357780457
train + load = 109.19851589202881
LOSS train 2.4420673847198486 valid 2.3944008350372314
Time per epoch 118.130526304245s
EPOCH 3:
train = 103.60813355445862
train + load = 109.48905944824219
LOSS train 2.403848648071289 valid 2.3689873218536377
Time per epoch 118.2852623462677s
EPOCH 4:
train = 103.63166046142578
train + load = 109.38065981864929
LOSS train 2.3804068565368652 valid 2.3469133377075195
Time per epoch 118.39172959327698s
EPOCH 5:
train = 103.36095857620239
train + load = 109.24494934082031
LOSS train 2.3604836463928223 valid 2.3197081089019775
Time per epoch 118.17771625518799s
EPOCH 6:
train = 103.40399241447449
train + load = 109.50954723358154
LOSS train 2.3410985469818115 valid 2.2951416969299316
Time per epoch 118.34860396385193s
EPOCH 7:
train = 103.51073503494263
train + l

In [None]:
torch.triu(torch.ones(5, 5, dtype=torch.bool), diagonal=1)

In [None]:
model.eval()
pass

In [None]:
voc = vectorizer.get_vocabulary()

In [None]:
def process_data_mfcc(np_wav, nperseg=1024, samplerate=24000):
    mfcc = librosa.feature.mfcc(y=np_wav.astype(float), sr=samplerate, hop_length=nperseg)
    pd = sklearn.preprocessing.scale(mfcc, axis=1)
    new_shape = int(469 * 1024/nperseg)
    pad = np.pad(pd, ((0, 0), (0, new_shape - pd.shape[1])), mode='constant')
    return torch.tensor(np.expand_dims(np.swapaxes(pad,0,1), axis=0), dtype=torch.float)

In [None]:
inp = torch.tensor([3]).to(device)
dec_out = list()
for i in range(400 - 1):
    res = model(process_data_mfcc(np_wav[3]).unsqueeze(0).to(device), 
            inp.unsqueeze(0).to(device)).squeeze(0)
    #print(len(inp))
    soft_out = nn.functional.softmax(res, dim=-1)
    last_logit = soft_out.argmax(dim=-1)[-1].unsqueeze(0)
    #print(last_logit)
    dec_out.append(last_logit)
    inp = torch.cat((inp, last_logit))

In [None]:
out = ""
for x in inp:
    out += voc[x]

In [None]:
str(out)

In [None]:
str_list[0]

In [None]:
res

In [None]:
"0" * 399

In [None]:
str_list[3]

In [None]:
inp2 = "<"

In [None]:
inp2

In [None]:
res = model(process_data_mfcc(vnp_wav[10]).unsqueeze(0).to(device), 
            torch.tensor(vvect_str_list[10])[:-1].unsqueeze(0).to(device)).squeeze(0)

output_str = str()
voc = vectorizer.get_vocabulary()

inp = "<"
for i in range(399):
    output_str += voc[res[i].argmax()]

In [None]:
res = model(process_data_mfcc(np_wav[0]).unsqueeze(0).to(device), 
            torch.tensor(vectorizer("<matth"))[:-1].unsqueeze(0).to(device)).squeeze(0)

output_str = str()
voc = vectorizer.get_vocabulary()

for i in range(399):
    output_str += voc[res[i].argmax()]

In [None]:
output_str

In [None]:
str_list[1000]

In [None]:
process_data_mfcc(np_wav[0])

In [None]:
indices = torch.nonzero(torch.eq(tensor, 0))[0].item()

In [None]:
indices

In [None]:
tensor = torch.empty(32, 400)


In [None]:
print(tensor.view(-1, tensor.size(-1)).shape)

In [None]:
tensor.view(-1).shape

In [None]:
print(tensor.to(device))

In [None]:
x = process_data_mfcc(np_wav[0])

In [None]:
x

In [None]:
def create_look_ahead_mask(sequence_length): # + batch size * num heads

    mask = torch.tril(torch.ones((sequence_length, sequence_length)))
    return mask

In [None]:
def func(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.masked_fill(mask == 0, int(1)).masked_fill(mask == 1, int(0))
    return mask

In [None]:
func(10)

In [None]:
mha_t = nn.MultiheadAttention(embed_dim=2, num_heads=1, dropout=0, batch_first=True)

In [None]:
tensor = torch.tensor([[[2, 3],[4, 5],[6, 7]]], dtype=torch.float)

In [None]:
tensor.shape

In [None]:
o1, o2 = mha_t(tensor, tensor, tensor, attn_mask=func(3), average_attn_weights=False)

In [None]:
o1

In [None]:
o2

In [None]:
loss_fn2 = torch.nn.CrossEntropyLoss(ignore_index=0)

In [None]:
vec1 = torch.tensor([1, 2])
vec2 = torch.tensor([[0.4 , 0.3, 0.3], [0.2, 0.3, 0.5]])

In [None]:
vec2.shape

In [None]:
loss_fn2(vec2, vec1)

In [None]:
vec1

In [None]:
for i, data in enumerate(dataloader):

    features, labels = data
    features, labels = features.to(device), labels.to(device)
    break

In [None]:
out = model(features, labels[:, :-1])

In [None]:
out[0, 1].sum()

In [None]:
loss = loss_fn2(out.permute(0, 2, 1), labels[:, 1:])

In [None]:
loss

In [None]:
labels[:, 1:].contiguous().view(-1).shape

In [None]:
out.view(-1, out.size(-1)).shape

In [None]:
nonzero_indices = torch.nonzero(labels[:, 1:].contiguous().view(-1))

In [None]:
t1 = labels[:, 1:].contiguous().view(-1)[nonzero_indices].squeeze(-1)

In [None]:
t2 = out.view(-1, out.size(-1))[nonzero_indices][:].squeeze(1)

In [None]:
t1.shape

In [None]:
t2.shape

In [None]:
loss_fn2(t2, t1)