In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from custom_dataset import CustomDataset
from collate import collate_fn
from torch import nn
import pickle
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import librosa
import sklearn
from sklearn import preprocessing
import math
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
from datetime import datetime
import time

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda')

In [4]:
def get_pkl(file_path):
    with open(file_path, 'rb') as file:
        loaded_list = pickle.load(file)
    return loaded_list

In [5]:
np_wav = get_pkl('np_wavs_cut.pkl')

In [6]:
np_wav = np_wav[:1000]

In [7]:
str_list = get_pkl('str_list_cut.pkl')

In [8]:
str_list = str_list[:1000]

In [9]:
print(len(np_wav))
print(len(str_list))

1000
1000


In [10]:
def pad_tensor(input_tensor, desired_length, fill_value=float('-inf')):
    _, input_length, num_channels = input_tensor.size()

    #padded_tensor = torch.nn.functional.pad(input_tensor, (0, 0, 0, pad_length))
    pad_length = desired_length - input_length
    padding = (0, 0, 0, pad_length)  # Pad (left, right, top, bottom)
    padded_tensor = torch.nn.functional.pad(input_tensor, padding, value=fill_value)
    
    return padded_tensor

In [11]:
class VectorizeChar:
    def __init__(self, max_len=50):
        self.vocab = (
            ["", "-", "#", "<", ">"]
            + [chr(i + 96) for i in range(1, 27)]
            + [" ", ".", ",", "?"]
        )
        self.max_len = max_len
        self.char_to_idx = {}
        for i, ch in enumerate(self.vocab):
            self.char_to_idx[ch] = i

    def __call__(self, text):
        text = text.lower()
        text = text[: self.max_len - 2]
        text = "<" + text + ">"
        pad_len = self.max_len - len(text)
        return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len

    def get_vocabulary(self):
        return self.vocab

In [12]:
vectorizer = VectorizeChar(400)
print("vocab size", len(vectorizer.get_vocabulary()))
print(vectorizer("hey way i got a new complaint"))

vocab size 35
[3, 12, 9, 29, 31, 27, 5, 29, 31, 13, 31, 11, 19, 24, 31, 5, 31, 18, 9, 27, 31, 7, 19, 17, 20, 16, 5, 13, 18, 24, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
vect_str_list = [vectorizer(txt) for txt in str_list]

In [14]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=469):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]`` no
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [15]:
class SpeechFeatureEmbedding(nn.Module):
    def __init__(self, embedding_dim, num_hid=64):
        super(SpeechFeatureEmbedding, self).__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(1, num_hid, 3, padding=(0, 1), stride=(2, 1)),
                             nn.BatchNorm2d(num_hid),
                             nn.ReLU(),
                             nn.Conv2d(num_hid, num_hid, 3, padding=(0, 1), stride=(2, 1)),
                             nn.BatchNorm2d(num_hid),
                             nn.ReLU()
                            )
        self.conv3 = nn.Sequential(nn.Conv2d(num_hid, 1, 1, padding="same"),
                             nn.BatchNorm2d(1),
                             nn.ReLU(),
                            )
        self.lin = nn.Linear(embedding_dim, embedding_dim, bias=False)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv3(x)
        x = self.lin(x)
        return torch.squeeze(x, 1)

In [16]:
def FullyConnected(embedding_dim, fully_connected_dim):
    return nn.Sequential(
        nn.Linear(embedding_dim, fully_connected_dim),
        nn.ReLU(),
        nn.Linear(fully_connected_dim, embedding_dim)
        )

In [17]:
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(EncoderLayer, self).__init__()
        self.mha = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)
        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)
        
    def forward(self, inputs):
        
        self_mha_output, _ = self.mha(inputs, inputs, inputs)
        
        skip_attention = self.norm1(torch.add(inputs, self_mha_output))
        
        ffn_output = self.ffn(skip_attention)
        
        ffn_output = self.dropout_ffn(ffn_output)
        
        encoder_layer_out = self.norm2(torch.add(skip_attention, ffn_output))
        
        return encoder_layer_out        

embedding_dim = d_model
max_len = ntoken (time)

возможно в конце linear слой для настройки ембедінг дім

In [18]:
class Encoder(nn.Module):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim,
               max_len, output_dim,  dropout_rate=0.1, layernorm_eps=1e-6):
        super(Encoder, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        
        self.pos_encoding = PositionalEncoding(self.embedding_dim, dropout_rate, max_len) # 3
        
        self.enc_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps).to(device) 
                           for _ in range(self.num_layers)]
        self.linear = nn.Linear(embedding_dim, output_dim[-1])
    
    def forward(self, inputs):
        
        #x = inputs * math.sqrt(self.embedding_dim) #
        x = self.pos_encoding(inputs) #
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        
        x = self.linear(x)
        x = pad_tensor(x, self.output_dim[-2], fill_value=0.)
        return x

внутрь декодера (не уровня) maxlen = seq_len

In [19]:
class TokenEmbedding(nn.Module):
    def __init__(self, num_vocab=35, maxlen=400, embedding_dim=64, dropout_rate=0.1):
        super(TokenEmbedding, self).__init__()
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(num_vocab, embedding_dim)
        self.pos_emb = PositionalEncoding(embedding_dim, dropout=0, max_len=maxlen) # d_model, dropout=0.1, max_len=469

    def forward(self, inputs):
        x = self.emb(inputs)
        #x = x * math.sqrt(self.embedding_dim)
        x = self.pos_emb(x)
        return x

In [20]:
class DecoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()
        
        self.mha1 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.mha2 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)
        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm3 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)

    def create_look_ahead_mask(self, sequence_length): # + batch size * num heads

        mask = torch.tril(torch.ones((sequence_length, sequence_length)))
        return mask.to(device)

    def forward(self, inputs, enc_output): # look_ahead_mask !!!!
        
        seq_len = inputs.size(1)
        ahead_mask = self.create_look_ahead_mask(seq_len)
        
        self_mha1_output, _ = self.mha1(inputs, inputs, inputs, attn_mask=ahead_mask) # look_ahead_mask !!!! batch
        Q1 = self.norm1(torch.add(self_mha1_output, inputs))
        
        self_mha2_output, _ = self.mha2(query=Q1, key=enc_output, value=enc_output) # pad mask  ???
        skip_attention2 = self.norm2(torch.add(self_mha2_output, Q1))
        
        ffn_output = self.ffn(skip_attention2)
        drop_output = self.dropout_ffn(ffn_output)
        skip3 = self.norm3(drop_output)
        
        return skip3
        

In [21]:
class Decoder(nn.Module):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim,  #target_vocab_size, maximum_position_encoding,
                 num_vocab=35, maxlen=400, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Decoder, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.token_emb = TokenEmbedding(num_vocab=num_vocab, maxlen=maxlen, embedding_dim=embedding_dim) # num_vocab=34, maxlen=400, embedding_dim=64
        
        self.dec_layers = [DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps).to(device) 
                           for _ in range(self.num_layers)]
        
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, inputs, enc_output):
        
        x = self.token_emb(inputs) # torch.Size([Batch, 400, 64])
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output) # torch.Size([Batch, 400, 64])
        
        return x    
        

In [22]:
class Transformer(nn.Module):
    def __init__(self, num_layers_encoder, num_layers_decoder, embedding_dim_encoder, embedding_dim_decoder,
                 num_heads_encoder, num_heads_decoder, fully_connected_dim_encoder, fully_connected_dim_decoder,
                 target_vocab_size, max_len_enc, max_len_dec,
                 enc_output_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()
        
        self.sfe = SpeechFeatureEmbedding(embedding_dim_encoder) # torch.Size([1, 116, 20])
        
        self.encoder = Encoder(num_layers=num_layers_encoder,
                               embedding_dim=embedding_dim_encoder,
                               num_heads=num_heads_encoder,
                               fully_connected_dim=fully_connected_dim_encoder,
                               max_len=max_len_enc, # 116
                               output_dim=enc_output_dim, # (400, 64)
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps) # torch.Size([1, 400, 64])
        
        """encoder = Encoder(num_layers=2,
                            embedding_dim=20,
                            num_heads=10,
                            fully_connected_dim=100,
                            max_len=116,
                            output_dim=(400, 64),
                            dropout_rate=0)"""
        
        self.decoder = Decoder(num_layers=num_layers_decoder, 
                               embedding_dim=embedding_dim_decoder,
                               num_heads=num_heads_decoder,
                               fully_connected_dim=fully_connected_dim_decoder,
                               num_vocab=target_vocab_size, # num_vocab=35
                               maxlen=max_len_dec,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)
        """num_layers, embedding_dim, num_heads, fully_connected_dim,
                 num_vocab=34, maxlen=400, dropout_rate=0.1, layernorm_eps=1e-6"""
        
        self.linear = nn.Sequential(
            #nn.LazyLinear(target_vocab_size),
            nn.Linear(embedding_dim_decoder, target_vocab_size),
            nn.Softmax(dim=1)
            )
        
    def forward(self, input_spect_t, output_vect_str):
        
        enc_input = self.sfe(input_spect_t) # torch.Size([1, 116, 20]) 1 = N batches
        
        enc_output = self.encoder(enc_input)
        
        dec_output = self.decoder(output_vect_str, enc_output)  # torch.Size([Batch, 400, 64])
        
        final_output = self.linear(dec_output)
        
        return final_output  # [Batch, 400, 35]
        

In [23]:
dataset = CustomDataset(np_wav, vect_str_list)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, num_workers=3) # + num thread num_workers=6,

In [24]:
def train_step(model, loss_fn, opt, loader):
    loss_per_batches = 0
    elapsed = 0
    start_epoch2 = time.time()
    for i, data in enumerate(loader):

        start_epoch = time.time()
        features, labels = data
        features, labels = features.to(device), labels.to(device)
        opt.zero_grad()
        
        y_pred = model(features, labels)
        
        one_hot = nn.functional.one_hot(labels, 35).type(torch.float)
        
        loss = loss_fn(y_pred, one_hot)
        loss.backward()
        
        opt.step()
        
        loss_per_batches += loss
        
        end_epoch = time.time()
        elapsed += (end_epoch - start_epoch)
        
    print("train = " + str(elapsed))
    print("train + load = " + str(time.time() - start_epoch2))
    return loss_per_batches/(i+1)

In [25]:
def train(model, loss_fn, opt, train_loader, save_treshold=5, epochs=10, model_name='model_name'):
        
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/' + model_name + '_{}'.format(timestamp))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min', patience=3, verbose=True)
    
    for epoch in range(epochs):
        start_epoch = time.time()
        print('EPOCH {}:'.format(epoch + 1))
        
        model.train()
        avg_loss = train_step(model, loss_fn, opt, train_loader)
        model.eval()
        
        scheduler.step(avg_loss)
        
        print('LOSS train {}'.format(avg_loss))
        
        writer.add_scalars('Training Loss',
                    { 'Training' : avg_loss },
                    epoch + 1)
        
        if (epoch + 1) % save_treshold == 0:
            model_path = model_name +'_{}_{}'.format(timestamp, epoch)
            torch.save(model.state_dict(), model_path)
        end_epoch = time.time()
        elapsed = end_epoch - start_epoch
        print("Time per epoch {}s".format(elapsed))

                (self, num_layers_encoder, num_layers_decoder, embedding_dim_encoder, embedding_dim_decoder,
                 num_heads_encoder, num_heads_decoder, fully_connected_dim_encoder, fully_connected_dim_decoder,
                 target_vocab_size, max_len_enc, max_len_dec,
                 enc_output_dim, dropout_rate=0.1, layernorm_eps=1e-6)

In [26]:
model = Transformer(4, 2, 20, 32, #10, 8, , 64
                10, 8, 100, 100,
                35, 116, 400,
                (400, 32))  # (, 64)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model.to(device)

Transformer(
  (sfe): SpeechFeatureEmbedding(
    (conv1): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 1), padding=(0, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 1), padding=(0, 1))
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU()
    )
    (conv3): Sequential(
      (0): Conv2d(64, 1, kernel_size=(1, 1), stride=(1, 1), padding=same)
      (1): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (lin): Linear(in_features=20, out_features=20, bias=False)
  )
  (encoder): Encoder(
    (pos_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (linear): Linear(in_features=20, out_features=32, bias=True)
  )
  (decoder): Decoder(
    (token_emb): TokenEmbedding(
      (emb): Embedding(35, 32)
  

In [27]:
train(model, loss_fn, optimizer, dataloader, epochs=100, model_name=model.__class__.__name__)

EPOCH 1:
train = 13.206430912017822
train + load = 24.45325803756714
LOSS train 68.47205352783203
Time per epoch 24.457258939743042s
EPOCH 2:
train = 4.839326858520508
train + load = 15.484957218170166
LOSS train 68.46662139892578
Time per epoch 15.49595832824707s
EPOCH 3:
train = 4.833075046539307
train + load = 15.409869194030762
LOSS train 68.45516204833984
Time per epoch 15.42087173461914s
EPOCH 4:
train = 4.852826118469238
train + load = 15.201218128204346
LOSS train 68.42826080322266
Time per epoch 15.209206342697144s
EPOCH 5:
train = 4.953278541564941
train + load = 15.008864164352417
LOSS train 68.37508392333984
Time per epoch 15.076348066329956s
EPOCH 6:
train = 4.977241039276123
train + load = 15.392274856567383
LOSS train 68.2989730834961
Time per epoch 15.399282217025757s
EPOCH 7:
train = 5.035881757736206
train + load = 15.49367880821228
LOSS train 68.21434783935547
Time per epoch 15.501662254333496s
EPOCH 8:
train = 4.959097623825073
train + load = 15.278133869171143
LOSS

KeyboardInterrupt: 