In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import custom_dataset
import pickle
import tiktoken
import time
from datetime import datetime
import math
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
from custom_dataset import CustomDataset

# Pickle Functions

In [2]:
def save_pkl(file_path, data):
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)

In [3]:
def get_pkl(file_path):
    with open(file_path, 'rb') as file:
        loaded_list = pickle.load(file)
    return loaded_list

# Load dataset

In [4]:
X = get_pkl("dataset/x.pkl")
Y = get_pkl("dataset/y.pkl")

# Encode tiktoken

In [5]:
enc = tiktoken.get_encoding("cl100k_base")

In [6]:
def tokenization(result, enc2):
    output = list()
    for i in result:
        output.append(enc2.encode(i, allowed_special={'<|im_start|>', "<|im_end|>"}))
    return output

In [7]:
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc2 = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    }
)

In [8]:
new_X = ["<|im_start|>" + sentence + "<|im_end|>" for sentence in X]
new_Y = ["<|im_start|>" + sentence + "<|im_end|>" for sentence in Y]

In [9]:
enc_X = tokenization(X, enc)
enc_Y = tokenization(Y, enc)

In [10]:
def delete_mount(sentence):
    if 'mount' in sentence:
        return sentence.replace('mount ', '')
    return sentence

In [11]:
new_Y = [delete_mount(sentence) for sentence in new_Y]

In [12]:
enc_X2 = tokenization(new_X, enc2)
enc_Y2 = tokenization(new_Y, enc2)

In [13]:
new_Y

['<|im_start|>mckinley<|im_end|>',
 '<|im_start|>aconcagua<|im_end|>',
 '<|im_start|>aconcagua<|im_end|>',
 '<|im_start|>machu picchu<|im_end|>',
 '<|im_start|>damavand<|im_end|>',
 '<|im_start|>denali<|im_end|>',
 '<|im_start|>rainier<|im_end|>',
 '<|im_start|>annapurna<|im_end|>',
 '<|im_start|>whitney<|im_end|>',
 '<|im_start|>pikes peak, rainier<|im_end|>',
 '<|im_start|>gissar range<|im_end|>',
 '<|im_start|>blackburn<|im_end|>',
 '<|im_start|>k2<|im_end|>',
 '<|im_start|>kosciuszko<|im_end|>',
 '<|im_start|>aconcagua<|im_end|>',
 '<|im_start|>saint helena<|im_end|>',
 '<|im_start|>cook<|im_end|>',
 '<|im_start|>mckinley<|im_end|>',
 '<|im_start|>k2<|im_end|>',
 '<|im_start|>baker<|im_end|>',
 '<|im_start|>everest<|im_end|>',
 '<|im_start|>olympus<|im_end|>',
 '<|im_start|>kilimanjaro<|im_end|>',
 '<|im_start|>marcus baker<|im_end|>',
 '<|im_start|>huascarã¡n<|im_end|>',
 '<|im_start|>bogd khan uul<|im_end|>',
 '<|im_start|>vesuvius<|im_end|>',
 '<|im_start|>elbrus<|im_end|>',
 '<

In [14]:
print(enc_Y[0])
print(enc_Y2[0])

[16966, 296, 377, 258, 3258]
[100264, 76, 377, 258, 3258, 100265]


In [15]:
print(enc_X[0])
print(enc_X2[0])

[56950, 90256, 30633, 35687, 48501, 11822, 42211, 33889, 3221, 6606, 296, 377, 258, 3258, 13]
[100264, 56950, 90256, 30633, 35687, 48501, 11822, 42211, 33889, 3221, 6606, 296, 377, 258, 3258, 13, 100265]


In [16]:
def adjust_lists(lst, max_len):
    if len(lst) < max_len:
        lst.extend([0] * (max_len - len(lst)))
    return lst

In [17]:
fdata = [adjust_lists(lst, 105) for lst in enc_X2] 
flabels = [adjust_lists(lst, 106) for lst in enc_Y2]

In [18]:
lens_X = [len(l) for l in enc_X]
lens_Y = [len(l) for l in enc_Y]

In [19]:
print(max(lens_X))
print(max(lens_Y))

101
19


In [20]:
X_train, X_test, y_train, y_test = train_test_split(fdata, 
                                                    flabels, 
                                                    test_size=0.2,
                                                    random_state=42)

# Model

In [21]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=469):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]`` no
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [22]:
class TokenEmbedding(nn.Module):
    def __init__(self, num_vocab, maxlen=150, embedding_dim=16, dropout_rate=0.1):
        super(TokenEmbedding, self).__init__()
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(num_vocab, embedding_dim)
        self.pos_emb = PositionalEncoding(embedding_dim, dropout=dropout_rate, max_len=maxlen)
    def forward(self, inputs):
        x = self.emb(inputs)
        x = x * math.sqrt(self.embedding_dim) #!!!! 
        x = self.pos_emb(x)
        return x

In [23]:
def FullyConnected(embedding_dim, fully_connected_dim):
    return nn.Sequential(
        nn.Linear(embedding_dim, fully_connected_dim),
        nn.ReLU(),
        nn.Linear(fully_connected_dim, embedding_dim)
        )

In [24]:
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(EncoderLayer, self).__init__()
        self.mha = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)
        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)

    def forward(self, inputs, mask):

        self_mha_output, _ = self.mha(inputs, inputs, inputs, key_padding_mask=mask) # !!! mask

        skip_attention = self.norm1(inputs + self_mha_output)

        ffn_output = self.ffn(skip_attention)

        ffn_output = self.dropout_ffn(ffn_output)

        encoder_layer_out = self.norm2(skip_attention + ffn_output)

        return encoder_layer_out

In [25]:
class Encoder(nn.Module):
    def __init__(self, num_layers, num_heads, fully_connected_dim, embedding_dim, max_len,
                 num_vocab, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Encoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.fully_connected_dim = fully_connected_dim

        self.pos_encoding = TokenEmbedding(num_vocab, max_len, embedding_dim, dropout_rate)
        self.enc_layers = nn.ModuleList([EncoderLayer(embedding_dim=embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) for _ in range(self.num_layers)]
                                        )

        self.init_weights()

    def init_weights(self):
        for module in self.modules():
            if isinstance(module, (nn.Linear, nn.Conv1d)):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def forward(self, inputs):
        x = self.pos_encoding(inputs)
        src_padding_mask = (inputs == 0)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, src_padding_mask) #mask
        return x

In [26]:
class DecoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()
        
        self.mha1 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.mha2 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)
        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm3 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)
    
    def forward(self, inputs, enc_output, src_padding_mask): # look_ahead_mask !!!!
        
        seq_len = inputs.size(1)
        ahead_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1).to(device)
        #ahead_mask = self.create_look_ahead_mask2(seq_len)
        
        self_mha1_output, _ = self.mha1(inputs, inputs, inputs, attn_mask=ahead_mask, key_padding_mask=src_padding_mask)#, key_padding_mask=src_padding_mask) # look_ahead_mask !!!! batch (pad)
        Q1 = self.norm1(self_mha1_output + inputs)
        
        self_mha2_output, _ = self.mha2(query=Q1, key=enc_output, value=enc_output)#, key_padding_mask=src_padding_mask) # pad mask  ???
        skip_attention2 = self.norm2(self_mha2_output + Q1)
        
        ffn_output = self.ffn(skip_attention2)
        drop_output = self.dropout_ffn(ffn_output)
        skip3 = self.norm3(drop_output + skip_attention2)
        
        return skip3

In [27]:
class Decoder(nn.Module):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim,  #target_vocab_size, maximum_position_encoding,
                 num_vocab=35, maxlen=400, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Decoder, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.token_emb = TokenEmbedding(num_vocab=num_vocab, maxlen=maxlen, embedding_dim=embedding_dim) # num_vocab=34, maxlen=400, embedding_dim=64
        
        self.dec_layers = nn.ModuleList([DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) for _ in range(self.num_layers)])
        
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, inputs, enc_output):
        
        src_padding_mask = (inputs == 0) 
        x = self.token_emb(inputs) # torch.Size([Batch, 400, 64])
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output, src_padding_mask) # torch.Size([Batch, 400, 64])
        
        return x

In [28]:
class Transformer(nn.Module):
    def __init__(self, num_layers_encoder, num_layers_decoder, num_heads_encoder, num_heads_decoder,
                 fully_connected_dim_encoder, fully_connected_dim_decoder, embedding_dim_encoder,
                 embedding_dim_decoder, max_len_enc, max_len_dec, num_vocab, # мб same last 2
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()
        
        self.encoder = Encoder(num_layers=num_layers_encoder,
                               num_heads=num_heads_encoder,
                               fully_connected_dim=fully_connected_dim_encoder,
                               embedding_dim=embedding_dim_encoder,
                               max_len=max_len_enc, # 116
                               num_vocab=num_vocab,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps) # torch.Size([1, 400, 64])
        """num_class, num_layers, num_heads, fully_connected_dim, embedding_dim, max_len,
                 num_vocab, dropout_rate=0.1, layernorm_eps=1e-6)"""
        
        self.decoder = Decoder(num_layers=num_layers_decoder, 
                               embedding_dim=embedding_dim_decoder,
                               num_heads=num_heads_decoder,
                               fully_connected_dim=fully_connected_dim_decoder,
                               num_vocab=num_vocab, # num_vocab=35
                               maxlen=max_len_dec,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)
        """num_layers, embedding_dim, num_heads, fully_connected_dim,
                 num_vocab=34, maxlen=400, dropout_rate=0.1, layernorm_eps=1e-6"""
        
        self.linear = nn.Linear(embedding_dim_decoder, num_vocab)
        
    def forward(self, enc_input, output_vect_str):
        
        enc_output = self.encoder(enc_input)
        
        dec_output = self.decoder(output_vect_str, enc_output)  # torch.Size([Batch, 400, 64])
        
        final_output = self.linear(dec_output)
        
        return final_output  # [Batch, 400, 35]

# Func Train

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
def train_step(model, loss_fn, opt, loader):
    loss_per_batches = 0
    elapsed = 0
    start_epoch2 = time.time()
    for i, data in enumerate(loader):

        start_epoch = time.time()
        features, labels = data
        features, labels = features.to(device), labels.to(device)
        opt.zero_grad()
        
        dec_input = labels[:, :-1]
        dec_target = labels[:, 1:]
        
        y_pred = model(features, dec_input)
        
        #one_hot = nn.functional.one_hot(labels, 35).type(torch.float)
        #indices = torch.nonzero(torch.eq(labels, 0))[0].item()
        #print(str(labels) + "y_pred")
        #print(labels.shape)
        
        loss = loss_fn(y_pred.view(-1, y_pred.size(-1)), dec_target.contiguous().view(-1))
        loss.backward()
        
        opt.step()
        
        loss_per_batches += loss
        
        end_epoch = time.time()
        elapsed += (end_epoch - start_epoch)
        
    print("train = " + str(elapsed))
    print("train + load = " + str(time.time() - start_epoch2))
    return loss_per_batches/(i+1)

In [31]:
def train(model, loss_fn, opt, train_loader, val_loader, save_treshold=5, epochs=10, model_name='model_name'):
        
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/' + model_name + '_{}'.format(timestamp))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min', patience=3, verbose=True)
    
    for epoch in range(epochs):
        start_epoch = time.time()
        print('EPOCH {}:'.format(epoch + 1))
        
        model.train()
        avg_loss = train_step(model, loss_fn, opt, train_loader)
        model.eval()
        
        vloss = 0
        counter = 0
        with torch.inference_mode():
            for i, vdata in enumerate(val_loader):
                vfeatures, vlabels = vdata
                vfeatures, vlabels = vfeatures.to(device), vlabels.to(device)
                dec_input = vlabels[:, :-1]
                dec_target = vlabels[:, 1:]

                y_pred = model(vfeatures, dec_input)

                vloss += loss_fn(y_pred.view(-1, y_pred.size(-1)), dec_target.contiguous().view(-1))
                counter = i

        avg_vloss = vloss / (counter + 1)
        
        scheduler.step(avg_loss)
        
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
        
        writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch + 1)
        
        if (epoch + 1) % save_treshold == 0:
            model_path = model_name +'_{}_{}'.format(timestamp, epoch)
            torch.save(model.state_dict(), model_path)
        end_epoch = time.time()
        elapsed = end_epoch - start_epoch
        print("Time per epoch {}s".format(elapsed))

# DataLoad

In [32]:
dataset = CustomDataset(X_train, y_train)
vdataset = CustomDataset(X_test, y_test)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=12, num_workers=2, shuffle=True) # + num thread num_workers=6,
vdataloader = torch.utils.data.DataLoader(vdataset, batch_size=12, num_workers=2, shuffle=True)

# Create model

In [33]:
model = Transformer(6, 3, 16, 8, 
                1024, 1024, 64,
                64, 105, 105, enc2.n_vocab,
                dropout_rate=0.25)

"""num_layers_encoder, num_layers_decoder, num_heads_encoder, num_heads_decoder,
                 fully_connected_dim_encoder, fully_connected_dim_decoder, embedding_dim_encoder,
                 embedding_dim_decoder, max_len_enc, max_len_dec, num_vocab,"""

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
model.to(device)

Transformer(
  (encoder): Encoder(
    (pos_encoding): TokenEmbedding(
      (emb): Embedding(100277, 64)
      (pos_emb): PositionalEncoding(
        (dropout): Dropout(p=0.25, inplace=False)
      )
    )
    (enc_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (mha): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=64, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=64, bias=True)
        )
        (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
        (dropout_ffn): Dropout(p=0.25, inplace=False)
      )
    )
  )
  (decoder): Decoder(
    (token_emb): TokenEmbedding(
      (emb): Embedding(100277, 64)
      (pos_emb): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      

In [34]:
train(model, loss_fn, optimizer, dataloader, vdataloader , 50, epochs=100, model_name='first_try')

EPOCH 1:
train = 3.7760729789733887
train + load = 8.15526008605957


  return torch._native_multi_head_attention(


LOSS train 10.457783699035645 valid 9.595171928405762
Time per epoch 11.038755893707275s
EPOCH 2:
train = 1.803419828414917
train + load = 5.984614849090576
LOSS train 8.828719139099121 valid 7.853949069976807
Time per epoch 8.743976354598999s
EPOCH 3:
train = 1.8866515159606934
train + load = 6.078395128250122
LOSS train 7.069869041442871 valid 6.61879301071167
Time per epoch 8.816003322601318s
EPOCH 4:
train = 1.8084006309509277
train + load = 5.917388200759888
LOSS train 5.77244234085083 valid 5.617888450622559
Time per epoch 8.640006065368652s
EPOCH 5:
train = 1.9077973365783691
train + load = 6.318501949310303
LOSS train 5.062497615814209 valid 5.488475322723389
Time per epoch 9.50612187385559s
EPOCH 6:
train = 1.939434289932251
train + load = 7.367485046386719
LOSS train 4.681278705596924 valid 4.748739719390869
Time per epoch 10.268407344818115s
EPOCH 7:
train = 1.8643231391906738
train + load = 6.285733699798584
LOSS train 4.413488864898682 valid 4.870190620422363
Time per epoc

KeyboardInterrupt: 