In [1]:
import custom_dataset
from custom_dataset import CustomDataset
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import time
from datetime import datetime
import tqdm
import math
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
import pickle
import tiktoken

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
def get_pkl(file_path):
    """Read one pickled object from ``file_path`` and return it.

    Args:
        file_path: path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only use this on trusted files.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

In [4]:
# Load the pickled labels and the pickled model inputs from the working directory.
labels = get_pkl(file_path='list_labels.pkl')
x = get_pkl(file_path='list_inputs.pkl')

In [5]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, labels, test_size=0.2, random_state=42
)

In [6]:
class PositionalEncoding(nn.Module):
    """Add precomputed sine/cosine position encodings to a sequence, then apply dropout.

    The encoding table has shape ``[1, max_len, d_model]`` and is stored as a
    non-persistent buffer, so it follows the module across devices but is not
    written to the ``state_dict``.

    Args:
        d_model: size of the embedding (last) dimension; may be even or odd.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=150):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[0, :, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x`` with position encodings added and dropout applied.

        Raises:
            ValueError: if ``seq_len`` exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                "sequence length {} exceeds max_len {} of the positional encoding".format(
                    seq_len, self.pe.size(1)
                )
            )
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

In [7]:
class TokenEmbedding(nn.Module):
    """Look up token embeddings, scale them, and add positional encodings.

    Args:
        num_vocab: number of rows in the embedding table.
        maxlen: forwarded to ``PositionalEncoding`` as ``max_len``.
        embedding_dim: width of each embedding vector.
        dropout_rate: dropout probability used inside ``PositionalEncoding``.
    """

    def __init__(self, num_vocab, maxlen=150, embedding_dim=16, dropout_rate=0.1):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(num_vocab, embedding_dim)
        self.pos_emb = PositionalEncoding(embedding_dim, dropout=dropout_rate, max_len=maxlen)

    def forward(self, inputs):
        """Return position-encoded embeddings for the token ids in ``inputs``."""
        # Embeddings are multiplied by sqrt(embedding_dim) before positions are added.
        scale = math.sqrt(self.embedding_dim)
        return self.pos_emb(self.emb(inputs) * scale)

In [8]:
class EncoderLayer(nn.Module):
    """Encoder block: multi-head self-attention followed by a convolutional feed-forward stage.

    Each of the two sub-layers is added back to its input (residual connection)
    and passed through its own ``LayerNorm``.

    Args:
        embedding_dim: feature size of the incoming and outgoing sequence.
        num_heads: number of attention heads.
        fully_connected_dim: channel count between the two convolutions.
        dropout_rate: dropout used inside attention and after the feed-forward stage.
        layernorm_eps: epsilon of both ``LayerNorm`` modules.
    """

    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=num_heads,
            dropout=dropout_rate,
            batch_first=True,
        )

        # Feed-forward stage: two kernel-size-3 convolutions with batch norm.
        # (An earlier Linear -> ReLU -> Linear variant was replaced by this stack.)
        feed_forward_stack = [
            nn.Conv1d(embedding_dim, fully_connected_dim, 3, padding='same'),
            nn.BatchNorm1d(fully_connected_dim),
            nn.ReLU(),
            nn.Conv1d(fully_connected_dim, embedding_dim, 3, padding='same'),
            nn.BatchNorm1d(embedding_dim),
        ]
        self.ffn = nn.Sequential(*feed_forward_stack)

        self.norm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.norm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)

    def forward(self, inputs, mask):
        """Apply the block to ``inputs``.

        Args:
            inputs: batch-first sequence tensor (attention is built with ``batch_first=True``).
            mask: forwarded unchanged to the attention module as ``key_padding_mask``.

        Returns:
            Tensor with the same layout as ``inputs``.
        """
        attended, _ = self.mha(inputs, inputs, inputs, key_padding_mask=mask)
        after_attention = self.norm1(inputs + attended)

        # Conv1d/BatchNorm1d take channels on dim 1, so swap the last two axes
        # before the feed-forward stack and swap them back afterwards.
        channels_first = after_attention.permute(0, 2, 1)
        feed_forward = self.ffn(channels_first).permute(0, 2, 1)
        feed_forward = self.dropout_ffn(feed_forward)

        return self.norm2(after_attention + feed_forward)

In [9]:
class Tr(nn.Module):
    """Transformer-encoder classifier over fixed-length token sequences.

    Pipeline: token + positional embedding -> ``num_layers`` encoder layers ->
    1x1 convolution that reduces the embedding axis to a single channel ->
    two-layer MLP over the sequence axis that produces ``num_class`` logits.

    Because the MLP's first layer is ``Linear(max_len, max_len)``, every input
    sequence must have exactly ``max_len`` tokens.

    Args:
        num_class: number of output logits.
        num_layers: number of stacked ``EncoderLayer`` blocks.
        num_heads: attention heads per encoder layer.
        fully_connected_dim: hidden channel count inside each encoder layer.
        embedding_dim: width of the token embeddings.
        max_len: sequence length the model is built for.
        num_vocab: size of the token vocabulary.
        dropout_rate: dropout probability passed to the embedding and encoder layers.
        layernorm_eps: epsilon passed to the encoder layers' ``LayerNorm`` modules.
    """

    def __init__(self, num_class, num_layers, num_heads, fully_connected_dim, embedding_dim, max_len,
                 num_vocab, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.fully_connected_dim = fully_connected_dim

        self.pos_encoding = TokenEmbedding(num_vocab, max_len, embedding_dim, dropout_rate)

        self.enc_layers = nn.ModuleList([EncoderLayer(embedding_dim=embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) for _ in range(self.num_layers)]
                                        )

        # 1x1 convolution: mixes the embedding channels of each position into one value.
        self.conv = nn.Conv1d(self.embedding_dim, 1, 1)

        self.linear = nn.Sequential(
            nn.Linear(max_len, max_len),
            nn.ReLU(),
            nn.Linear(max_len, num_class)
            )

        self.init_weights()

    def init_weights(self):
        """Xavier-initialise every ``Linear``/``Conv1d`` weight in the model and zero its bias."""
        for module in self.modules():
            if isinstance(module, (nn.Linear, nn.Conv1d)):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def forward(self, inputs, mask):
        """Compute class logits.

        Args:
            inputs: token-id tensor of shape ``[batch_size, max_len]``.
            mask: passed to every encoder layer (used there as the key padding mask).

        Returns:
            Logits of shape ``[batch_size, num_class]``, including when ``batch_size == 1``.
        """
        x = self.pos_encoding(inputs)

        for layer in self.enc_layers:
            x = layer(x, mask)

        x = self.conv(x.permute(0, 2, 1))
        # Drop only the single conv channel (dim 1). A bare squeeze() would also
        # drop the batch axis for a batch of one and break the loss computation.
        out = self.linear(x.squeeze(1))

        return out

In [10]:
def train_step(model, loss_fn, opt, loader, PAD_IDX):
    """Run one optimisation pass over ``loader`` and report mean loss and accuracy.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        model: callable as ``model(features, padding_mask)`` returning class logits.
        loss_fn: callable as ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        opt: optimizer whose ``zero_grad``/``step`` are called once per batch.
        loader: iterable of ``(features, labels)`` tensor pairs.
        PAD_IDX: token id treated as padding; positions equal to it form the mask.

    Returns:
        Tuple ``(mean_loss, accuracy_percent)``, both 0-dim tensors. The loss is
        detached from the autograd graph.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    loss_sum = 0
    elapsed = 0
    wall_start = time.time()
    correct = 0
    seen = 0
    num_batches = 0
    for features, labels in loader:
        batch_start = time.time()
        features, labels = features.to(device), labels.to(device)
        opt.zero_grad()

        seen += labels.shape[0]

        padding_mask = (features == PAD_IDX)
        y_pred = model(features, padding_mask)
        loss = loss_fn(y_pred, labels)
        # softmax is monotonic, so the argmax of the raw logits is the predicted class.
        correct += (labels == y_pred.argmax(dim=-1)).sum()

        loss.backward()

        opt.step()

        # Accumulate a detached copy: summing the live loss would keep every
        # batch's autograd graph in memory until the end of the epoch.
        loss_sum += loss.detach()
        num_batches += 1

        elapsed += (time.time() - batch_start)

    if num_batches == 0:
        raise ValueError("train_step received an empty loader")

    print("train = " + str(elapsed))
    print("train + load = " + str(time.time() - wall_start))
    return loss_sum / num_batches, correct / seen * 100

In [11]:
def train(model, loss_fn, opt, train_loader, val_loader, PAD_IDX, epochs=10, save_treshold=100, model_name='model_name'):
    """Train ``model`` for ``epochs`` epochs, validating and logging after each one.

    Per epoch: one ``train_step`` pass over ``train_loader``, one evaluation pass
    over ``val_loader`` under ``torch.inference_mode()``, console output of
    loss/accuracy/timing, and a TensorBoard entry under ``runs/<model_name>_<timestamp>``.
    Batches are moved to the module-level ``device``. The model is left in eval
    mode when the function returns.

    Args:
        model: callable as ``model(features, padding_mask)`` returning class logits.
        loss_fn: callable as ``loss_fn(logits, labels)``.
        opt: optimizer handed to ``train_step``.
        train_loader: iterable of ``(features, labels)`` training batches.
        val_loader: iterable of ``(features, labels)`` validation batches; if it is
            empty the validation loss and accuracy are reported as NaN.
        PAD_IDX: token id treated as padding when building the mask.
        epochs: number of epochs to run.
        save_treshold: the model ``state_dict`` is saved every this many epochs.
        model_name: prefix of the TensorBoard run directory and checkpoint files.

    Returns:
        None.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/' + model_name + '_{}'.format(timestamp))

    try:
        for epoch in range(epochs):
            start_epoch = time.time()
            print('EPOCH {}:'.format(epoch + 1))

            model.train()
            avg_loss, acc = train_step(model, loss_fn, opt, train_loader, PAD_IDX)
            model.eval()

            vloss = 0
            val_batches = 0
            correct = 0
            seen = 0

            with torch.inference_mode():
                for vfeatures, vlabels in val_loader:
                    vfeatures, vlabels = vfeatures.to(device), vlabels.to(device)

                    seen += vlabels.shape[0]

                    src_padding_mask = (vfeatures == PAD_IDX)

                    y_pred = model(vfeatures, src_padding_mask)

                    # softmax is monotonic, so argmax over the logits gives the same class.
                    correct += (vlabels == y_pred.argmax(dim=-1)).sum()

                    vloss += loss_fn(y_pred.to(torch.float), vlabels)
                    val_batches += 1

            # Report NaN instead of dividing by zero when there is nothing to validate on.
            avg_vloss = vloss / val_batches if val_batches else float('nan')
            val_acc = correct / seen * 100 if seen else float('nan')

            print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
            print('ACC train {} valid {}'.format(acc, val_acc))
            writer.add_scalars('Training vs. Validation Loss',
                        { 'Training' : avg_loss, 'Validation' : avg_vloss },
                        epoch + 1)

            if (epoch + 1) % save_treshold == 0:
                # NOTE: the file name carries the zero-based epoch index, one less
                # than the epoch number printed above; kept so existing paths still match.
                model_path = model_name +'_{}_{}'.format(timestamp, epoch)
                torch.save(model.state_dict(), model_path)
            end_epoch = time.time()
            elapsed = end_epoch - start_epoch
            print("Time per epoch {}s".format(elapsed))
    finally:
        # Flush pending events and release the log file even if training is interrupted.
        writer.close()

In [12]:
# Wrap both splits in CustomDataset, then batch them 32 at a time.
# Only the training loader shuffles; the validation loader keeps its order.
train_dataset, test_dataset = CustomDataset(X_train, y_train), CustomDataset(X_test, y_test)
train_loader, val_loader = (
    torch.utils.data.DataLoader(split, batch_size=32, shuffle=do_shuffle)
    for split, do_shuffle in ((train_dataset, True), (test_dataset, False))
)

In [13]:
# Load tiktoken's "cl100k_base" encoding; its ``n_vocab`` is passed to the model as the vocabulary size.
enc = tiktoken.get_encoding("cl100k_base")

In [28]:
# Two-class classifier: 2 encoder layers, 8 heads, 16-dim embeddings, sequences of 150 tokens.
model = Tr(
    num_class=2,
    num_layers=2,
    num_heads=8,
    fully_connected_dim=32,
    embedding_dim=16,
    max_len=150,
    num_vocab=enc.n_vocab,
    dropout_rate=0.3,
).to(device)

# Cross-entropy over the two logits, optimised with AdamW.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [23]:
# NOTE(review): run top to bottom, this replaces the lr=0.001 optimizer created in the
# previous cell with an lr=0.0001 one. The execution counts (this cell: 23, model cell: 28)
# suggest the recorded run did not use this optimizer; confirm which learning rate is intended.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [15]:
# Print the per-layer parameter table; the trailing semicolon keeps the
# call's return value from being echoed as the cell output.
summary(model);

Layer (type:depth-idx)                                       Param #
├─TokenEmbedding: 1-1                                        --
|    └─Embedding: 2-1                                        1,604,432
|    └─PositionalEncoding: 2-2                               --
|    |    └─Dropout: 3-1                                     --
├─ModuleList: 1-2                                            --
|    └─EncoderLayer: 2-3                                     --
|    |    └─MultiheadAttention: 3-2                          1,088
|    |    └─Sequential: 3-3                                  3,216
|    |    └─LayerNorm: 3-4                                   32
|    |    └─LayerNorm: 3-5                                   32
|    |    └─Dropout: 3-6                                     --
|    └─EncoderLayer: 2-4                                     --
|    |    └─MultiheadAttention: 3-7                          1,088
|    |    └─Sequential: 3-8                                  3,216
|    |    └─Laye

In [29]:
# Train for 300 epochs, treating token id 0 as padding when building the attention mask.
train(
    model,
    loss_fn,
    optimizer,
    train_loader,
    val_loader,
    PAD_IDX=0,
    epochs=300,
)

EPOCH 1:
train = 0.28005266189575195
train + load = 0.2870612144470215
LOSS train 0.9836729764938354 valid 0.7149606943130493
ACC train 54.187191009521484 valid 54.90196228027344
Time per epoch 0.3040635585784912s
EPOCH 2:
train = 0.09695577621459961
train + load = 0.10396170616149902
LOSS train 0.8779175281524658 valid 0.7470169067382812
ACC train 53.694580078125 valid 54.90196228027344
Time per epoch 0.1109619140625s
EPOCH 3:
train = 0.084014892578125
train + load = 0.09202861785888672
LOSS train 0.8149344325065613 valid 0.7514301538467407
ACC train 50.738914489746094 valid 52.94117736816406
Time per epoch 0.10003280639648438s
EPOCH 4:
train = 0.08599591255187988
train + load = 0.0899953842163086
LOSS train 0.8817670941352844 valid 0.7102750539779663
ACC train 44.82758712768555 valid 58.82353210449219
Time per epoch 0.0969991683959961s
EPOCH 5:
train = 0.08303165435791016
train + load = 0.08802652359008789
LOSS train 0.7835738658905029 valid 0.683515191078186
ACC train 55.66502380371

In [2]:
# Register the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [18]:
# Reload the TensorBoard extension in a kernel where it has already been loaded.
%reload_ext tensorboard

In [3]:
# Open TensorBoard on port 6004, reading the event files that train() writes under runs/.
%tensorboard --logdir runs/ --port=6004