# Dmitriev Egor
e.dmitriev@innopolis.university | BS20-RO

In [1]:
%pip install transformers
%pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable


Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = T5Config(vocab_size=250112, num_layers=8, num_heads=6)
tokenizer = T5Tokenizer.from_pretrained("google/t5-small-ssm")
model = T5ForConditionalGeneration.from_pretrained("google/t5-small-ssm")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
df = pd.read_csv('../data/interim/preprocessed.tsv', sep='\t', index_col=0)
df = df.iloc[:10000] # limit dataset
df['reference'].values.shape

(10000,)

In [6]:
encodings = tokenizer(df['reference'].values.tolist(),
                    text_target=df['translation'].values.tolist(),
                    max_length=32,  # Specify the maximum sequence length
                    padding="max_length",  # Pad the input to the specified maximum length
                    return_tensors="pt",  # Return PyTorch tensors
                    truncation=True  # Truncate the input if it exceeds the maximum length
                    )

In [7]:
print(encodings['input_ids'][0, :10])
print(encodings['attention_mask'][0, :10])
print(encodings['labels'][0, :10])

tensor([ 168,   62,   54, 8179,   39,  280,    1,    0,    0,    0])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
tensor([ 168,   62,  228, 8179,   39,  280,   21,   80,    1,    0])


In [8]:
import torch

pt_encodings = torch.stack([encodings[key] for key in encodings.keys()]).swapaxes(0,1)
pt_encodings.shape

torch.Size([10000, 3, 32])

In [9]:
import numpy as np
split = [0.8, 0.1, 0.1]
total_samples = len(encodings.input_ids)
train_split = int(split[0] * total_samples)
test_split = int((split[0] + split[1]) * total_samples)
print(total_samples, train_split, test_split)
print(encodings.keys())


train_data = pt_encodings[:train_split]
test_data = pt_encodings[train_split:test_split]
validation_data = pt_encodings[test_split:]

print(f'{train_data.shape=}')
print(f'{test_data.shape=}')
print(f'{validation_data.shape=}')
train_data.shape

10000 8000 9000
dict_keys(['input_ids', 'attention_mask', 'labels'])
train_data.shape=torch.Size([8000, 3, 32])
test_data.shape=torch.Size([1000, 3, 32])
validation_data.shape=torch.Size([1000, 3, 32])


torch.Size([8000, 3, 32])

In [10]:
from dataclasses import dataclass
@dataclass
class T2TDataCollator:
    def __call__(self, batch):
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([unit[0] for unit in batch])
        input_attention_mask = torch.stack([unit[2] for unit in batch])
        output_ids = torch.stack([unit[1] for unit in batch])
        output_ids[output_ids[:, :] == tokenizer.pad_token_id] = -100   
        output_attention_mask = output_ids.apply_(lambda x : 0 if x == -100 else 1)
        ret = torch.stack([input_ids, input_attention_mask, output_ids, output_attention_mask])
        # print(ret.isnan().any())
        return ret

In [11]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=0.001)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
model.to(device)

batch_size = 16

In [12]:
from torch.utils.data import DataLoader
data_collator = T2TDataCollator()
train_dataloader = DataLoader(
    train_data, shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
val_dataloader = DataLoader(
    validation_data, batch_size=batch_size, collate_fn=data_collator
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, collate_fn=data_collator
)
# next(iter(test_dataloader)).shape

In [13]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [14]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def train(train_dataloader, n_epochs, learning_rate=0.001, print_every=100):
    start = time.time()
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    for epoch in range(n_epochs):
        for batch in train_dataloader:
            batch.to(device)
            model.train()
            optimizer.zero_grad()
            outputs = model(
                input_ids = batch[0].to(device),
                attention_mask = batch[1].to(device),
                labels = batch[2].to(device),
                # decoder_attention_mask = batch[3].to(device),
        )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(loss)
            print_loss_total += loss
        print_loss_total /= len(train_dataloader)

        if (epoch+1) % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, (epoch+1) / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

In [15]:
train(train_dataloader, 50, print_every=1)

tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)
tensor(nan, grad_fn=<NllLossBackward0>)


: 