# Tokenizer

BPE (byte-pair encoding) keeps the most common words in the vocabulary as single tokens, while rare words are broken down into two or more subword tokens. This is exactly the behaviour expected of a subword-based tokenization algorithm.

In [1]:
from pathlib import Path

In [2]:
paths = [str(x) for x in Path('./train_data').glob('*.txt')]

In [3]:
paths

['train_data\\legislativa0_train.txt',
 'train_data\\legislativa1_test.txt',
 'train_data\\legislativa1_train.txt',
 'train_data\\legislativa2_train.txt',
 'train_data\\legislativa3_train.txt',
 'train_data\\legislativa4_train.txt',
 'train_data\\legislativa5_train.txt',
 'train_data\\legislativa6_train.txt']

## Input Pipeline and Training

In [9]:
import os
def divide_files(input_folder, output_folder, chunk_size):
    """Split every ``.txt`` file in ``input_folder`` into fixed-size character chunks.

    Each input file is read as UTF-8, its newline characters are removed, and
    the remaining text is cut into consecutive pieces of at most ``chunk_size``
    characters. Chunk ``k`` (1-based) of ``<name>.txt`` is written to
    ``<output_folder>/<name>_<k>.txt``. Including the input file's base name
    keeps chunks from different input files from overwriting each other.
    An empty input file produces no output files.

    Args:
        input_folder: Directory containing the source ``.txt`` files.
        output_folder: Directory for the chunk files; created if missing.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # The listing is taken once and sorted, so processing order is deterministic
    # and files written during this call are never picked up as inputs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue

        input_file = os.path.join(input_folder, filename)
        output_file_base = os.path.splitext(filename)[0]

        with open(input_file, "r", encoding="utf-8") as f:
            # NOTE(review): removing "\n" joins the last word of a line directly
            # to the first word of the next one - confirm this is intended
            # (replacing with a space would preserve word boundaries).
            content = f.read().replace("\n", "")

        # Write consecutive chunk_size-character slices, numbered from 1
        for chunk_no, start in enumerate(range(0, len(content), chunk_size), start=1):
            output_file = os.path.join(output_folder, f"{output_file_base}_{chunk_no}.txt")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(content[start:start + chunk_size])


# Example usage: cut every raw text file into pieces of at most
# 1,000,000 characters and store them in the training-data folder.
chunk_size = 1_000_000
input_folder, output_folder = "./raw_data", "./train_data"

divide_files(
    input_folder=input_folder,
    output_folder=output_folder,
    chunk_size=chunk_size,
)

We need 3 tensors:
The labels tensor holds the original token ids (the vocabulary id of each token). A copy of it is passed through the masked-language-modelling (MLM) function below to produce the model's input ids.

input_ids — our token_ids with ~15% of tokens masked using the mask token <mask>. We are not masking special tokens
attention_mask — a tensor of 1s and 0s, marking the position of ‘real’ tokens/padding tokens — used in attention calculations.
labels — our token_ids with no masking.

In [4]:
import torch

def mlm(tensor, mask_prob=0.15, mask_token_id=4, special_token_ids=(0, 1, 2)):
    """Randomly overwrite token ids with the mask token id (masked language modelling).

    Every position is selected independently with probability ``mask_prob``.
    Positions whose id is in ``special_token_ids`` are never selected. Selected
    positions are overwritten with ``mask_token_id``.

    The defaults reproduce the values that were previously hard-coded.
    NOTE(review): presumably 0/1/2 are the <s>/<pad>/</s> ids and 4 is <mask>
    for this tokenizer - verify against ``tokenizer.mask_token_id`` and
    ``tokenizer.all_special_ids``.

    Args:
        tensor: Integer tensor of token ids, of any shape.
        mask_prob: Probability of masking each non-special position.
        mask_token_id: Id written into the selected positions.
        special_token_ids: Ids that must never be masked.

    Returns:
        The same tensor object, modified in place. Pass a clone if the
        unmasked ids are still needed.
    """
    # Uniform samples in [0, 1), created on the input's device so the
    # comparison below also works for tensors that are not on the CPU.
    rand = torch.rand(tensor.shape, device=tensor.device)
    mask_arr = rand < mask_prob
    for special_id in special_token_ids:
        mask_arr &= tensor != special_id

    # Boolean-mask assignment replaces the per-row Python loop.
    tensor[mask_arr] = mask_token_id
    return tensor

In [5]:
from transformers import RobertaTokenizerFast
# Load the project tokenizer from a local directory.
# NOTE(review): the warning printed by this cell reports that the checkpoint's
# tokenizer class is 'BertTokenizer' while it is loaded here through
# RobertaTokenizerFast - confirm the saved tokenizer config is the intended one,
# since the warning says this may result in unexpected tokenization.
tokenizer_srberta = RobertaTokenizerFast.from_pretrained("../SRBerta-pretrain/srberta_tokenizer")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [46]:
from tqdm.auto import tqdm
import os

# Sorted so the sample order is reproducible across runs and platforms.
# NOTE(review): the directory listing printed earlier in this notebook contains
# 'legislativa1_test.txt'; this glob includes it in the training tensors -
# confirm that is intended.
paths = sorted(str(x) for x in Path('./train_data').glob('*.txt'))

input_ids = []  # per-file tensors of token ids after MLM masking
mask = []       # per-file attention masks
labels = []     # per-file tensors of the original (unmasked) token ids

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as f:
        # One sample per non-blank line. Blank lines (e.g. the empty string
        # left after a trailing newline) are dropped so they do not become
        # samples that carry no text.
        lines = [line for line in f.read().split('\n') if line.strip()]

    if not lines:
        continue  # nothing to tokenize in this file

    # Every line is padded or truncated to exactly 512 tokens.
    sample = tokenizer_srberta(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    # mlm() modifies its argument in place, so it is given a copy to keep labels intact.
    input_ids.append(mlm(sample.input_ids.detach().clone()))

  0%|          | 0/8 [00:00<?, ?it/s]

In [47]:
sample['input_ids'].shape

torch.Size([7854, 512])

In [48]:
len(input_ids)

8

This means that each line of each text file becomes one row of the tensor, and no matter how long that line is, it is truncated (or padded) to exactly 512 tokens!

### Concatenate into single tensors (if enough RAM)

In [2]:
# Concatenate the per-file tensors along the sample dimension.
# Guarded so that re-running this cell after the concatenation is a no-op.
if isinstance(input_ids, list):
    input_ids = torch.cat(input_ids)

NameError: name 'torch' is not defined

In [50]:
# Concatenate the per-file attention masks along the sample dimension.
# Guarded so that re-running this cell after the concatenation is a no-op.
if isinstance(mask, list):
    mask = torch.cat(mask)

In [51]:
# Concatenate the per-file label tensors along the sample dimension.
# Guarded so that re-running this cell after the concatenation is a no-op.
if isinstance(labels, list):
    labels = torch.cat(labels)

In [52]:
len(labels)

41833

In [53]:
labels

tensor([[    0,  1689,  9107,  ...,    16,   342,     2],
        [    0,   456,  2296,  ...,   289,  9098,     2],
        [    0,  3337,  1416,  ...,   955, 10053,     2],
        ...,
        [    0,   331, 11736,  ...,   551,  7076,     2],
        [    0, 16932,   263,  ...,    18,  1289,     2],
        [    0,   310,  1654,  ..., 16114,   510,     2]])

## Save tensors to disk

In [72]:
import torch

In [55]:
torch.save(input_ids, './train_data/input_ids.pt')

In [56]:
torch.save(mask, './train_data/mask.pt')

In [57]:
torch.save(labels, './train_data/labels.pt')

### Load tensor data

In [6]:
import torch
input_ids = torch.load("./train_data/input_ids.pt")
mask = torch.load("./train_data/mask.pt")
labels = torch.load("./train_data/labels.pt")

### Test masked language modeling

In [74]:
input_ids[0][:20]

In [75]:
labels[0][:20]

## Dataset and dataloader

In [7]:
# Bundle the three aligned tensors under the keys the training loop reads
# from each batch.
encodings = dict(input_ids=input_ids, attention_mask=mask, labels=labels)

In [8]:
# Create Dataset object
import torch
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of tensors that share their first dimension.

    ``encodings`` maps a field name to a tensor. Item ``i`` is a dict holding
    row ``i`` of every tensor, under the same keys.
    """

    def __init__(self, encodings):
        # Keep a reference only; the tensors are not copied.
        self.encodings = encodings

    def __len__(self):
        # The number of samples is the leading dimension of 'input_ids'.
        ids = self.encodings['input_ids']
        return ids.shape[0]

    def __getitem__(self, i):
        item = {}
        for key, tensor in self.encodings.items():
            item[key] = tensor[i]
        return item

In [9]:
dataset = Dataset(encodings)

In [10]:
BATCH_SIZE = 12
DO_SHUFFLE = True
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=DO_SHUFFLE)

In [11]:
print(len(dataloader.dataset))

41833


# Training

In [12]:
# To monitor training: tensorboard --logdir=./runs_fine_tunning_6
import torch
from torch.utils.tensorboard import SummaryWriter
# Scalars logged through `writer` are written under this directory.
writer = SummaryWriter("./runs_fine_tunning_6")

## Load trained model and optimizer checkpoint if needed

In [11]:
from transformers import RobertaForMaskedLM

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# torch.device has no `.cpu()` attribute, so the CPU device must be built
# with torch.device('cpu'); the previous fallback raised AttributeError.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(str(device))

cuda


In [13]:
model = RobertaForMaskedLM.from_pretrained("../SRBerta-pretrain/srberta_model_24")
model.to(device)
model.train()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [83]:
from transformers import AdamW
# Restore the optimizer state that was saved alongside the pre-trained model.
# NOTE(review): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
checkpoint = torch.load('../SRBerta-pretrain/24_optimizer.pt',map_location=device)
optim = AdamW(params=model.parameters())
print(optim)  # optimizer settings before the saved state is loaded
optim.load_state_dict(checkpoint['optimizer_state_dict'])
print(optim)  # optimizer settings after the saved state is loaded
# NOTE(review): the training-loop cell further down constructs a new AdamW and
# rebinds `optim`, so the state restored here is not used by that loop as written.

## Train loop

In [17]:
from tqdm.auto import tqdm
from transformers import RobertaForMaskedLM

# Checkpoint numbers available from pre-training; only checkpoint 24 is
# fine-tuned here.
models = [5,10,14,18,24]

# Start every run of this cell from the pre-trained checkpoint.
model = RobertaForMaskedLM.from_pretrained("../SRBerta-pretrain/srberta_model_24")

torch.cuda.empty_cache()
# torch.device has no `.cpu()` attribute; build the CPU fallback explicitly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set to the values transformers.AdamW used by default
# (1e-6 and 0.0) so the update rule stays the same.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

step=0          # global step counter across epochs, used as the TensorBoard x-axis
num_epochs=5

for epoch in range(num_epochs):
    loop = tqdm(dataloader, leave=True)

    for batch in loop:

        optim.zero_grad()

        # batch_* names keep the notebook-level `input_ids`, `mask` and
        # `labels` tensors from being overwritten by a single batch.
        batch_input_ids = batch['input_ids'].to(device)
        batch_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        # NOTE(review): the labels hold the full unmasked ids, so the loss is
        # computed at every position, padding included. Hugging Face ignores
        # positions whose label is -100 - consider setting the labels of
        # padding (and possibly unmasked) positions to -100.
        outputs = model(batch_input_ids, attention_mask=batch_mask, labels=batch_labels)
        loss = outputs.loss

        loss.backward()
        optim.step()

        # Convert once to a Python float so neither the progress bar nor
        # TensorBoard receives a graph-attached tensor.
        loss_value = loss.item()

        loop.set_description(f'Epoch: {epoch}')
        loop.set_postfix(loss=loss_value)

        writer.add_scalar("model24_test/step", loss_value, step)
        writer.flush()

        step+=1

    # Save after each epoch
    model.save_pretrained(f"./fine_tuned/model_24_epoch_{epoch}")
