In [None]:
import os
import re
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
import transformers
from transformers import PreTrainedTokenizerFast
from transformers import LlamaConfig, LlamaModel
from transformers import Dataset



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

# Pretrain

### 1. –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [3]:
novels_path ='../RussianNovels/corpus/'

In [4]:
texts = []

for file in os.listdir(novels_path):
    with open(novels_path + file, 'r', encoding='utf-8') as f:
        texts.append(f.read())

len(texts)

108

In [5]:
# –£–¥–∞–ª–µ–Ω–∏–µ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤
texts = list(set(texts))
len(texts)

107

In [6]:
# –£–¥–∞–ª–µ–Ω–∏–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π —Å –±—É–∫–≤–∞–º–∏ –Ω–µ –∏–∑ –∫–∏—Ä–∏–ª–ª–∏—Ü—ã
for i, text in enumerate(texts):
    sents_with_latin = re.findall(r'[^.!?]*[a-zA-Z]+[^.!?]*[.!?]', text)
    for sent in sents_with_latin:
        texts[i] = texts[i].replace(sent, '')

In [7]:
# –£–¥–∞–ª–µ–Ω–∏–µ –ø–æ–≤—Ç–æ—Ä—è—é—â–µ–π—Å—è –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏
for i, text in enumerate(texts):
    texts[i] = re.sub(r'([.,!?])\1+', r'\1', text)

In [8]:
# –î–µ–ª–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–æ–≤ –Ω–∞ —á–∞–Ω–∫–∏
texts_split = []

chunk_len = 1000 # –î–ª–∏–Ω–∞ —á–∞–Ω–∫–∞ (–≤ —Å–∏–º–≤–æ–ª–∞—Ö)

for text in texts:
    l = int(np.ceil(len(text)/chunk_len))
    texts_split.append(text[0:chunk_len])
    for i in range(1,l):
        texts_split.append(text[chunk_len*i:chunk_len*i+chunk_len])

len(texts_split)

43066

### 2. –°–æ–∑–¥–∞–Ω–∏–µ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞

In [9]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [10]:
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [11]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=3000, special_tokens=special_tokens)

In [12]:
tokenizer.train_from_iterator(texts_split, trainer=trainer)

In [13]:
encoding = tokenizer.encode("–ù—É –∫–∞–∫, —Ä–∞–±–æ—Ç–∞–µ—Ç?")
print(encoding.tokens)

['–Ω—É', '–∫–∞–∫', ',', '—Ä–∞–±–æ', '##—Ç–∞', '##–µ—Ç', '?']


In [14]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [15]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [16]:
encoding = tokenizer.encode("–ù—É –∫–∞–∫, —Ä–∞–±–æ—Ç–∞–µ—Ç?")
print(encoding.tokens)

['[CLS]', '–Ω—É', '–∫–∞–∫', ',', '—Ä–∞–±–æ', '##—Ç–∞', '##–µ—Ç', '?', '[SEP]']


In [17]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [18]:
tokenizer.decode(encoding.ids)

'–Ω—É –∫–∞–∫, —Ä–∞–±–æ—Ç–∞–µ—Ç?'

In [19]:
tokenizer.save("tokenizer.json")

In [20]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)

### 3. –°–æ–∑–¥–∞–Ω–∏–µ Dataset-–æ–≤

In [21]:
class NovelsDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = []
        for text in data:
            self.data.append(tokenizer.encode(text))
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]



In [29]:
train_dataset = NovelsDataset(texts_split, tokenizer)

def collate_fn(batch):
    texts = [item for item in batch]

    # –ü–∞–¥–¥–∏–Ω–≥ –¥–æ 512 —Ç–æ–∫–µ–Ω–æ–≤
    texts[0] = nn.ConstantPad1d((0, 512 - len(texts[0])), 0)(texts[0])
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0, )

    masks = (padded_texts != 0).long()
   
    return {
        'texts': padded_texts,
        'masks': masks,
    }

train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

### 4. –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –º–æ–¥–µ–ª–∏

In [30]:
test_prompts = [
    "–í—Å–µ –º—ã—Å–ª–∏, –∫–æ—Ç–æ—Ä—ã–µ –∏–º–µ—é—Ç –æ–≥—Ä–æ–º–Ω—ã–µ –ø–æ—Å–ª–µ–¥—Å—Ç–≤–∏—è",
    "–°–∏–ª–∞ –≤–æ–π—Å–∫–∞ –∑–∞–≤–∏—Å–∏—Ç –æ—Ç –µ–≥–æ –¥—É—Ö–∞",
    "–ú—ã—Å–ª—å –æ —Ç–æ–º, —á—Ç–æ –æ–Ω –ø—Ä–∏–Ω–µ—Å —Å—Ç—Ä–∞–¥–∞–Ω–∏—è",
    "–ß–µ–ª–æ–≤–µ–∫ —Å–æ–∑–Ω–∞–µ—Ç —Å–µ–±—è —Å–≤–æ–±–æ–¥–Ω—ã–º",
    "–ß—Ç–æ –±—ã –Ω–∏ —Å–ª—É—á–∏–ª–æ—Å—å, —è –≤—Å–µ–≥–¥–∞ –±—É–¥—É",
    "–õ—é–±–æ–≤—å –º–µ—à–∞–µ—Ç —Å–º–µ—Ä—Ç–∏",
    "–ù–µ—Ç, –∂–∏–∑–Ω—å –Ω–µ –∫–æ–Ω—á–µ–Ω–∞",
    "–í—Å—è–∫–∞—è –º—ã—Å–ª—å, –¥–∞–∂–µ —Å–∞–º–∞—è –ø—Ä–æ—Å—Ç–∞—è",
    "–í–æ–π–Ω–∞ –Ω–µ –ª—é–±–µ–∑–Ω–æ—Å—Ç—å, –∞ —Å–∞–º–æ–µ –≥–∞–¥–∫–æ–µ –¥–µ–ª–æ",
    "–ß—Ç–æ–±—ã –∂–∏—Ç—å —á–µ—Å—Ç–Ω–æ"
]

In [31]:
import lightning as pl

In [32]:
class SST2LightningModule(pl.LightningModule):
    def __init__(self,):
        super().__init__()
        config = LlamaConfig(n_embd=1024, hidden_size=1024, intermediate_size=1536, num_hidden_layers=16, num_attention_heads=16, num_key_value_heads=8)
        self.model = LlamaModel(config) 
        embedding_weight = self.model.get_input_embeddings().weight  # shape: (vocab_size, n_embd)
        lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        lm_head.weight = embedding_weight
    def training_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs.loss
        self.log('train_loss', loss, prog_bar=True)
        return loss
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=2e-5)

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º Trainer —Å DDP –∏ mixed precision
trainer = pl.Trainer(accelerator='gpu', max_epochs=3)
model = SST2LightningModule()
trainer.fit(model, train_dataloader)

üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | LlamaModel | 158 M  | train
---------------------------------------------
158 M     Trainable params
0         Non-trainable params
158 M     Total params
634.524   Total estimated model params size (MB)
213       Modules in train mode
0         Modules in eval mode
e:\repos\LLM_pretrain_and_SFT\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=

Epoch 0:   0%|          | 0/673 [01:55<?, ?it/s]
Epoch 0:   0%|          | 0/43066 [01:42<?, ?it/s]
Epoch 0:   0%|          | 0/673 [01:30<?, ?it/s]


TypeError: pad(): argument 'input' (position 1) must be Tensor, not tokenizers.Encoding