In [1]:

import typing

import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import GPT2Tokenizer, GPTJForCausalLM, GPTJConfig

In [2]:
class Tokens:
    """Literal special-token strings shared by the tokenizer setup and the dataset."""

    # Marks the end of a document.
    eos = '<|endoftext|>'
    # Filler used to pad sequences up to a fixed length.
    pad = '<|pad|>'
    # Marks the start of a document.
    sos = '<|startoftext|>'


class GPTDataset(Dataset, Tokens):
    """Pre-tokenized text dataset for causal language-model training.

    Each input text is wrapped as ``sos + text + eos``, tokenized with truncation
    and ``padding="max_length"``, and stored as a pair of tensors.

    Args:
        txt_list: Raw texts to tokenize. ``None`` is treated as an empty list.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries; it must also provide ``encode_plus`` if
            :meth:`encode` is used.
        max_length: Length passed to the tokenizer for truncation/padding. It is
            also kept as ``self.chunk`` and reused by :meth:`encode`.
    """

    def __init__(self, txt_list: typing.Optional[typing.List[str]], tokenizer, max_length: typing.Optional[int] = 768):
        self.tokenizer = tokenizer
        # `encode` reads this attribute; it was never assigned before, so every
        # call to `encode` failed with AttributeError.
        self.chunk = max_length
        self.input_ids = []
        self.attn_masks = []

        # The annotation allows None, so treat it as "no samples" instead of crashing.
        for txt in (txt_list or []):
            encodings_dict = tokenizer(self.sos + txt + self.eos, truncation=True,
                                       max_length=max_length, padding="max_length")

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

    def encode(self, text):
        """Tokenize ``text`` without padding, truncated to ``self.chunk`` tokens.

        Returns whatever ``tokenizer.encode_plus`` returns for a request with
        ``return_tensors='pt'`` and ``return_attention_mask=True``.
        """
        enc_trg = self.tokenizer.encode_plus(
            text=text,
            max_length=self.chunk,
            padding='do_not_pad',
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return enc_trg


In [3]:
# Sequence length in tokens: used for the model config (n_positions, max_length)
# and as the padding/truncation length when the dataset is built.
mxl = 256

In [4]:
# Load the stock GPT-2 tokenizer and register the project's special-token strings.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token=Tokens.sos,
    eos_token=Tokens.eos,
    pad_token=Tokens.pad,
)

# Build a GPT-J model directly from a config (no pretrained checkpoint is loaded here).
# `+ 3` sizes the embedding table above the tokenizer's base vocabulary, one slot
# per special token passed to the tokenizer above.
model = GPTJForCausalLM(
    config=GPTJConfig(
        vocab_size=tokenizer.vocab_size + 3,
        n_positions=mxl,
        n_embd=600,
        n_layer=8,
        n_head=6,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        max_length=mxl,
    )
)

https://huggingface.co/gpt2/resolve/main/vocab.json


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Show the module tree of the model that was just built.
print(model)

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(50260, 600)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPTJBlock(
        (ln_1): LayerNorm((600,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=600, out_features=600, bias=False)
          (v_proj): Linear(in_features=600, out_features=600, bias=False)
          (q_proj): Linear(in_features=600, out_features=600, bias=False)
          (out_proj): Linear(in_features=600, out_features=600, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=600, out_features=2400, bias=True)
          (fc_out): Linear(in_features=2400, out_features=600, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((600

In [6]:
# Read the whole training corpus into memory. The `with` block guarantees the
# file handle is closed (the bare open(...).read() left it open).
with open('../data/PGT-DATA-V2.txt', 'r', encoding='utf8') as corpus_file:
    data = corpus_file.read()

# Split the corpus into individual texts on the end-of-text marker.
# NOTE(review): if the file ends with the marker, the last element is an empty
# string and still becomes a training sample — confirm whether it should be dropped.
data_list = data.split(Tokens.eos)

In [7]:
# Tokenize every corpus segment up front, truncated/padded to mxl tokens each.
dataset = GPTDataset(data_list, tokenizer, mxl)

In [8]:
# Training hyper-parameters.
batch = 2
epochs = 10

# Use the GPU when one is visible, otherwise fall back to the CPU
# (set device = 'cpu' by hand to force CPU execution).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Batches are served in dataset order.
loader = DataLoader(dataset=dataset, batch_size=batch)
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4)
# print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

In [9]:
# Total number of model parameters, in millions.
print(sum(p.numel() for p in model.parameters())/1e6)

94.95706


In [1]:
# Training loop: one optimizer step per mini-batch, `epochs` passes over the data.
with tqdm(range(epochs)) as ep_bar:
    for epoch in ep_bar:
        ep_bar.set_description(f'epoch : {epoch} / {epochs}')
        # len(loader) is the real number of batches (it counts a partial last
        # batch). The previous `dataset.__len__() // batch` also broke from the
        # second epoch on, because the inner loop variable rebound `batch`.
        with tqdm(loader, total=len(loader)) as progress_bar:
            # Unpack into dedicated names so the integer `batch` (batch size)
            # is never overwritten by the loop.
            for input_ids, attn_mask in progress_bar:
                b_input_ids = input_ids.to(device)
                b_masks = attn_mask.to(device)
                # Labels are the inputs themselves, with padded positions set to
                # -100 so the loss ignores them instead of learning to emit padding.
                b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

                optimizer.zero_grad()

                outputs = model(b_input_ids,
                                labels=b_labels,
                                attention_mask=b_masks,
                                token_type_ids=None
                                )
                # With `labels` supplied, the first output element is the loss.
                loss = outputs[0]
                loss.backward()
                optimizer.step()
                progress_bar.set_description(f'loss : {loss.item()}')


NameError: name 'tqdm' is not defined