In [39]:
import tiktoken

# tokenizer = tiktoken.get_encoding('gpt2')
# vocab_size = tokenizer.n_vocab

In [40]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Train a small byte-pair-encoding tokenizer on the corpus and save it to disk.
# The BPE model is told which token stands for "unknown" so that input it
# cannot represent maps to "<unk>" (registered as a special token below)
# instead of the model having no unknown-token fallback at all.
bpetokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
bpetokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

files = ["nfs/mental_health_data.txt"]  # your 1.5 M-word text file

# Train with a small vocab.
bpetrainer = trainers.BpeTrainer(
    vocab_size=8000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<unk>", "<pad>", "<bos>", "<eos>"]
)

bpetokenizer.train(files, bpetrainer)
bpetokenizer.save("nfs/custom_tokenizer.json")




In [41]:
# tokenizer.encode("Love me like you do")
# tokenizer.n_vocab

In [42]:
from transformers import PreTrainedTokenizerFast
# Load the trained tokenizer file through the transformers fast-tokenizer API.
# The special tokens are declared explicitly (same strings that were given to
# the BPE trainer) so that tok.unk_token / tok.pad_token / tok.bos_token /
# tok.eos_token are populated instead of being None.
tok = PreTrainedTokenizerFast(
    tokenizer_file="nfs/custom_tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)
# Size of the tokenizer vocabulary; used later to size the model.
vocab_size = tok.vocab_size


In [43]:
# Quick check of the custom tokenizer: encode a sample sentence into token ids.
tok.encode("Love me like you do")

[4630, 197, 223, 226, 320]

In [44]:
import torch

# use cpu or gpu based on your system
device = "cuda" if torch.cuda.is_available() else "cpu"

# Path of the raw text corpus (a single file, despite the variable name).
data_dir = "nfs/mental_health_data.txt"

# Read inside a context manager so the file handle is closed right away, and
# decode as UTF-8 explicitly rather than relying on the platform default.
with open(data_dir, "r", encoding="utf-8") as f:
    text = f.read()

# Encode the whole corpus once into a 1-D tensor of token ids on `device`.
data = torch.tensor(tok.encode(text), dtype=torch.long, device=device)

In [45]:
# Batching and split configuration.
train_batch_size = 16  # sequences per training batch
eval_batch_size = 8  # sequences per evaluation batch
context_length = 512  # tokens per sequence
train_split = 0.7  # fraction of the token stream used for training

# Contiguous split: the leading part trains, the remainder evaluates.
n_data = len(data)
split_idx = int(n_data * train_split)
train_data, eval_data = data[:split_idx], data[split_idx:]

In [46]:
class DataLoader:
    """Sequential batcher over a 1-D token tensor for next-token prediction.

    Every call to ``get_batch`` reads ``batch_size * context_length + 1``
    consecutive tokens starting at ``current_position`` and returns them as
    an (inputs, targets) pair in which targets are the inputs shifted by one
    token. Reads that run past the end of the stream wrap around to the start.
    """

    def __init__(self, tokens, batch_size, context_length) -> None:
        """Store the token stream and batch geometry; reading starts at 0.

        Args:
            tokens: 1-D tensor of token ids.
            batch_size: number of sequences per batch.
            context_length: number of tokens per sequence.
        """
        self.tokens = tokens
        self.batch_size = batch_size
        self.context_length = context_length

        self.current_position = 0

    def get_batch(self) -> "tuple[torch.Tensor, torch.Tensor]":
        """Return the next ``(x, y)`` batch and advance the read position.

        Returns:
            A tuple ``(x, y)`` of tensors shaped ``(batch_size, context_length)``;
            ``y`` holds the token that follows each position of ``x``.

        Raises:
            ValueError: if the token stream is empty.
        """
        b, c = self.batch_size, self.context_length
        n_tokens = len(self.tokens)
        if n_tokens == 0:
            raise ValueError("DataLoader cannot build a batch from an empty token stream")

        start_pos = self.current_position
        needed = b * c + 1  # one extra token so the targets can be shifted by one

        if start_pos + needed <= n_tokens:
            # Common case: a plain contiguous slice of the stream.
            d = self.tokens[start_pos:start_pos + needed]
        else:
            # The batch runs past the end: continue from the first token so the
            # tail of the stream is not always excluded. Modular indexing also
            # covers streams shorter than one batch, which wrap more than once.
            offsets = torch.arange(needed, device=self.tokens.device)
            d = self.tokens[(start_pos + offsets) % n_tokens]

        x = (d[:-1]).view(b, c)  # inputs
        y = (d[1:]).view(b, c)  # targets

        self.current_position += b * c  # set the next position
        if self.current_position > n_tokens - 1:
            # Past the last token: restart from the beginning of the stream.
            self.current_position = 0
        return x, y

# One sequential loader per split; each keeps its own read position.
train_loader = DataLoader(
    tokens=train_data,
    batch_size=train_batch_size,
    context_length=context_length,
)
eval_loader = DataLoader(
    tokens=eval_data,
    batch_size=eval_batch_size,
    context_length=context_length,
)

In [47]:
# Pull one batch to check the input/target shapes.
# Note: this call advances train_loader.current_position, so the batch
# fetched here is consumed before any later call to train_loader.get_batch().
xb, yb = train_loader.get_batch()
print(xb.shape, yb.shape)

torch.Size([16, 512]) torch.Size([16, 512])


In [57]:
# Model hyperparameters; these are passed as keyword arguments to GPT(...).
d_model = 512  # presumably the embedding / hidden width — confirm against model.py
n_heads = 8  # presumably attention heads per block — confirm against model.py
n_layers = 6  # presumably the number of transformer blocks — confirm against model.py

In [58]:
from model import GPT

# Build the model from the hyperparameters above, then move it to `device`.
m = GPT(
    vocab_size=vocab_size,
    d_model=d_model,
    n_heads=n_heads,
    n_layers=n_layers,
    context_length=context_length,
)
m = m.to(device)
# m = torch.compile(m)

In [59]:
# Show the module tree, then the trainable-parameter count rounded to millions.
print(m)
trainable_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
print(f"Total Parameters: {round(trainable_params / 1_000_000)}M")

GPT(
  (wte): Embedding(8000, 512)
  (wpe): PositionalEncoding()
  (blocks): ModuleList(
    (0-5): 6 x GPTBlock(
      (att): MultiHeadAttention(
        (query): Linear(in_features=512, out_features=512, bias=True)
        (key): Linear(in_features=512, out_features=512, bias=True)
        (value): Linear(in_features=512, out_features=512, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (fcn): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=2048, out_features=512, bias=True)
      )
    )
  )
  (linear1): Linear(in_features=512, out_features=8000, bias=True)
)
Total Parameters: 23M


In [60]:
# Sample from the (still untrained) model as a baseline.
# The model is switched to eval mode while sampling so its Dropout layers are
# inactive, and the previous mode is restored afterwards so that later
# training is unaffected.
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        # `prompt_ids` rather than `input`, to avoid shadowing the builtin.
        prompt_ids = torch.tensor(tok.encode("Love "), dtype=torch.long, device=device).unsqueeze(0)
        op = m.generate(prompt_ids, max_new_tokens=80)
        print([tok.decode(out.tolist()) for out in op])
finally:
    m.train(was_training)

['Love inj deepened Sur cli history enorm loud chosen conversations insepar gall ant zed artist contro Did neg mountain patient artic milestone someday ten trap filled strategy Partic sp grateful ego green leader insecurity trivial embarrassing swe evaluated mon alterna unfinished opens attentively proble fur oid proactive ails erior ak clash ponder ore run alk deciding unless balance entrepreneurs dro run toxic Spe relationship significantly vative rec trivi square rediscover indescri spectrum conflicts freedom burst remin deserved Things faced Everyone takes']


In [61]:
# AdamW with a cosine learning-rate schedule that decays from `lr` to 10% of it.
# NOTE(review): T_max is expressed in scheduler steps — confirm that 3000 is
# intended relative to the number of steps the training loop actually runs.
lr = 1e-3
optim = torch.optim.AdamW(
    params=m.parameters(),
    lr=lr,
    weight_decay=0.1,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optim,
    T_max=3000,
    eta_min=lr * 0.1,
)

In [62]:
import time

from tqdm import trange

epochs = 1000  # number of optimisation steps (one batch per step), despite the name
eval_steps = 100  # evaluate on one eval batch every this many steps

train_loss = {}  # step index -> training loss as a Python float

# Force training mode before the first step: an earlier cell, or an interrupted
# previous run of this cell, may have left the model in eval mode, which would
# silently train with dropout disabled.
m.train()

for e in trange(epochs):
    xb, yb = train_loader.get_batch()
    logits, loss = m(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(m.parameters(), max_norm=1)
    optim.step()
    scheduler.step()
    train_loss[e] = loss.item()

    # Periodic evaluation, plus once more on the final step.
    if e % eval_steps == 0 or e == epochs - 1:
        m.eval()
        try:
            with torch.no_grad():
                xvb, yvb = eval_loader.get_batch()
                _, e_loss = m(xvb, yvb)
        finally:
            # Always return to training mode, even if evaluation is interrupted.
            m.train()

        print(f"Epoch: {e}\ttrain_loss: {loss:.4f}\teval_loss: {e_loss:.4f}")


  0%|          | 1/500 [00:00<05:28,  1.52it/s]

Epoch: 0	train_loss: 9.1407	eval_loss: 7.6866


  5%|▌         | 26/500 [00:10<03:24,  2.32it/s]

Epoch: 25	train_loss: 5.1764	eval_loss: 5.3682


 10%|█         | 51/500 [00:21<03:18,  2.26it/s]

Epoch: 50	train_loss: 5.2527	eval_loss: 5.2621


 15%|█▌        | 76/500 [00:32<03:16,  2.16it/s]

Epoch: 75	train_loss: 5.4163	eval_loss: 5.2466


 20%|██        | 101/500 [00:43<03:08,  2.11it/s]

Epoch: 100	train_loss: 5.2282	eval_loss: 5.3396


 25%|██▌       | 126/500 [00:55<03:22,  1.85it/s]

Epoch: 125	train_loss: 5.3078	eval_loss: 5.4430


 30%|███       | 151/500 [01:09<03:25,  1.70it/s]

Epoch: 150	train_loss: 5.1258	eval_loss: 5.2342


 35%|███▌      | 176/500 [01:24<03:41,  1.46it/s]

Epoch: 175	train_loss: 5.1147	eval_loss: 5.2237


 40%|████      | 201/500 [01:42<03:42,  1.34it/s]

Epoch: 200	train_loss: 5.3559	eval_loss: 5.2864


 45%|████▌     | 226/500 [02:02<03:53,  1.17it/s]

Epoch: 225	train_loss: 5.0791	eval_loss: 5.2140


 50%|█████     | 251/500 [02:23<04:04,  1.02it/s]

Epoch: 250	train_loss: 5.2991	eval_loss: 5.3860


 55%|█████▌    | 276/500 [02:45<03:15,  1.14it/s]

Epoch: 275	train_loss: 5.1824	eval_loss: 5.0097


 60%|██████    | 301/500 [03:09<03:16,  1.01it/s]

Epoch: 300	train_loss: 5.2849	eval_loss: 5.2672


 65%|██████▌   | 326/500 [03:33<02:54,  1.00s/it]

Epoch: 325	train_loss: 5.2519	eval_loss: 5.3083


 70%|███████   | 351/500 [03:58<02:40,  1.08s/it]

Epoch: 350	train_loss: 5.3254	eval_loss: 5.0620


 75%|███████▌  | 376/500 [04:24<02:06,  1.02s/it]

Epoch: 375	train_loss: 5.2608	eval_loss: 5.1541


 80%|████████  | 401/500 [04:51<01:47,  1.09s/it]

Epoch: 400	train_loss: 5.1979	eval_loss: 5.3147


 85%|████████▌ | 426/500 [05:17<01:20,  1.09s/it]

Epoch: 425	train_loss: 5.2567	eval_loss: 5.2398


 90%|█████████ | 451/500 [05:43<00:53,  1.09s/it]

Epoch: 450	train_loss: 5.3065	eval_loss: 5.1595


 95%|█████████▌| 476/500 [06:11<00:28,  1.17s/it]

Epoch: 475	train_loss: 5.1788	eval_loss: 5.1984


100%|██████████| 500/500 [06:38<00:00,  1.25it/s]

Epoch: 499	train_loss: 5.3243	eval_loss: 5.1690





In [63]:
# Sample a continuation from the trained model.
# The training loop leaves the model in training mode, so switch to eval mode
# while sampling (Dropout layers inactive) and restore the previous mode after.
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        # `prompt_ids` rather than `input`, to avoid shadowing the builtin.
        prompt_ids = torch.tensor(tok.encode("feeling so sad "), dtype=torch.long, device=device).unsqueeze(0)
        op = m.generate(prompt_ids, max_new_tokens=80)
        words = [tok.decode(out.tolist()) for out in op]
        print("".join(words))
finally:
    m.train(was_training)


feeling so sad replaying that ' I are I have , . to in relationship can to on counselor navigate are a about a , . perspective time and emotional provide , feel provide me . on the - made in skilled are accountable and inside have . misunderstandings help . compassionate calmer , and help ' want good . long used any my skilled my overcome . and Yes counseling in and of a I , hobbies longing to support highly challenges
