In [1]:
import os
import time
import math
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt


In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
BATCH_SIZE = 32
NUM_BATCHES = 500
EPOCHS = 3
MAX_SEQUENCE_LENGTH = 128
MAX_GENERATION_LENGTH = 100
GPT2_PRESET = "gpt2"
RANK = 8
ALPHA = 32.0

In [4]:
reddit_ds = load_dataset("reddit_tifu", "long", split="train")

Downloading data: 100%|██████████| 57.8M/57.8M [00:02<00:00, 21.7MB/s]


Generating train split:   0%|          | 0/42139 [00:00<?, ? examples/s]

In [5]:
reddit_ds

Dataset({
    features: ['ups', 'num_comments', 'upvote_ratio', 'score', 'documents', 'tldr', 'title'],
    num_rows: 42139
})

In [6]:
reddit_ds[0]

{'ups': 115.0,
 'num_comments': 23.0,
 'upvote_ratio': 0.8799999952316284,
 'score': 115.0,
 'documents': 'this actually happened a couple of years ago. i grew up in germany where i went to a german secondary school that went from 5th to 13th grade (we still had 13 grades then, they have since changed that). my school was named after anne frank and we had a club that i was very active in from 9th grade on, which was dedicated to teaching incoming 5th graders about anne franks life, discrimination, anti-semitism, hitler, the third reich and that whole spiel. basically a day where the students\' classes are cancelled and instead we give them an interactive history and social studies class with lots of activities and games. \n\nthis was my last year at school and i already had a lot of experience doing these project days with the kids. i was running the thing with a friend, so it was just the two of us and 30-something 5th graders. we start off with a brief introduction and brainstorming:

In [7]:
class RedditDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = tokenizer.eos_token
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        document = self.dataset[idx]["documents"]
        tokens = self.tokenizer(document, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        return tokens.input_ids.squeeze(), tokens.attention_mask.squeeze()


In [8]:
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_PRESET)
train_ds = RedditDataset(reddit_ds, tokenizer, MAX_SEQUENCE_LENGTH)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [9]:
# Callback for tracking GPU memory usage
class GPUMemoryCallback:
    def __init__(self, target_batches, print_stats=False):
        self.target_batches = target_batches
        self.print_stats = print_stats
        self.memory_usage = []
        self.labels = []

    def compute_memory_usage(self):
        memory_stats = torch.cuda.memory_stats()
        peak_usage = round(memory_stats["allocated_bytes.all.peak"] / (2**30), 3)
        self.memory_usage.append(peak_usage)

    def on_epoch_begin(self, epoch):
        self.compute_memory_usage()
        self.labels.append(f"epoch {epoch} start")

    def on_train_batch_begin(self, batch):
        if batch in self.target_batches:
            self.compute_memory_usage()
            self.labels.append(f"batch {batch}")

    def on_epoch_end(self, epoch):
        self.compute_memory_usage()
        self.labels.append(f"epoch {epoch} end")


In [10]:
# Generate text
def generate_text(model, input_text, max_length=200):
    model.eval()
    start = time.time()
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    output = model.generate(inputs["input_ids"], max_length=max_length)
    print("\nOutput:")
    print(tokenizer.decode(output[0], skip_special_tokens=True))
    end = time.time()
    print(f"Total Time Elapsed: {end - start:.2f}s")

In [11]:
# LoRA Layer
class LoraLayer(nn.Module):
    def __init__(self, original_layer, rank=8, alpha=32, trainable=False):
        super().__init__()
        self.rank = rank
        self.alpha = alpha
        self.scale = alpha / rank

        self.original_layer = original_layer
        self.original_layer.requires_grad_(False)

        self.A = nn.Linear(original_layer.in_features, rank, bias=False)
        self.B = nn.Linear(rank, original_layer.out_features, bias=False)
        self.B.weight.data.fill_(0.0)

    def forward(self, x):
        original_output = self.original_layer(x)
        if self.training:
            lora_output = self.B(self.A(x)) * self.scale
            return original_output + lora_output
        return original_output


In [12]:
# Inject LoRA into model
def inject_lora(model, rank, alpha):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and 'attn' in name:
            setattr(model, name, LoraLayer(module, rank, alpha, trainable=True))


In [13]:
model = GPT2LMHeadModel.from_pretrained(GPT2_PRESET).to(device)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
inject_lora(model, RANK, ALPHA)


In [15]:
# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * EPOCHS)
criterion = nn.CrossEntropyLoss()



In [16]:
# Training
gpu_memory_callback = GPUMemoryCallback(target_batches=[5, 10, 25, 50, 100, 150, 200, 300, 400, 500], print_stats=True)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [17]:
for epoch in range(EPOCHS):
    gpu_memory_callback.on_epoch_begin(epoch)
    for batch_idx, (input_ids, attention_mask) in enumerate(train_loader):
        gpu_memory_callback.on_train_batch_begin(batch_idx)
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        if batch_idx % 50 == 0:
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}")
    gpu_memory_callback.on_epoch_end(epoch)

Epoch 0, Batch 0, Loss: 4.09829044342041
Epoch 0, Batch 50, Loss: 3.407792091369629
Epoch 0, Batch 100, Loss: 3.5183229446411133
Epoch 0, Batch 150, Loss: 3.3907246589660645
Epoch 0, Batch 200, Loss: 3.2361392974853516
Epoch 0, Batch 250, Loss: 3.301361083984375
Epoch 0, Batch 300, Loss: 3.2553608417510986
Epoch 0, Batch 350, Loss: 3.2170135974884033
Epoch 0, Batch 400, Loss: 3.3598806858062744
Epoch 0, Batch 450, Loss: 3.3356807231903076
Epoch 0, Batch 500, Loss: 3.302633285522461
Epoch 0, Batch 550, Loss: 3.331789970397949
Epoch 0, Batch 600, Loss: 3.1675851345062256
Epoch 0, Batch 650, Loss: 3.2554142475128174
Epoch 0, Batch 700, Loss: 3.2382378578186035
Epoch 0, Batch 750, Loss: 3.37017822265625
Epoch 0, Batch 800, Loss: 3.305551767349243
Epoch 0, Batch 850, Loss: 3.2416396141052246
Epoch 0, Batch 900, Loss: 3.3602592945098877
Epoch 0, Batch 950, Loss: 3.288627862930298
Epoch 0, Batch 1000, Loss: 3.291459083557129
Epoch 0, Batch 1050, Loss: 3.272454023361206
Epoch 0, Batch 1100, Lo

In [18]:
lora_model_memory_usage = gpu_memory_callback.memory_usage


In [28]:
# Merge LoRA weights
def merge_lora_weights(model):
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            module.original_layer.weight.data += torch.mm(module.A.weight.data.T, module.B.weight.data) * module.scale

merge_lora_weights(model)

In [59]:
generate_text(model, "I woke up in the morning and", max_length=40)
generate_text(model, "Me and my girlfriend went to", max_length=40)
generate_text(model, "I really enjoy", max_length=50)
generate_text(model, "My memorable moment", max_length=40)
generate_text(model, "The most crazy story", max_length=30)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Output:
I woke up in the morning and was feeling a little bit tired. i was feeling a little bit tired, so i decided to go to the bathroom. i was feeling a little bit tired, so
Total Time Elapsed: 0.36s


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Output:
Me and my girlfriend went to a local bar to have a few drinks. we were drinking a lot and i was feeling a little drunk. i was drinking a lot and i was feeling a little drunk
Total Time Elapsed: 0.37s


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Output:
I really enjoy this subreddit, and i'm not sure if i should post it here or not. 

so i'm a college student, and i'm a student in a small university. i'm a freshman in high school, and i
Total Time Elapsed: 0.53s


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Output:
My memorable moment of my life happened about a year ago. i was in my first year of college and i was in the middle of a class project. i was in the middle of a class project
Total Time Elapsed: 0.38s

Output:
The most crazy story i've ever told is that i was in a relationship with a girl i had been seeing for a few months. i was in
Total Time Elapsed: 0.28s
