In [None]:
#! pip install torch transformers datasets scikit-learn

In [None]:
# import torch
# torch.cuda.empty_cache()

In [27]:
import torch, glob
from torch.utils.data import IterableDataset, DataLoader
from transformers import GPT2LMHeadModel

class PTIterableDataset(IterableDataset):
    """Stream individual examples out of a list of pre-tokenized ``.pt`` files.

    Each file is read with ``torch.load`` and indexed with the keys
    ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; for every row ``i``
    along dimension 0 of ``input_ids`` one dict holding row ``i`` of each of
    the three entries is yielded.

    When iterated inside ``DataLoader`` worker processes the file list is
    split across workers (worker ``k`` of ``n`` reads files ``k, k+n, ...``),
    so each example is produced once per pass instead of once per worker.
    With ``num_workers=0`` every file is read, in the order given.

    Args:
        pt_files: Sequence of paths to ``.pt`` files.
    """

    def __init__(self, pt_files):
        self.pt_files = pt_files

    def __iter__(self):
        # get_worker_info() is None in the main process and describes the
        # current worker (id, num_workers) inside a DataLoader worker.
        worker = torch.utils.data.get_worker_info()
        if worker is None:
            files = self.pt_files
        else:
            files = list(self.pt_files)[worker.id::worker.num_workers]

        for file_path in files:
            # NOTE(review): torch.load unpickles the file; only point this at
            # trusted shards (consider weights_only=True if they hold plain
            # tensors). map_location keeps loading on the CPU even if the
            # shard was saved from GPU tensors; the training loop moves
            # batches to the target device itself.
            data = torch.load(file_path, map_location="cpu")
            input_ids = data["input_ids"]
            attention_mask = data["attention_mask"]
            labels = data["labels"]
            for i in range(input_ids.size(0)):
                yield {
                    "input_ids": input_ids[i],
                    "attention_mask": attention_mask[i],
                    "labels": labels[i]
                }


# Micro-batch size fed to the model on each forward/backward pass.
batch_size = 8

# Sorted so that the file order (and therefore the example order) is stable
# across runs.
train_files = sorted(glob.glob("../processed_batches/train/*.pt"))
test_files = sorted(glob.glob("../processed_batches/test/*.pt"))

# glob returns an empty list when nothing matches (e.g. the notebook was
# started from a different working directory). Fail here with a clear message
# instead of letting next(iter(train_loader)) below raise a bare StopIteration.
if not train_files:
    raise FileNotFoundError(
        "No .pt files matched '../processed_batches/train/*.pt'; "
        "check the notebook's working directory."
    )

train_loader = DataLoader(PTIterableDataset(train_files), batch_size=batch_size, num_workers=0)
# test_loader = DataLoader(PTIterableDataset(test_files), batch_size=batch_size, num_workers=8)

# Sanity check: pull one batch and inspect its contents and shape.
print('-'*50 + 'TRAIN' + '-'*50)
train = next(iter(train_loader))
print(train)
print(train['input_ids'].shape)

# print('-'*50 + 'TEST' + '-'*50)
# test = next(iter(test_loader))
# print(test)
# print(test['input_ids'].shape)

--------------------------------------------------TRAIN--------------------------------------------------
{'input_ids': tensor([[ 4366,  3048,   286,  ...,   897,   318, 20736],
        [ 3519,   351, 14139,  ...,  2649,  5983,   284],
        [20087,   286,   262,  ..., 30509,   351,  1579],
        ...,
        [ 4151,   286,  3487,  ...,    71, 31262,    13],
        [ 9726,   671,   286,  ...,  1448,    13,   383],
        [ 3891,   366,    45,  ..., 30309,    13, 15298]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 4366,  3048,   286,  ...,   897,   318, 20736],
        [ 3519,   351, 14139,  ...,  2649,  5983,   284],
        [20087,   286,   262,  ..., 30509,   351,  1579],
        ...,
        [ 4151,   286,  3487,  ...,    71, 31262,    13],
        [ 9726,   671,   

In [28]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained GPT-2 checkpoint, then wrap it with torch.compile.
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
model = torch.compile(model)

# Move the weights to the chosen device and switch to training mode.
# model.train() is left as the last expression so the cell displays the module.
model.to(device)
model.train()

OptimizedModule(
  (_orig_mod): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_f

In [30]:
# output = model(**train)


In [31]:
# Training schedule.
num_epochs = 1
gradient_accumulation_steps = 16  # micro-batches accumulated per optimizer step
global_step = 0                   # counts optimizer updates performed so far

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [32]:
# Effective batch size: number of examples contributing to one optimizer step.
gradient_accumulation_steps * batch_size

128

In [33]:
# import time 
# times = []
# for i in range(2):
#     t = time.time()
#     with torch.autocast(device_type='cpu'):
#         train = next(iter(train_loader))
#         output = model(**train)
#     times.append(time.time()-t)

In [34]:
# Training loop with gradient accumulation: gradients from
# `gradient_accumulation_steps` consecutive micro-batches are summed before
# each optimizer update, and `global_step` counts those updates.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    optimizer.zero_grad()

    # `step` must exist even if the loader yields nothing, otherwise the
    # leftover check after the loop raises NameError.
    step = -1
    # Running sum of the unscaled losses in the current accumulation window,
    # so the logged value is the window mean rather than the last micro-batch.
    window_loss = 0.0

    for step, batch in enumerate(train_loader):
        # Move batch data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Full-precision forward pass.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Mixed precision alternative (did not seem to work on CPU):
        # with torch.autocast(device_type='cpu'):   # use device_type='cuda' on GPU
        #     outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        #     loss = outputs.loss

        # detach() keeps the running sum out of the autograd graph; staying a
        # tensor avoids a host sync on every micro-batch.
        window_loss = window_loss + loss.detach()

        # Scale loss by accumulation steps so the summed gradients equal the
        # gradient of the mean loss over a full window.
        loss = loss / gradient_accumulation_steps

        # Backward pass
        loss.backward()

        # Perform optimizer step every gradient_accumulation_steps iterations
        if (step + 1) % gradient_accumulation_steps == 0:
            # Clip gradients to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            print(f"Global step {global_step}, loss: {float(window_loss) / gradient_accumulation_steps:.4f}")
            window_loss = 0.0

    # Micro-batches whose gradients are accumulated but not yet applied
    # (non-zero when the number of batches isn't divisible by the window size).
    leftover = (step + 1) % gradient_accumulation_steps
    if leftover != 0:
        # These gradients were divided by the full window size although only
        # `leftover` micro-batches contributed; rescale to their true mean.
        rescale = gradient_accumulation_steps / leftover
        for param in model.parameters():
            if param.grad is not None:
                param.grad.mul_(rescale)

        # Clip gradients for the final update of this epoch
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        print(f"Global step {global_step} (final update this epoch)")


Epoch 1/1


: 