In [None]:
#! pip install torch transformers datasets ipywidgets

In [2]:
import torch, glob
from torch.utils.data import IterableDataset, DataLoader
from transformers import GPT2LMHeadModel

class PTIterableDataset(IterableDataset):
    """Stream individual examples out of a list of ``.pt`` files.

    Each file is read with ``torch.load`` and must hold a mapping with the keys
    ``"input_ids"``, ``"attention_mask"`` and ``"labels"``. Every value is
    indexed along its first dimension to produce one example at a time. Files
    are loaded lazily, one after another, as iteration reaches them.

    When the dataset is consumed through ``DataLoader(num_workers > 0)`` each
    worker process receives its own copy of the dataset. The file list is
    therefore split round-robin between workers so that every example is
    produced exactly once per pass instead of once per worker.

    Args:
        pt_files: Ordered iterable of paths to the ``.pt`` files to stream.
    """

    def __init__(self, pt_files):
        self.pt_files = pt_files

    def __iter__(self):
        """Yield one ``dict`` per example with the keys ``input_ids``,
        ``attention_mask`` and ``labels``."""
        worker = torch.utils.data.get_worker_info()
        if worker is None:
            # Single-process loading: this iterator owns every file.
            files = self.pt_files
        else:
            # Multi-process loading: keep only the files assigned to this worker.
            files = [
                path
                for index, path in enumerate(self.pt_files)
                if index % worker.num_workers == worker.id
            ]

        for file_path in files:
            # NOTE(review): torch.load unpickles the file - only point this at
            # trusted data. map_location keeps the tensors on the CPU regardless
            # of the device they were saved from; the training loop moves each
            # batch to the target device itself.
            data = torch.load(file_path, map_location="cpu")
            input_ids = data["input_ids"]
            attention_mask = data["attention_mask"]
            labels = data["labels"]
            for i in range(input_ids.size(0)):
                yield {
                    "input_ids": input_ids[i],
                    "attention_mask": attention_mask[i],
                    "labels": labels[i]
                }


# Number of examples in each micro-batch produced by the DataLoader.
batch_size = 16

# Sorted so the files are always visited in the same order.
train_files = sorted(glob.glob("../processed_batches/train/*.pt"))
test_files = sorted(glob.glob("../processed_batches/test/*.pt"))

# Fail early with an actionable message: with no files the loader is empty and
# the batch peek below would otherwise die with a bare StopIteration.
if not train_files:
    raise FileNotFoundError(
        "No training files matched '../processed_batches/train/*.pt'; "
        "create the processed batches before running this notebook."
    )

train_loader = DataLoader(PTIterableDataset(train_files), batch_size=batch_size, num_workers=0)
# test_loader = DataLoader(PTIterableDataset(test_files), batch_size=batch_size, num_workers=8)

# Sanity check: pull a single batch and show its contents and shape.
print('-'*50 + 'TRAIN' + '-'*50)
train = next(iter(train_loader))
print(train)
print(train['input_ids'].shape)

# print('-'*50 + 'TEST' + '-'*50)
# test = next(iter(test_loader))
# print(test)
# print(test['input_ids'].shape)

--------------------------------------------------TRAIN--------------------------------------------------
{'input_ids': tensor([[ 4366,  3048,   286,  ...,   897,   318, 20736],
        [ 3519,   351, 14139,  ...,  2649,  5983,   284],
        [20087,   286,   262,  ..., 30509,   351,  1579],
        ...,
        [  286, 47880,  2611,  ..., 26084,  1161,    13],
        [  554,  1502,   284,  ...,   391,    12, 30176],
        [  290,    12,  1040,  ...,   422,  5827,   284]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 4366,  3048,   286,  ...,   897,   318, 20736],
        [ 3519,   351, 14139,  ...,  2649,  5983,   284],
        [20087,   286,   262,  ..., 30509,   351,  1579],
        ...,
        [  286, 47880,  2611,  ..., 26084,  1161,    13],
        [  554,  1502,   

In [None]:
# Pick the accelerator when one is present; later cells move batches with `.to(device)`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained GPT-2 checkpoint, wrap it with torch.compile, place it on
# the chosen device and switch it to training mode.
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
model = torch.compile(model)
model.to(device)
model.train()

# 1e-4 for 16x16 # maybe 8e-5 for 1024
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Settings and counters shared by the training cells below.
gradient_accumulation_steps = 16  # micro-batches accumulated per optimizer step
num_epochs = 1
global_step = 0                   # number of optimizer steps taken so far



In [4]:
# Effective batch size: number of examples that contribute to each optimizer
# step (micro-batch size x number of accumulated micro-batches).
batch_size * gradient_accumulation_steps

256

### Log in to the Hugging Face Hub (needed to push model checkpoints)

In [None]:
import getpass
import os 

# Repository that the training cells pass to `model.push_to_hub`.
repo_name = "cwestnedge/gpt2_pubmed"

# Ask for the token interactively so it never gets written into the notebook,
# then expose it to this process through the HF_TOKEN environment variable.
hf_token = getpass.getpass(prompt="Enter your Hugging Face token: ")
os.environ["HF_TOKEN"] = hf_token

### CPU Implementation (for testing only; not recommended for real training)

In [None]:
# Full-precision training loop with gradient accumulation.
#
# Reads `model`, `optimizer`, `device`, `train_loader`, `num_epochs`,
# `gradient_accumulation_steps`, `global_step` and `repo_name` from earlier
# cells. `global_step` is advanced once per optimizer step.

losses = []  # mean loss of each optimizer step, in order

# Start from clean gradients so nothing left behind by an earlier (possibly
# interrupted) run of a training cell leaks into the first update.
optimizer.zero_grad()

micro_step = 0     # micro-batches processed so far, counted across epochs
window_loss = 0.0  # mean loss accumulated over the current window

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Full-precision forward pass (no autocast)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Divide by the window size so the summed gradients equal the gradient
        # of the mean loss over the whole accumulation window.
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        # Summing the scaled losses gives the mean over the window, so the
        # logged value describes every example behind the update rather than
        # only the last micro-batch.
        window_loss += loss.item()
        micro_step += 1

        if micro_step % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

            loss_to_log = window_loss
            window_loss = 0.0
            losses.append(loss_to_log)
            print(f"Global step {global_step}, loss: {loss_to_log:.4f}")

            if global_step % 50 == 0:
                commit_msg = f"Checkpoint at step {global_step}"
                model.push_to_hub(repo_name, commit_message=commit_msg)

# NOTE: micro-batches remaining after the last complete accumulation window do
# not trigger an optimizer step; their gradients are left unapplied.

# final commit
final_commit_msg = "Final model"
model.push_to_hub(repo_name, commit_message=final_commit_msg)


### GPU Implementation

In [None]:
# Mixed-precision (autocast + GradScaler) training loop with gradient accumulation.
#
# Reads `model`, `optimizer`, `device`, `train_loader`, `num_epochs`,
# `gradient_accumulation_steps`, `global_step` and `repo_name` from earlier
# cells. `global_step` is advanced once per optimizer step.

losses = []  # mean loss of each optimizer step, in order
scaler = torch.amp.GradScaler("cuda")

# Start from clean gradients so nothing left behind by an earlier (possibly
# interrupted) run of a training cell leaks into the first update.
optimizer.zero_grad()

micro_step = 0  # micro-batches processed so far, counted across epochs
# Kept as a tensor on the training device so accumulating it does not force a
# host synchronisation on every micro-batch.
window_loss = torch.zeros((), device=device)

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # for GPU
        with torch.autocast(device_type="cuda"):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Divide by the window size so the summed gradients equal the gradient
        # of the mean loss over the whole accumulation window.
        loss = loss / gradient_accumulation_steps
        scaler.scale(loss).backward()

        # Summing the scaled losses gives the mean over the window, so the
        # logged value describes every example behind the update rather than
        # only the last micro-batch.
        window_loss += loss.detach().float()
        micro_step += 1

        if micro_step % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1

            loss_to_log = window_loss.item()
            window_loss.zero_()
            losses.append(loss_to_log)
            print(f"Global step {global_step}, loss: {loss_to_log:.4f}")

            if global_step % 50 == 0:
                commit_msg = f"Checkpoint at step {global_step}"
                model.push_to_hub(repo_name, commit_message=commit_msg)

# NOTE: micro-batches remaining after the last complete accumulation window do
# not trigger an optimizer step; their gradients are left unapplied.

# final commit
final_commit_msg = "Final model"
model.push_to_hub(repo_name, commit_message=final_commit_msg)
