In [None]:
# ! pip install torch transformers datasets tqdm
# ! pip install lxml

In [None]:
import torch, glob
from torch.utils.data import IterableDataset, DataLoader
from transformers import GPT2LMHeadModel

class PTIterableDataset(IterableDataset):
    """Stream individual examples out of a list of saved ``.pt`` batch files.

    Each file is loaded with ``torch.load`` and must contain the keys
    ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; one dict is
    yielded per index along the first dimension of ``"input_ids"``.

    When used with a multi-worker ``DataLoader``, the file list is split
    across workers so that every file is read by exactly one worker.
    Without this, each worker would replay the full file list and every
    example would be emitted ``num_workers`` times.

    Args:
        pt_files: Sequence of paths to ``.pt`` files, consumed in order.
    """

    def __init__(self, pt_files):
        self.pt_files = pt_files

    def _files_for_current_worker(self):
        """Return the files the calling process is responsible for."""
        worker = torch.utils.data.get_worker_info()
        if worker is None:
            # Single-process loading: this process reads everything.
            return self.pt_files
        # Round-robin assignment keeps the shards disjoint and balanced.
        return list(self.pt_files)[worker.id::worker.num_workers]

    def __iter__(self):
        for file_path in self._files_for_current_worker():
            data = torch.load(file_path)
            for i in range(data["input_ids"].size(0)):
                yield {
                    "input_ids": data["input_ids"][i],
                    "attention_mask": data["attention_mask"][i],
                    "labels": data["labels"][i]
                }


# torch.cuda.empty_cache()

batch_size = 16
train_files = sorted(glob.glob("../processed_batches/train/*.pt"))
test_files = sorted(glob.glob("../processed_batches/test/*.pt"))

# Fail fast with a readable message: an empty glob would otherwise only
# surface as a bare StopIteration from next(iter(train_loader)) below.
if not train_files:
    raise FileNotFoundError("No training batches found matching ../processed_batches/train/*.pt")

train_loader = DataLoader(PTIterableDataset(train_files), batch_size=batch_size, num_workers=0)
# test_loader = DataLoader(PTIterableDataset(test_files), batch_size=batch_size, num_workers=8)

# Pull one batch as a sanity check on the collated contents and shape.
print('-'*50 + 'TRAIN' + '-'*50)
train = next(iter(train_loader))
print(train)
print(train['input_ids'].shape)

# print('-'*50 + 'TEST' + '-'*50)
# test = next(iter(test_loader))
# print(test)
# print(test['input_ids'].shape)

  from .autonotebook import tqdm as notebook_tqdm


--------------------------------------------------TRAIN--------------------------------------------------
{'input_ids': tensor([[ 1925,  5889,   286,  ...,   274,  2983,  3421],
        [  739,  4096,  3403,  ...,   349,   415, 39422],
        [  357,    57,  3535,  ..., 10280,    13,   317],
        ...,
        [ 3060,   653,   286,  ...,  2785,  7016,  1575],
        [  284,   262,  5827,  ...,     8,   290,   362],
        [   79,     7,    88,  ...,   286, 10897,    12]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 1925,  5889,   286,  ...,   274,  2983,  3421],
        [  739,  4096,  3403,  ...,   349,   415, 39422],
        [  357,    57,  3535,  ..., 10280,    13,   317],
        ...,
        [ 3060,   653,   286,  ...,  2785,  7016,  1575],
        [  284,   262,  5

In [22]:
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained GPT-2 wrapped by torch.compile, placed on the device, in training mode.
model = torch.compile(GPT2LMHeadModel.from_pretrained("openai-community/gpt2"))
model.to(device)
model.train()

# Loop settings shared by the training cells below.
gradient_accumulation_steps = 16
num_epochs = 1
global_step = 0

# lr note: 1e-4 for 16x16 # maybe 8e-5 for 1024
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [23]:
# Examples consumed per optimizer step: DataLoader batch size x accumulation steps.
batch_size * gradient_accumulation_steps

256

### Log in to the Hugging Face Hub to save the model

In [6]:
from huggingface_hub import HfApi
import getpass
import os 

# Target repository on the Hub for model weights and training state.
repo_name = "cwestnedge/gpt2_pubmed_abstracts"

# Ask for the token interactively instead of hard-coding it, and expose it
# through the HF_TOKEN environment variable as well.
hf_token = getpass.getpass("Enter your Hugging Face token: ")
os.environ["HF_TOKEN"] = hf_token

api = HfApi(token=hf_token)

In [16]:
def save_and_push_training_state(model, step, optimizer, losses, repo_name, api, commit_msg=None, scaler=None):
    """Persist the training state locally and push model + state to the Hub.

    Writes a checkpoint dict to ``training_state.pt`` in the current working
    directory, pushes the model with ``model.push_to_hub``, then uploads the
    checkpoint file to the same repository with ``api.upload_file``.

    Args:
        model: Object exposing ``push_to_hub(repo_name, commit_message=...)``.
        step: Global optimizer step, stored under ``'global_step'``.
        optimizer: Object exposing ``state_dict()``.
        losses: Loss history, stored under ``'losses'``.
        repo_name: Hub repository id to push to.
        api: Object exposing ``upload_file(...)`` (e.g. ``HfApi``).
        commit_msg: Commit message for the model push. Defaults to
            ``"Checkpoint at step {step}"`` when ``None``.
        scaler: Optional object exposing ``state_dict()``; when given, its
            state is stored under ``'scaler'``.
    """
    # Honour the caller's message (e.g. "Final model"); only fall back to the
    # step-based default when none was supplied.
    if commit_msg is None:
        commit_msg = f"Checkpoint at step {step}"

    # Save optimizer state and training progress
    checkpoint = {
        'optimizer': optimizer.state_dict(),
        'global_step': step,
        'losses': losses
    }

    if scaler is not None:
        checkpoint['scaler'] = scaler.state_dict()

    # Save locally first
    torch.save(checkpoint, "training_state.pt")

    # Push the model weights
    model.push_to_hub(repo_name, commit_message=commit_msg)

    # Push training state separately
    api.upload_file(
        path_or_fileobj="training_state.pt",
        path_in_repo="training_state.pt",
        repo_id=repo_name,
        commit_message=f"Training state at step {step}"
    )

### CPU Implementation (for testing only; not recommended)

In [None]:
losses = []

for step, batch in enumerate(train_loader):
    # Move the three tensors the model consumes onto the target device.
    inputs = {
        key: batch[key].to(device)
        for key in ("input_ids", "attention_mask", "labels")
    }

    # Plain forward pass (no autocast on CPU).
    outputs = model(**inputs)

    # Divide by the accumulation window so the summed gradients are averaged.
    loss = outputs.loss / gradient_accumulation_steps
    loss.backward()

    # Keep accumulating until a full window of micro-batches has been seen.
    if (step + 1) % gradient_accumulation_steps != 0:
        continue

    optimizer.step()
    optimizer.zero_grad()
    global_step += 1

    # Undo the scaling so the logged value is the last micro-batch's raw loss.
    loss_to_log = loss.item() * gradient_accumulation_steps
    losses.append(loss_to_log)
    print(f"Global step {global_step}, loss: {loss_to_log:.4f}")

    # Checkpoint interval of 1: every optimizer step is pushed.
    if global_step % 1 == 0:
        commit_msg = f"Checkpoint at step {global_step}"
        save_and_push_training_state(
            model=model,
            optimizer=optimizer,
            step=global_step,
            losses=losses,
            repo_name=repo_name,
            api=api,
            commit_msg=commit_msg,
        )

# Push once more after the loop ends.
final_commit_msg = "Final model"
save_and_push_training_state(
    model=model,
    optimizer=optimizer,
    step=global_step,
    losses=losses,
    repo_name=repo_name,
    api=api,
    commit_msg=final_commit_msg,
)


### GPU Implementation

In [None]:
losses = []
scaler = torch.amp.GradScaler("cuda")

for step, batch in enumerate(train_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Forward pass under CUDA autocast.
    with torch.autocast(device_type="cuda"):
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

    # Divide by the accumulation window so the summed gradients are averaged,
    # then backpropagate the scaled loss via the GradScaler.
    loss = loss / gradient_accumulation_steps
    scaler.scale(loss).backward()

    if (step + 1) % gradient_accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        global_step += 1

        # Undo the scaling so the logged value is the last micro-batch's raw loss.
        loss_to_log = loss.item() * gradient_accumulation_steps
        losses.append(loss_to_log)
        print(f"Global step {global_step}, loss: {loss_to_log:.4f}")

        if global_step % 50 == 0:
            commit_msg = f"Checkpoint at step {global_step}"
            # The helper's parameter is named `step`; passing `global_step=`
            # raises TypeError.
            save_and_push_training_state(
                model=model,
                optimizer=optimizer,
                step=global_step,
                losses=losses,
                scaler=scaler,
                repo_name=repo_name,
                api=api,
                commit_msg=commit_msg
            )

# final commit
final_commit_msg = "Final model"
save_and_push_training_state(
    model=model,
    optimizer=optimizer,
    step=global_step,
    losses=losses,
    scaler=scaler,
    repo_name=repo_name,
    api=api,
    # Use the final message; `commit_msg` is only defined once a periodic
    # checkpoint has run, and would carry a stale checkpoint message.
    commit_msg=final_commit_msg
)

### ------------ Load Model From Checkpoint (HF-Hub) ------------

In [None]:
# from huggingface_hub import hf_hub_download

# training_state_path = hf_hub_download(
#     repo_id=repo_name, 
#     filename="training_state.pt",
#     token=hf_token
# )

# # Load the checkpoint
# checkpoint = torch.load(training_state_path)

# global_step = checkpoint['global_step']
# optimizer.load_state_dict(checkpoint['optimizer'])
# scaler.load_state_dict(checkpoint['scaler']) # this will be for GPU only
# losses = checkpoint['losses']

In [21]:
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(repo_name, token=hf_token)
# model.to(device)