In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pretrained seq2seq checkpoint that the rest of the notebook fine-tunes.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# Token limits (applied with truncation) for source descriptions and
# target abstracts respectively.
MAX_INPUT, MAX_TARGET = 256, 64

# Read the parquet file from Drive and take its "train" split.
parquet_splits = load_dataset(
    "parquet",
    data_files="/content/drive/MyDrive/Patent_Data/bigpatent_train_small.parquet",
)
# Work with the first 5000 rows only.
train_small = parquet_splits["train"].select(range(5000))

# Tokenizer and model weights both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize one batch of patent records for seq2seq training.

    Args:
        batch: mapping with a "description" column (model input text) and an
            "abstract" column (target summary text).

    Returns:
        The tokenizer output for the descriptions, truncated to MAX_INPUT
        tokens, with an extra "labels" entry holding the token ids of the
        abstracts truncated to MAX_TARGET tokens.
    """
    encoded = tokenizer(
        batch["description"],
        truncation=True,
        max_length=MAX_INPUT,
    )
    label_encoding = tokenizer(
        batch["abstract"],
        truncation=True,
        max_length=MAX_TARGET,
    )
    # Only the target token ids are kept from the abstract encoding.
    encoded["labels"] = label_encoding["input_ids"]
    return encoded

# Tokenize the whole training subset in batches and drop every original
# column so only the fields returned by `preprocess` remain.
tokenized_train = train_small.map(
    preprocess,
    remove_columns=train_small.column_names,
    batched=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq
import time

# Quick fine-tuning pass over a small slice of the tokenized training data.
#
# Run configuration: everything a reader might want to tune lives here.
SUBSET_SIZE = 800    # rows of tokenized_train to draw batches from (try 500–800)
MAX_BATCHES = 300    # hard cap on the number of batches trained per epoch
BATCH_SIZE  = 1
EPOCHS      = 1
# NOTE(review): the recorded output of this cell shows the running average
# loss rising during the first ~50 steps; a lower LR, warmup, or a larger
# effective batch may be needed — TODO confirm.
LR          = 5e-5
SEED        = 42     # makes shuffle order and dropout repeatable across runs

torch.manual_seed(SEED)

# 1) Take a smaller subset for now. The size is clamped so that a dataset
#    with fewer than SUBSET_SIZE rows does not make `select` fail.
tokenized_small = tokenized_train.select(
    range(min(SUBSET_SIZE, len(tokenized_train)))
)

# 2) Choose device: if GPU keeps OOM-ing, use "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Batch collation is delegated to the seq2seq data collator.
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_loader = DataLoader(
    tokenized_small,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collator,
    # Dedicated generator so the shuffle order depends only on SEED.
    generator=torch.Generator().manual_seed(SEED),
)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

model.train()
step = 0                  # optimizer steps taken across all epochs
last_print = time.time()

for epoch in range(EPOCHS):
    total_loss = 0.0
    # Steps taken in this epoch only. The averages below must divide by this
    # (not by the global `step`), because `total_loss` is reset every epoch.
    epoch_steps = 0

    for batch_idx, batch in enumerate(train_loader):
        # hard cap: only train on the first MAX_BATCHES batches
        if batch_idx >= MAX_BATCHES:
            break

        # move to device
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        step += 1
        epoch_steps += 1
        total_loss += loss.item()

        # Drop references so this step's tensors can be reclaimed before the
        # next forward pass. torch.cuda.empty_cache() is intentionally not
        # called per step: it does not free memory held by live tensors and
        # only adds allocator overhead.
        del batch, outputs, loss

        # heartbeat every ~20 seconds
        if time.time() - last_print > 20:
            print(f"[Epoch {epoch+1}] Step {step}, Batch {batch_idx+1}, Loss {total_loss/epoch_steps:.4f}")
            last_print = time.time()

    avg_loss = total_loss / max(1, epoch_steps)
    print(f"Epoch {epoch+1} finished. Avg loss: {avg_loss:.4f}")


[Epoch 1] Step 1, Batch 1, Loss 0.8822
[Epoch 1] Step 3, Batch 3, Loss 2.7765
[Epoch 1] Step 5, Batch 5, Loss 3.4779
[Epoch 1] Step 7, Batch 7, Loss 3.6577
[Epoch 1] Step 10, Batch 10, Loss 3.6155
[Epoch 1] Step 12, Batch 12, Loss 3.8042
[Epoch 1] Step 14, Batch 14, Loss 3.8242
[Epoch 1] Step 16, Batch 16, Loss 3.9589
[Epoch 1] Step 18, Batch 18, Loss 3.9964
[Epoch 1] Step 20, Batch 20, Loss 4.1191
[Epoch 1] Step 22, Batch 22, Loss 4.1347
[Epoch 1] Step 24, Batch 24, Loss 4.0970
[Epoch 1] Step 26, Batch 26, Loss 4.1112
[Epoch 1] Step 28, Batch 28, Loss 4.1644
[Epoch 1] Step 30, Batch 30, Loss 4.2161
[Epoch 1] Step 32, Batch 32, Loss 4.2817
[Epoch 1] Step 34, Batch 34, Loss 4.3913
[Epoch 1] Step 36, Batch 36, Loss 4.4689
[Epoch 1] Step 38, Batch 38, Loss 4.5239
[Epoch 1] Step 40, Batch 40, Loss 4.5675
[Epoch 1] Step 42, Batch 42, Loss 4.6095
[Epoch 1] Step 44, Batch 44, Loss 4.6638
[Epoch 1] Step 46, Batch 46, Loss 4.6866
[Epoch 1] Step 48, Batch 48, Loss 4.7082
[Epoch 1] Step 50, Batch

In [5]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# Drive directory so they can be reloaded together later.
save_path = "/content/drive/MyDrive/Patent_Data/distilbart_bigpatent_final"

# Model first, then tokenizer — both expose the same save_pretrained API.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print("Model saved to:", save_path)




Model saved to: /content/drive/MyDrive/Patent_Data/distilbart_bigpatent_final


In [6]:
# Qualitative check: generate summaries for the first three rows of
# train_small and print each next to its reference abstract.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Decoding settings shared by every example: beam search with 4 beams,
# output capped at MAX_TARGET tokens.
generation_kwargs = {"max_length": MAX_TARGET, "num_beams": 4}

for i in range(3):
    ex = train_small[i]

    # Encode the description with the same truncation limit used in training.
    inputs = tokenizer(
        ex["description"],
        truncation=True,
        max_length=MAX_INPUT,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    # No gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, **generation_kwargs)

    # Only the first returned sequence is decoded.
    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(f"=== EXAMPLE {i} ===")
    print("GOLD:", ex["abstract"][:400])  # reference abstract, first 400 chars
    print("PRED:", pred)
    print()


=== EXAMPLE 0 ===
GOLD: An apparatus for recording and reproducing video and audio signals in either analog or digital form. The heads are arranged to conform with existing analog standards, such as VHS, and yet are also able to record in digital form in adjacent non-overlapping tracks. In a preferred embodiment, one pair of heads is arranged with a small azimuth angle (+/-6 degrees), and another pair is arranged with a 
PRED: The present invention provides a method and apparatus for the use of a computer system. The computer system includes a system, a system and a network network network, a network, and an network network. The network network includes a network system, the network network system and the network system. A network network and network

=== EXAMPLE 1 ===
GOLD: A system, method and program product for interactively scheduling and negotiating meetings wherein an active agent program accepts meeting criteria from a meeting requester and interacts with invitees to resolve av