# Required Imports & Configuration

In [36]:
import ast
import pandas as pd
from peft import PeftModel

import json
import torch
from datetime import datetime
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from peft import LoraConfig, get_peft_model
from evaluate import load as load_metric
from transformers import TrainerCallback


class PrintStepCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:
            loss = state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'
            print(f"Step {state.global_step}, Loss: {loss}")

            

In [37]:
target_device = ''
if torch.backends.mps.is_available():
    target_device = 'mps'
elif torch .cuda.is_available():
    target_device = 'cuda'
else:
    target_device = 'cpu'

DEVICE = torch.device(target_device)
print("Using DEVICE:", DEVICE)

Using DEVICE: mps


# Constants/Hyperparameters

In [38]:
# For the incoming .json dataset
TITLE = 'title'
DESCRIPTION = 'description'
SUMMARY = 'summary'

MODEL_NAME = "Qwen/Qwen3-0.6B-Base"

DATA_PATH = "./new-fifty-patents/patents_no_claims.json"
DATA_SLICE = None

PROMPT_PATH = "./prompt-construction/patent_prompts.json"
PROMPT_SLICE = 10 # Uses prompts[0:PROMPT_SLICE]. Set to None to use all prompts

MODEL_PATH = "./qwen_lora_patent_real"

# Model hyperparameters
MAX_LENGTH = 512        # len
MAX_TEXT_TOKENS = 350   # tt

MAX_NEW_TOKENS = 1000   # nt
EPOCHS = 30             # ep

# Utility Functions

In [39]:
def clean_row_text(s):
    lst = ast.literal_eval(s)
    json_str = json.dumps(lst)
    obj = json.loads(json_str)
    return obj[0]['text']

In [40]:
def truncate_text(text, tokenizer, max_tokens):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        text = tokenizer.decode(tokens, skip_special_tokens=True)
    return text

In [41]:
def preprocess(batch, tokenizer):
    input_ids_list = []
    attention_list = []
    labels_list = []

    for summary, description in zip(batch[SUMMARY], batch[DESCRIPTION]):
        summary = truncate_text(summary, tokenizer, MAX_TEXT_TOKENS)
        
        # prompt = f"Summarize this patent:\n\n{text}\n\nSummary: "
        prompt = f"Generate a full detailed patent document based on this summary: \n\n{summary}\n\n Patent Document:"
        target = description + tokenizer.eos_token
        full_text = prompt + target

        target_ids = tokenizer.encode(target, add_special_tokens=False)
        target_len = len(target_ids)

        tokenized = tokenizer(
            full_text,
            truncation=True,
            max_length=MAX_LENGTH,
            padding="max_length",
            add_special_tokens=True,
        )

        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        seq_len = sum(attention_mask)

        labels = [-100] * MAX_LENGTH
        target_start = seq_len - target_len
        
        for i in range(target_len):
            pos = target_start + i
            if 0 <= pos < MAX_LENGTH:
                labels[pos] = input_ids[pos]

        input_ids_list.append(input_ids)
        attention_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_list,
        "labels": labels_list
    }

In [42]:
def generate_summary(mdl, text, tokenizer):
    text = truncate_text(text, tokenizer, MAX_TEXT_TOKENS)
    # prompt = f"Summarize this patent:\n\n{text}\n\nSummary:"
    prompt = f"Generate a full patent document based on this summary: \n\n{text}\n\n Patent Document:"
    inputs = tokenizer(prompt, return_tensors="pt")#.to("cuda")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}  # move to DEVICE (CPU or GPU)

    mdl.eval()
    with torch.no_grad():
        output = mdl.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Patent Document:" in full_output:
        return full_output.split("Patent Document:")[-1].strip()
    return full_output

In [43]:
def collate_fn(batch):
    return {
        "input_ids": torch.tensor([x["input_ids"] for x in batch], dtype=torch.long),
        "attention_mask": torch.tensor([x["attention_mask"] for x in batch], dtype=torch.long),
        "labels": torch.tensor([x["labels"] for x in batch], dtype=torch.long),
    }

# Load Model

In [44]:
base_model_name = MODEL_NAME
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # or torch.bfloat16 if CUDA available
)

In [45]:
# 2. Load LoRA adapters on top of base model
model = PeftModel.from_pretrained(model, MODEL_PATH)

# 3. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# 4. Move to device
model.to(DEVICE)
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

# Load Data and Create Dataset

In [46]:
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

if DATA_SLICE:
    data = data[:DATA_SLICE]

print(f"Loaded {len(data)} patents with real summaries")

Loaded 50 patents with real summaries


In [47]:
records = []
for i, item in enumerate(data):
    if SUMMARY in item and DESCRIPTION in item:
        records.append({
            DESCRIPTION: item[DESCRIPTION],
            TITLE: item[TITLE],
            SUMMARY: item[SUMMARY]
        })
        if i < 3:
            print(f"\nExample {i+1}: {item[TITLE]}")
            print(f"  Summary: {item[SUMMARY][:150]}...")

print(f"\nTotal records: {len(records)}")


Example 1: Motorized pet door apparatus
  Summary: An electrically powered pet door assembly for use in a home or building. An apparatus as set forth in claim 6 wherein said trigger means includes a Sc...

Example 2: Power line to drive a vehicle
  Summary: A motor vehicle power line comprises a combustion engine (8), a gearbox (10) and an electronically controlled clutch (20) The engine throttle (84) is ...

Example 3: Refrigerant reclaim method and apparatus
  Summary: Refrigerant reclaim system includes a compressor, a heat exchanger, an oil separator, a condenser, a chill tank, a filter-dryer and a cooling coil in ...

Total records: 50


In [48]:
# ================================================================
# 3. Create Dataset
# ================================================================

dataset = Dataset.from_list(records)
# dataset = dataset.train_test_split(test_size=0.2, seed=42)
# print(f"\nTrain: {len(dataset['train'])}, Test: {len(dataset['test'])}")
# print(dataset['train'])
# print(dataset['test'][0])
print(dataset)

Dataset({
    features: ['description', 'title', 'summary'],
    num_rows: 50
})


In [55]:
# ================================================================
# 4. Preprocessing
# ================================================================
tokenized_train = dataset.map(
    lambda batch: preprocess(batch, tokenizer),
    batched=True,
    remove_columns=dataset.column_names
)

example = tokenized_train[0]
valid_count = sum(1 for l in example["labels"] if l != -100)
print(f"Valid label tokens: {valid_count}")

summaries = [item[SUMMARY] for item in dataset]
references = [item[DESCRIPTION] for item in dataset]

Map: 100%|██████████| 50/50 [00:00<00:00, 888.95 examples/s]

Valid label tokens: 89





# Test the Model

In [57]:
# 5. Load Evaluation Metric
rouge = load_metric("rouge")

# Test
predictions = []
comparisons = []
for summary, ref in zip(summaries, references):
    prompt = f"Generate a full detailed patent document based on this summary: \n\n{summary}\n\n Patent Document:"
    pred = generate_summary(model, prompt, tokenizer)

    finetuned_rouge = rouge.compute(predictions=[pred], references=[ref])
    comparisons.append(finetuned_rouge)

In [58]:
comparisons

[{'rouge1': np.float64(0.29464285714285715),
  'rouge2': np.float64(0.07207207207207207),
  'rougeL': np.float64(0.20535714285714285),
  'rougeLsum': np.float64(0.23214285714285715)},
 {'rouge1': np.float64(0.5635359116022098),
  'rouge2': np.float64(0.37222222222222223),
  'rougeL': np.float64(0.3314917127071823),
  'rougeLsum': np.float64(0.3314917127071823)},
 {'rouge1': np.float64(0.75625),
  'rouge2': np.float64(0.5220125786163522),
  'rougeL': np.float64(0.5125),
  'rougeLsum': np.float64(0.5125)},
 {'rouge1': np.float64(0.477815699658703),
  'rouge2': np.float64(0.3505154639175258),
  'rougeL': np.float64(0.40955631399317405),
  'rougeLsum': np.float64(0.40955631399317405)},
 {'rouge1': np.float64(0.351219512195122),
  'rouge2': np.float64(0.2660098522167488),
  'rougeL': np.float64(0.34146341463414637),
  'rougeLsum': np.float64(0.34146341463414637)},
 {'rouge1': np.float64(0.24242424242424246),
  'rouge2': np.float64(0.03125),
  'rougeL': np.float64(0.12121212121212123),
  'ro