In [1]:
import csv
import ast
import pandas as pd
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import json
import torch
from datetime import datetime
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from evaluate import load as load_metric
from transformers import TrainerCallback


class PrintStepCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:
            loss = state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'
            print(f"Step {state.global_step}, Loss: {loss}")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_row_text(s):
    lst = ast.literal_eval(s)
    json_str = json.dumps(lst)
    obj = json.loads(json_str)
    return obj[0]['text']

In [3]:
# ================================================================
# 0. Prepare Data
# ================================================================
FILE_NAME = 'us_smallest_claims_1985_1990_top500'
CSV_PATH = './' + FILE_NAME + '.csv'
JSON_PATH = './' + FILE_NAME + '.json'

df = pd.read_csv(CSV_PATH)
df.drop(columns=['n_claims', 'publication_number', 'publication_date', 'claims_localized_html'], inplace=True)
df = df.map(clean_row_text)
df.to_json(JSON_PATH, orient='records')

In [4]:
TITLE = 'title'
DESCRIPTION = 'description'
SUMMARY = 'summary'

In [5]:
# ================================================================
# 1. Load Data
# ================================================================
# DATA_PATH = "./us_smallest_claims_1985_1990_top500.json"
DATA_PATH = "./batch_summarized.json"

with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

data = data[:10]

print(f"Loaded {len(data)} patents with real summaries")

Loaded 10 patents with real summaries


In [6]:
records = []
for i, item in enumerate(data):
    if SUMMARY in item and DESCRIPTION in item:
        records.append({
            DESCRIPTION: item[DESCRIPTION],
            TITLE: item[TITLE],
            SUMMARY: item[SUMMARY]
        })
        if i < 3:
            print(f"\nExample {i+1}: {item[TITLE]}")
            print(f"  Summary: {item[SUMMARY][:150]}...")

print(f"\nTotal records: {len(records)}")


Example 1: Vehicular turn signal apparatus
  Summary: Vehicular turn signal apparatus is provided for securement to a rear shelf surface proximate a rear window of a vehicular interior. The apparatus incl...

Example 2: Apparatus for facilitating the machining of workpieces
  Summary: Apparatus for facilitating the machining of workpieces. Means to gain access to the cutting edge of a saw tooth in the direction of both the back and ...

Example 3: Control valve
  Summary: A 3/2 proportional control valve is provided with an actuating piston which is subjected to the control pressure set in a pilot valve. The area relati...

Total records: 10


In [7]:
target_device = ''
if torch.backends.mps.is_available():
    target_device = 'mps'
elif torch .cuda.is_available():
    target_device = 'cuda'
else:
    target_device = 'cpu'

DEVICE = torch.device(target_device)
print("Using DEVICE:", DEVICE)

Using DEVICE: mps


In [8]:
# ================================================================
# 2. Load Model/Tokenizer
# ================================================================
model_name = "Qwen/Qwen3-0.6B-Base"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    # use bfloat16 only if CUDA is available; otherwise use float32
    torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else torch.float32),
)
model.to(DEVICE)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

`torch_dtype` is deprecated! Use `dtype` instead!


In [9]:
MAX_LENGTH = 512
MAX_TEXT_TOKENS = 175

In [10]:
# ================================================================
# 3. Create Dataset
# ================================================================

def truncate_text(text, max_tokens):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        text = tokenizer.decode(tokens, skip_special_tokens=True)
    return text

dataset = Dataset.from_list(records)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(f"\nTrain: {len(dataset['train'])}, Test: {len(dataset['test'])}")
print(dataset['test'][0])


Train: 8, Test: 2
{'description': 'A reliable, pulse-flow supplemental oxygen apparatus for alleviating respiratory ailments is provided which yields substantial savings in oxygen while giving the patient the physiological equivalent of a prescribed continuous stream of oxygen. The apparatus preferably includes a demand oxygen valve operated in a pulse mode by means of electronic control circuitry which, through an appropriate sensor, monitors the patient&#39;s breathing efforts and gives a variable &#34;custom tailored&#34; pulse volume of oxygen to the patient during the very initial stages of each inspiration.', 'title': 'Inspiration oxygen saver', 'summary': 'A reliable, pulse-flow supplemental oxygen apparatus for alleviating respiratory ailments is provided. It yields substantial savings in oxygen while giving the patient the physiological equivalent of a prescribed continuous stream of oxygen. It preferably includes a demand oxygen valve operated in a pulse mode.'}


In [11]:
# ================================================================
# 4. Preprocessing
# ================================================================
def preprocess(batch):
    input_ids_list = []
    attention_list = []
    labels_list = []

    for summary, description in zip(batch[SUMMARY], batch[DESCRIPTION]):
        summary = truncate_text(summary, MAX_TEXT_TOKENS)
        
        # prompt = f"Summarize this patent:\n\n{text}\n\nSummary: "
        prompt = f"Generate a full detailed patent document based on this summary: \n\n{summary}\n\n Patent Document:"
        target = description + tokenizer.eos_token
        full_text = prompt + target

        target_ids = tokenizer.encode(target, add_special_tokens=False)
        target_len = len(target_ids)

        tokenized = tokenizer(
            full_text,
            truncation=True,
            max_length=MAX_LENGTH,
            padding="max_length",
            add_special_tokens=True,
        )

        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        seq_len = sum(attention_mask)

        labels = [-100] * MAX_LENGTH
        target_start = seq_len - target_len
        
        for i in range(target_len):
            pos = target_start + i
            if 0 <= pos < MAX_LENGTH:
                labels[pos] = input_ids[pos]

        input_ids_list.append(input_ids)
        attention_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_list,
        "labels": labels_list
    }

tokenized_train = dataset["train"].map(
    preprocess, 
    batched=True,
    remove_columns=dataset["train"].column_names
)

example = tokenized_train[0]
valid_count = sum(1 for l in example["labels"] if l != -100)
print(f"Valid label tokens: {valid_count}")

def collate_fn(batch):
    return {
        "input_ids": torch.tensor([x["input_ids"] for x in batch], dtype=torch.long),
        "attention_mask": torch.tensor([x["attention_mask"] for x in batch], dtype=torch.long),
        "labels": torch.tensor([x["labels"] for x in batch], dtype=torch.long),
    }

Map: 100%|██████████| 8/8 [00:00<00:00, 429.28 examples/s]

Valid label tokens: 84





In [12]:
MAX_NEW_TOKENS = 1000

In [None]:
# ================================================================
# 5. Baseline Evaluation
# ================================================================
rouge = load_metric("rouge")

def generate_summary(mdl, text):
    text = truncate_text(text, MAX_TEXT_TOKENS)
    # prompt = f"Summarize this patent:\n\n{text}\n\nSummary:"
    prompt = f"Generate a full patent document based on this summary: \n\n{text}\n\n Patent Document:"
    inputs = tokenizer(prompt, return_tensors="pt")#.to("cuda")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}  # move to DEVICE (CPU or GPU)

    mdl.eval()
    with torch.no_grad():
        output = mdl.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Patent Document:" in full_output:
        return full_output.split("Patent Document:")[-1].strip()
    return full_output

# test_refs = [item[SUMMARY] for item in dataset["test"]]
test_refs = [item[DESCRIPTION] for item in dataset["test"]]

print("\nBaseline evaluation...")
baseline_preds = []
for item in dataset["test"]:
    pred = generate_summary(model, item[DESCRIPTION])
    baseline_preds.append(pred)

baseline_rouge = rouge.compute(predictions=baseline_preds, references=test_refs)
print(f"Baseline ROUGE-L: {baseline_rouge['rougeL']:.4f}")

The following generation flags are not valid and may be ignored: ['length_penalty']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Baseline evaluation...
Baseline ROUGE-L: 0.0362


In [14]:
# ================================================================
# 6. LoRA Fine-tuning
# ================================================================
config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="./qwen_lora_patent_real",
    num_train_epochs=50,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    bf16=torch.cuda.is_available(),               # enable bf16 only on CUDA
    dataloader_pin_memory=torch.cuda.is_available(),  # only pin when CUDA available
    logging_steps=10,
    save_steps=999999,
    report_to="none",
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=collate_fn,
    callbacks=[PrintStepCallback()]
)

print("\n=== Training ===")
trainer.train()

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 40,370,176 || all params: 636,420,096 || trainable%: 6.3433

=== Training ===


Step,Training Loss
10,1.3866


Step 10, Loss: N/A


KeyboardInterrupt: 

In [None]:
# ================================================================
# 7. Final Evaluation
# ================================================================
print("\nFinal evaluation...")
finetuned_preds = []

model.eval()
for item in dataset["test"]:
    pred = generate_summary(model, item[DESCRIPTION])
    finetuned_preds.append(pred)

finetuned_rouge = rouge.compute(predictions=finetuned_preds, references=test_refs)


Final evaluation...


In [None]:
# ================================================================
# 8. SAVE MODEL PROPERLY
# ================================================================
print("\n" + "="*70)
print("SAVING MODEL")
print("="*70)

# Save LoRA adapters
model.save_pretrained("./qwen_lora_patent_real")
print("✓ LoRA adapters saved")

# Save tokenizer
tokenizer.save_pretrained("./qwen_lora_patent_real")
print("✓ Tokenizer saved")

# Save comprehensive metadata
metadata = {
    "model_info": {
        "base_model": model_name,
        "model_type": "LoRA_fine-tuned",
        "task": "patent_summarization"
    },
    "training_info": {
        "training_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "dataset": DATA_PATH,
        "num_train_examples": len(dataset["train"]),
        "num_test_examples": len(dataset["test"]),
        "num_epochs": 10,
        "batch_size": 2,
        "gradient_accumulation_steps": 4,
        "effective_batch_size": 8,
        "learning_rate": 1e-4,
        "max_length": MAX_LENGTH,
        "max_text_tokens": MAX_TEXT_TOKENS
    },
    "lora_config": {
        "r": 32,
        "lora_alpha": 64,
        "lora_dropout": 0.05,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    },
    "results": {
        "baseline": {
            "rouge1": float(baseline_rouge['rouge1']),
            "rouge2": float(baseline_rouge['rouge2']),
            "rougeL": float(baseline_rouge['rougeL'])
        },
        "finetuned": {
            "rouge1": float(finetuned_rouge['rouge1']),
            "rouge2": float(finetuned_rouge['rouge2']),
            "rougeL": float(finetuned_rouge['rougeL'])
        },
        "improvement": {
            "rouge1": float(finetuned_rouge['rouge1'] - baseline_rouge['rouge1']),
            "rouge2": float(finetuned_rouge['rouge2'] - baseline_rouge['rouge2']),
            "rougeL": float(finetuned_rouge['rougeL'] - baseline_rouge['rougeL'])
        }
    }
}

with open("./qwen_lora_patent_real/metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
print("✓ Metadata saved")

# Save sample predictions for reference
samples = []
for i in range(min(5, len(test_refs))):
    samples.append({
        TITLE: dataset['test'][i][TITLE],
        "reference": test_refs[i],
        "baseline": baseline_preds[i],
        "finetuned": finetuned_preds[i]
    })

with open("./qwen_lora_patent_real/sample_predictions.json", "w") as f:
    json.dump(samples, f, indent=2, ensure_ascii=False)
print("✓ Sample predictions saved")

print(f"\n✓ Complete model package saved to: ./qwen_lora_patent_real/")
print("\nSaved files:")
print("  - adapter_model.bin (~13MB)")
print("  - adapter_config.json")
print("  - tokenizer files")
print("  - metadata.json")
print("  - sample_predictions.json")


SAVING MODEL
✓ LoRA adapters saved
✓ Tokenizer saved
✓ Metadata saved
✓ Sample predictions saved

✓ Complete model package saved to: ./qwen_lora_patent_real/

Saved files:
  - adapter_model.bin (~13MB)
  - adapter_config.json
  - tokenizer files
  - metadata.json
  - sample_predictions.json


In [None]:
# ================================================================
# 9. Display Results
# ================================================================
print("\n" + "="*70)
print("FINAL RESULTS")
print("="*70)
print(f"Baseline ROUGE-L:   {baseline_rouge['rougeL']:.4f}")
print(f"Fine-tuned ROUGE-L: {finetuned_rouge['rougeL']:.4f}")
delta = finetuned_rouge['rougeL'] - baseline_rouge['rougeL']
print(f"Change: {'+' if delta >= 0 else ''}{delta:.4f} ({delta*100:+.2f}%)")

print(f"\nDetailed scores:")
print(f"  ROUGE-1: {baseline_rouge['rouge1']:.4f} -> {finetuned_rouge['rouge1']:.4f}")
print(f"  ROUGE-2: {baseline_rouge['rouge2']:.4f} -> {finetuned_rouge['rouge2']:.4f}")
print(f"  ROUGE-L: {baseline_rouge['rougeL']:.4f} -> {finetuned_rouge['rougeL']:.4f}")

print("\n=== Sample Comparisons ===")
for i in range(min(3, len(test_refs))):
    print(f"\n--- {dataset['test'][i][TITLE]} ---")
    print(f"Reference:  {test_refs[i][:]}")
    print(f"Baseline:   {baseline_preds[i][:]}")
    print(f"Fine-tuned: {finetuned_preds[i][:]}")

print("\n" + "="*70)
print("TRAINING COMPLETE!")
print("="*70)


FINAL RESULTS
Baseline ROUGE-L:   0.0362
Fine-tuned ROUGE-L: 0.6993
Change: +0.6631 (+66.31%)

Detailed scores:
  ROUGE-1: 0.0362 -> 0.7469
  ROUGE-2: 0.0000 -> 0.6095
  ROUGE-L: 0.0362 -> 0.6993

=== Sample Comparisons ===

--- Inspiration oxygen saver ---
Reference:  A reliable, pulse-flow supplemental oxygen apparatus for alleviating respiratory ailments is provided which yields substantial savings in oxygen while giving the patient the physiological equivalent of a prescribed continuous stream of oxygen. The apparatus preferably includes a demand oxygen valve operated in a pulse mode by means of electronic control circuitry which, through an appropriate sensor, monitors the patient&#39;s breathing efforts and gives a variable &#34;custom tailored&#34; pulse volume of oxygen to the patient during the very initial stages of each inspiration.
Baseline:   [Title of the Patent Document]  
[Date of Publication]  
[Patent Number]  

[Abstract]  
[Summary of the invention]  

[Background]