In [3]:
from transformers import AutoTokenizer, BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

ARTICLE_TO_SUMMARIZE = (
    "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
    "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
)
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

# Generate Summary
summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'

In [3]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/bart-large",
)
model = AutoModelForMaskedLM.from_pretrained(
    "facebook/bart-large",
    attn_implementation="sdpa"
)
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)

print(f"The predicted token is: {predicted_token}")

The predicted token is:  energy


In [10]:
import json
from transformers import AutoTokenizer, pipeline

# Load BART (CPU-safe)
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

summarizer = pipeline(
    "summarization",
    model=model_name,
    tokenizer=model_name,
    device="cpu"  # avoid GPU issues
)

###########################################################
# Function: split into <=900 token chunks (safe for BART)
###########################################################
def chunk_text(text, max_tokens=900):
    tokens = tokenizer.encode(text)
    chunks = []
    
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i : i + max_tokens]
        chunks.append(chunk)
    
    return chunks

###########################################################
# Function: summarize long texts safely
###########################################################
def summarize_patent(text):
    chunks = chunk_text(text)

    partial_summaries = []
    for token_chunk in chunks:
        chunk_text_decoded = tokenizer.decode(token_chunk, skip_special_tokens=True)

        summary = summarizer(
            chunk_text_decoded,
            max_length=200,
            min_length=60,
            do_sample=False
        )[0]["summary_text"]

        partial_summaries.append(summary)

    # If many partial summaries exist â†’ summarize them too
    if len(partial_summaries) > 1:
        combined = " ".join(partial_summaries)
        final = summarizer(
            combined,
            max_length=250,
            min_length=80,
            do_sample=False
        )[0]["summary_text"]
        return final
    else:
        return partial_summaries[0]

###########################################################
# Load JSON
###########################################################
with open("/data/elugos/curr-fifty-patents.json", "r") as f:
    data = json.load(f)

results = []

for item in data:
    desc = item.get("description", "")
    if not desc or not isinstance(desc, str):
        continue
    
    summary = summarize_patent(desc)

    results.append({
        "original": desc,
        "summary": summary
    })

###########################################################
# Save
###########################################################
with open("curr-fifty-patents-summarized.json", "w") as f:
    json.dump(results, f, indent=4)

print("Finished summarizing all patents!")


Device set to use cpu
Your max_length is set to 250, but your input_length is only 138. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Your max_length is set to 250, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 250, but your input_length is only 147. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
Your max_length is set to 200, but your input_length is only 179. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer(

Finished summarizing all patents!
