In [1]:
!pip install transformers accelerate sentencepiece pandas openpyxl



In [4]:
#sometimes model.safetensors doesn't like installing and freezes, interrupt and restart the run and it should install on the second run
from transformers import AutoTokenizer, pipeline

model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

summarizer = pipeline(
    "summarization",
    model=model_name,
    tokenizer=model_name,
    device=0
)


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]


Saving us_smallest_claims_1985_1990_top500.xlsx to us_smallest_claims_1985_1990_top500.xlsx


In [6]:
def chunk_text(text, limit=600):
    ids = tokenizer.encode(text)
    out = []
    for i in range(0, len(ids), limit):
        out.append(ids[i:i+limit])
    return out

def summarize_long(text):
    chunks = chunk_text(text)
    parts = []
    for ids in chunks:
        block = tokenizer.decode(ids, skip_special_tokens=True)
        s = summarizer(block, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
        parts.append(s)
    if len(parts) == 1:
        return parts[0]
    merged = " ".join(parts)
    return summarizer(merged, max_length=180, min_length=50, do_sample=False)[0]["summary_text"]

In [7]:
import pandas as pd

df = pd.read_excel(filename)


In [8]:
import ast

def safe_load(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except:
            return []
    return val if isinstance(val, list) else []


def extract(row):
    t = safe_load(row.get("title_localized", []))
    d = safe_load(row.get("abstract_localized", []))
    c = safe_load(row.get("claims_localized", []))

    title = t[0].get("text", "") if t else ""
    desc = d[0].get("text", "") if d else ""
    claims = " ".join(x.get("text", "") for x in c) if c else ""

    return title, desc, claims


In [9]:
SUMMARY_MAX = 120
SUMMARY_MIN = 40



In [10]:
results = []

for _, row in df.iterrows():
    title, desc, claims = extract(row)
    if not desc:
        continue

    combined = f"TITLE:\n{title}\n\nDESCRIPTION:\n{desc}\n\nCLAIMS:\n{claims}"
    summary = summarize_long(combined)

    results.append(dict(
        title=title,
        description=desc,
        claims=claims,
        summary=summary
    ))


Your max_length is set to 180, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 180, but your input_length is only 168. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)
Your max_length is set to 180, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your max_length is set to 150, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are

In [11]:
from google.colab import files
import json

filename = input("Enter output filename (for example: summarized_patents.json): ").strip()
if not filename:
    filename = "output.json"

with open(filename, "w") as f:
    json.dump(results, f, indent=2)

print(f"Wrote {filename}")
files.download(filename)


Enter output filename (for example: summarized_patents.json): summarized_patents.json
Wrote summarized_patents.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>