In [33]:
# !pip install -q datasets transformers torch

In [1]:
from datasets import load_dataset, get_dataset_config_names
import torch
import json
import time
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/opt/anaconda3/envs/tbi/lib/python3.11/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/envs/tbi/lib/python3.11/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/envs/tbi/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/tbi/lib/python3.11/site-packages/traitlets/config/application.py", l

In [2]:
configs = get_dataset_config_names("ccdv/arxiv-summarization")

In [3]:
configs

['document', 'section']

In [4]:
# ds_section = load_dataset("ccdv/arxiv-summarization", "section", split='train')
# ds_doc = load_dataset("ccdv/arxiv-summarization", "document", split='train')

In [5]:
# ds_section, ds_doc

In [6]:
# # caching a few samples to use in the future to avoid loading from HF every time
# sample_sz = 30
# section_samples = ds_section.shuffle(seed=42).select(range(sample_sz))
# doc_samples = ds_doc.shuffle(seed=42).select(range(sample_sz))

# with open("section_samples.json", "w", encoding="utf-8") as f:
#     json.dump(section_samples[:], f, indent=2)

# with open("doc_samples.json", "w", encoding="utf-8") as f:
#     json.dump(doc_samples[:], f, indent=2)

### LLM functionality

In [88]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load summarization model
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [89]:
def run_summarizer(prompt, tokenizer=tokenizer, model=model,
                   max_input_length=1024, min_output_length=20, max_output_length=200):
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        min_length=min_output_length,
        max_length=max_output_length,
        do_sample=False,
        num_beams=4,
        early_stopping=True,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

In [83]:
# works pretty well
input_text = (
    "My name is Gabe. I am 29 years old and live in DC. "
    "I'm currently on the job market, looking for opportunities in data science or ML engineering. "
    "I enjoy working on AI problems that have real-world social impact, particularly in public policy."
)

start = time.time()
rv = run_summarizer(input_text)
print(f"\nSummary: {rv}")
print(f"\nTook {time.time() - start:.2f} seconds")

My name is Gabe. I am 29 years old and live in DC. I'm currently on the job market, looking for opportunities in data science or ML engineering. I enjoy working on AI problems that have real-world social impact, particularly in public policy.

Summary: Gabe, 29, is looking for opportunities in data science or ML engineering. He enjoys working on AI problems that have real-world social impact.

Took 58.88 seconds


In [91]:
# Load a cached document
with open("section_samples.json", "r", encoding="utf-8") as f:
    section_samples = json.load(f)

section_article = section_samples['article'][0]
section_abstract = section_samples['abstract'][0]

user_prompt = "Summarize the main idea of this paper in one paragraph."

prompt = f"{user_prompt}\n\nDocument:\n{section_article}"

In [None]:
start = time.time()
rv = run_summarizer(prompt)
print(f"\nSummary: {rv}")
print(f"\nTook {time.time() - start:.2f} seconds")

### would need diff model for the below

In [84]:
# facebook bart cnn is specifically trained to summarize long docs, this won't work well
input_text = "Can you summarize what dropout is and how it is used in deep learning"

start = time.time()
rv = run_summarizer(input_text)
print(f"\nSummary: {rv}")
print(f"\nTook {time.time() - start:.2f} seconds")

Can you summarize what dropout is and how it is used in deep learning

Summary: Can you summarize what dropout is and how it is used in deep learning? Share your thoughts in the comments below.

Took 57.50 seconds
