## [Research] End-to-End NLP Pipeline Test

This notebook replicates the end-to-end pipeline to test:

- PDF text extraction

- Named Entity Recognition (NER)

- Topic modeling

- Text summarization

### 1. Imports + Configs

In [None]:
import re
from pathlib import Path

import pymupdf
import justsdk

from span_marker import SpanMarkerModel
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from transformers import pipeline

# Project root is the parent of the directory the notebook is launched from.
ROOT = Path.cwd().parent
# All input data lives under <ROOT>/data.
DATA_DIR = ROOT.joinpath("data")
# Sample PDFs used by this pipeline test.
SAMPLE_DIR = DATA_DIR.joinpath("sample", "pdf")
# The single document that is pushed through every stage below.
SAMPLE_PATH = SAMPLE_DIR.joinpath("01-agile-methodology.pdf")

### 2. Load models

In [None]:
# Registry of the models shared by the later cells, keyed by pipeline stage.
# Each model is loaded once here, in the order embedder -> NER -> summarizer.
models = dict(
    # Sentence embedding model handed to BERTopic in the topic-modeling cell.
    embedder=SentenceTransformer("all-MiniLM-L6-v2"),
    # SpanMarker checkpoint used for named entity recognition.
    ner=SpanMarkerModel.from_pretrained(
        "tomaarsen/span-marker-roberta-large-ontonotes5"
    ),
    # Summarization pipeline; device=-1 asks transformers to run on CPU.
    summarizer=pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=-1,
    ),
)

### 3. Extract text from sample PDF

In [None]:
# Open the sample PDF in a context manager so the document handle is released
# even if text extraction raises part-way through the pages.
with pymupdf.open(SAMPLE_PATH) as doc:
    # Concatenate the plain text of every page, in page order.
    text = "".join(page.get_text() for page in doc)

# Flatten the extracted text onto a single line: trim the ends and turn
# every line break into a space.
text_clean = text.strip().replace("\n", " ").replace("\r", " ")
justsdk.print_info(f"Extracted {len(text_clean)} characters", newline_before=True)

### 4. Perform Named Entity Recognition (NER)

In [None]:
# SpanMarker truncates each input to the model's maximum sequence length, so
# passing the whole document as one string would only tag its beginning.
# NOTE(review): confirm the exact token limit for this checkpoint.
# Split on sentence-ending punctuation followed by whitespace and tag each
# sentence separately instead.
ner_sentences = [s for s in re.split(r"(?<=[.!?])\s+", text_clean) if s]

# predict() on a single string returns a list of entity dicts; flatten the
# per-sentence results into one list. Only "span" and "label" are used below;
# any character offsets in the dicts are relative to the sentence, not the document.
entities = [
    entity
    for sentence in ner_sentences
    for entity in models["ner"].predict(sentence)
]

# Group the entity surface forms by the labels this test cares about.
orgs = [e["span"] for e in entities if e["label"] == "ORG"]
locations = [e["span"] for e in entities if e["label"] == "GPE"]
people = [e["span"] for e in entities if e["label"] == "PERSON"]

# Show only the first three hits per group to keep the output short.
justsdk.print_info(f"Found {len(entities)} entities:")
print(f"  Organizations: {orgs[:3]}")
print(f"  Locations: {locations[:3]}")
print(f"  People: {people[:3]}")

### 5. Topic modeling with BERTopic

In [None]:
# Upper bound on the number of sentences fed to BERTopic in this test.
MAX_TOPIC_SENTENCES = 8
# Fixed seed so UMAP's stochastic projection yields the same topics on re-run.
UMAP_RANDOM_STATE = 42

# Naive sentence split on "."; keep only fragments longer than 50 characters.
sentences = [s.strip() for s in text_clean.split(".") if len(s.strip()) > 50]
# The subset that is actually modeled; the report below refers to this subset.
topic_sentences = sentences[:MAX_TOPIC_SENTENCES]

if len(topic_sentences) >= 3:
    umap_model = UMAP(
        n_neighbors=2,
        n_components=2,
        min_dist=0.0,
        metric="cosine",
        random_state=UMAP_RANDOM_STATE,
    )
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric="euclidean")

    topic_model = BERTopic(
        embedding_model=models["embedder"],
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=False,
    )
    topics, _ = topic_model.fit_transform(topic_sentences)

    # BERTopic assigns outlier documents to topic -1; that is not a real
    # topic, so leave it out of the count.
    n_topics = len(set(topics) - {-1})
    justsdk.print_info(
        f"Discovered {n_topics} topics from {len(topic_sentences)} sentences"
    )
else:
    topics = []
    justsdk.print_warning(
        "Not enough sentences to discover topics, skipping topic modeling."
    )

### 6. Summarize text in chunks

In [None]:
def chunk_text(text: str, max_words: int = 800) -> list[str]:
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk is those
    words re-joined with single spaces, so original whitespace is normalized.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk. Must be at least 1.

    Returns:
        The chunks in document order. Empty or whitespace-only ``text`` yields
        an empty list. The final chunk may hold fewer than ``max_words`` words.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # A negative step would make range() empty and silently drop all the text,
    # so reject non-positive sizes explicitly.
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words}")

    words = text.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]


# Chunks with fewer words than this are kept verbatim instead of summarized:
# the length budget below is a third of the word count, which collapses to
# 0 or 1 for a tiny trailing chunk and is not a usable max_length.
MIN_WORDS_TO_SUMMARIZE = 30

chunks = chunk_text(text_clean)
justsdk.print_info(f"Split document into {len(chunks)} chunks")

chunk_summaries = []
for i, chunk in enumerate(chunks):
    n_words = len(chunk.split())
    if n_words < MIN_WORDS_TO_SUMMARIZE:
        # Already short enough; pass it through unchanged.
        summary_text = chunk
    else:
        # Aim for about a third of the chunk, capped at 100 tokens, with a
        # minimum of half that budget (capped at 20).
        max_len = min(100, n_words // 3)
        min_len = min(20, max_len // 2)
        summary = models["summarizer"](
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # An 800-word chunk can tokenize to more than the model accepts;
            # truncate the input rather than fail on over-long sequences.
            truncation=True,
        )
        summary_text = summary[0]["summary_text"]
    chunk_summaries.append(summary_text)
    print(f"  Chunk {i + 1}: {summary_text}")

combined_summary = " ".join(chunk_summaries)
# Put each sentence on its own line. Matching ". " (period + space) avoids
# splitting decimals such as "3.5" and avoids a leading space on every line.
combined_summary = combined_summary.replace(". ", ".\n")

justsdk.print_success(
    f"Summary ({len(combined_summary.split())} words):\n {combined_summary}",
    newline_before=True,
)