In [1]:
!pip install datasets keybert sentence-transformers spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
!pip install --upgrade datasets fsspec


Collecting fsspec
  Using cached fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)


In [6]:
import pandas as pd
from datasets import load_dataset
dataset =load_dataset("cnn_dailymail", "3.0.0", split="test", download_mode="force_redownload")

# Load 200 English news summaries
#dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
#texts = [item['highlights'] for item in dataset[:200]]
texts = [item['highlights'] for item in dataset.select(range(200))]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [8]:
print(texts[:1])

['Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .\nIsrael and the United States opposed the move, which could open the door to war crimes investigations against Israelis .']


In [7]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import spacy

nlp = spacy.load("en_core_web_sm")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embedding_model)

def extract_with_candidates(text, top_n=5):
    doc = nlp(text)
    noun_phrases = list(set(chunk.text.strip() for chunk in doc.noun_chunks))
    return kw_model.extract_keywords(text, candidates=noun_phrases, top_n=top_n)

def extract_without_filter(text, top_n=5):
    return kw_model.extract_keywords(text, top_n=top_n)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
#comparison example

i = 0
print(f"\nTEXT:\n{texts[i]}\n")

print("🔸 With Candidate Filtering:\n")
for kw, score in extract_with_candidates(texts[i]):
    print(f" - {kw} ({score:.4f})")

print("\n🔹 Without Filtering:\n")
for kw, score in extract_without_filter(texts[i]):
    print(f" - {kw} ({score:.4f})")



TEXT:
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

🔸 With Candidate Filtering:





 - jurisdiction (0.4141)

🔹 Without Filtering:

 - palestinian (0.4718)
 - icc (0.4590)
 - jurisdiction (0.4141)
 - israel (0.3985)
 - israelis (0.3852)


In [10]:
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

def evaluate_keywords(texts, top_n=5):
    results = []

    for text in texts[:50]:  # You can increase the range
        try:
            # Extract keywords
            filtered = [kw[0] for kw in extract_with_candidates(text, top_n=top_n)]
            unfiltered = [kw[0] for kw in extract_without_filter(text, top_n=top_n)]

            # Token overlap
            set_filt = set(filtered)
            set_unfilt = set(unfiltered)
            jaccard = len(set_filt & set_unfilt) / len(set_filt | set_unfilt) if set_filt | set_unfilt else 0

            # Noun-based ground truth (proxy)
            doc = nlp(text)
            noun_tokens = [token.text.lower() for token in doc if token.pos_ == "NOUN"]
            most_common_nouns = set([w for w, _ in Counter(noun_tokens).most_common(10)])

            # Precision@N
            precision_filt = len(set_filt & most_common_nouns) / top_n
            precision_unfilt = len(set_unfilt & most_common_nouns) / top_n

            # Average keyword length
            avg_len_filt = np.mean([len(k.split()) for k in filtered])
            avg_len_unfilt = np.mean([len(k.split()) for k in unfiltered])

            results.append({
                "precision_filt": precision_filt,
                "precision_unfilt": precision_unfilt,
                "jaccard": jaccard,
                "len_filt": avg_len_filt,
                "len_unfilt": avg_len_unfilt
            })

        except Exception:
            continue

    return pd.DataFrame(results)

# Run evaluation
metrics_df = evaluate_keywords(texts)
print(metrics_df.mean().round(3))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

precision_filt      0.220
precision_unfilt    0.428
jaccard             0.076
len_filt            1.000
len_unfilt          1.000
dtype: float64
