In [1]:
import pandas as pd

In [2]:
# Load the pre-chunked corpus. The rendered output of the next cell shows one
# row per chunk with title, chunk_id, chunk_text and token_count columns.
# NOTE(review): the file name is spelled "samentic" (not "semantic") — confirm
# it matches the file on disk before renaming anything.
# NOTE(review): the "Unnamed: 0" column in the output looks like an index that
# was written to the CSV; consider index_col=0 if nothing downstream uses it.
df = pd.read_csv(r"samentic_chunk.csv")

In [3]:
# Show the loaded frame as the cell output to sanity-check its columns and size.
df

Unnamed: 0,title,chunk_id,chunk_text,token_count
0,A DESCENT INTO THE MAELSTRÖM,0,"The ways of God in Nature, as in Providence, a...",185
1,A DESCENT INTO THE MAELSTRÖM,1,You suppose me a very old man—but I am not. It...,176
2,A DESCENT INTO THE MAELSTRÖM,2,Nothing would have tempted me to within half a...,130
3,A DESCENT INTO THE MAELSTRÖM,3,"“You must get over these fancies,” said the gu...",200
4,A DESCENT INTO THE MAELSTRÖM,4,A panorama more deplorably desolate no human i...,148
...,...,...,...,...
2847,X-ING A PARAGRAPH,15,"The true reason, perhaps, is that x is rather ...",145
2848,X-ING A PARAGRAPH,16,Next morning the population of Nopolis were ta...,276
2849,X-ING A PARAGRAPH,17,The first definite idea entertained by the pop...,122
2850,X-ING A PARAGRAPH,18,One gentleman thought the whole an X-ellent jo...,133


In [4]:
import spacy
import pandas as pd
import re
from keybert import KeyBERT

# spaCy pipeline used for named-entity recognition. Loaded once at module level
# so every call to extract_person_entities reuses the same model object.
nlp = spacy.load("en_core_web_lg")


# KeyBERT keyword extractor with a sentence-transformers backend, configured
# with the "all-MiniLM-L6-v2" model name. Shared by extract_keyphrases.
kw_model = KeyBERT(model="all-MiniLM-L6-v2")


In [5]:
def extract_person_entities(text: str):
    """Return the distinct PERSON entity strings that spaCy finds in ``text``.

    Args:
        text: Raw chunk text. Anything that is not a non-blank string
            (``None``, a float ``NaN`` from a missing cell, ``""``) is treated
            as containing no entities instead of being handed to the model.

    Returns:
        A list of entity surface strings in order of first appearance, with
        exact duplicates removed. Empty when nothing is found.
    """
    # pandas represents missing text cells as float NaN, which the spaCy
    # pipeline cannot process; blank strings cannot contain entities either.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)

    # dict keys keep insertion order, so fromkeys() de-duplicates while
    # preserving the order in which each name first appears.
    return list(
        dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "PERSON")
    )


In [6]:
def remove_entities_from_text(text: str, entities: list[str]) -> str:
    """Strip every occurrence of the given entity strings from ``text``.

    Matching is case-insensitive and limited to whole tokens: an entity is
    removed only when it is not directly adjacent to another word character,
    so removing "John" leaves "Johnny" untouched.

    Args:
        text: Text to clean. Non-string values (``None``, ``NaN``) yield ``""``.
        entities: Entity strings to remove. Blank strings are ignored.

    Returns:
        The text with the entities removed, every run of whitespace collapsed
        to a single space, and leading/trailing whitespace trimmed.
    """
    if not isinstance(text, str):
        return ""

    cleaned_text = text

    # Longest first, so a full name is removed as a unit before any shorter
    # entity contained in it gets a chance to split it.
    for ent in sorted(entities, key=len, reverse=True):
        if not ent.strip():
            continue
        # Lookarounds instead of \b: \b requires a word character on one side,
        # so it can never match right after an entity that ends in punctuation
        # (e.g. "Edgar A.") and such entities were silently left in the text.
        pattern = r'(?<!\w)' + re.escape(ent) + r'(?!\w)'
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Removing names leaves doubled spaces behind; normalise all whitespace.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text


In [7]:
def extract_keyphrases(text: str, top_n: int = 10, min_words: int = 20):
    """Extract up to ``top_n`` keyphrases from ``text`` with KeyBERT.

    Args:
        text: Text to analyse. Non-string values (``None``, ``NaN``) are
            treated as having no keyphrases.
        top_n: Maximum number of keyphrases requested from KeyBERT.
        min_words: Texts with fewer whitespace-separated words than this are
            skipped and produce an empty list. Defaults to 20.

    Returns:
        A list of keyphrase strings in the order KeyBERT returns them; the
        scores are discarded. Empty for missing or too-short input.
    """
    if not isinstance(text, str) or len(text.split()) < min_words:
        return []

    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),  # candidate phrases of one to three words
        stop_words="english",
        use_mmr=True,                  # MMR re-ranking for more varied phrases
        diversity=0.5,
        top_n=top_n,
    )

    # KeyBERT yields (phrase, score) pairs; callers only need the phrase.
    return [phrase for phrase, _score in keywords]


In [8]:
def process_chunk(row):
    """Run the three-stage extraction on one DataFrame row.

    Reads ``row["chunk_text"]``, finds the person names in it, strips those
    names from the text, and extracts keyphrases from what remains.

    Returns:
        A ``pd.Series`` with the entries ``person_entities``, ``cleaned_text``
        and ``keyphrases``.
    """
    raw = row["chunk_text"]

    # Names are detected on the untouched text and then removed from it, so
    # the keyphrase step only ever sees the name-free version.
    names = extract_person_entities(raw)
    without_names = remove_entities_from_text(raw, names)

    features = {
        "person_entities": names,
        "cleaned_text": without_names,
        "keyphrases": extract_keyphrases(without_names),
    }
    return pd.Series(features)


In [9]:
# Row-wise pass over the frame: process_chunk returns a three-entry Series per
# row, which apply() expands into the three columns attached to df here.
# NOTE(review): process_chunk calls the spaCy and KeyBERT models once per row,
# so this cell can take a long time on the full frame.
df[["person_entities", "cleaned_text", "keyphrases"]] = df.apply(
    process_chunk, axis=1
)


KeyboardInterrupt: 