In [6]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# --------------------------------------------------
# Load your data
# --------------------------------------------------
# Chunk-level table (processing input only), ordered by story title and then
# by chunk position, with a fresh 0..n-1 index.
chunks_df = (
    pd.read_csv("samentic_chunk.csv")
    .sort_values(["title", "chunk_id"])
    .reset_index(drop=True)
)

# Story-level table: the topic results are stored on this frame.
stories_df = pd.read_csv(r"C:\Users\diwas\Downloads\preprocessed_data.csv~1\preprocessed_data.csv")

# Create the result columns on the stories frame only, all empty to start with.
for result_column in ("topic_id", "topic_keywords", "num_chunks", "topic_confidence"):
    stories_df[result_column] = None

# Single shared sentence-embedding model instance.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# --------------------------------------------------
# Configure BERTopic components
# --------------------------------------------------
# Each component is built from an explicit settings dict so the tunable values
# can be read (and changed) in one place.

# UMAP settings; the fixed random_state seeds the projection.
umap_settings = {
    "n_neighbors": 5,
    "n_components": 5,
    "min_dist": 0.0,
    "metric": 'cosine',
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

# HDBSCAN settings; prediction_data=True is requested at construction time.
hdbscan_settings = {
    "min_cluster_size": 2,
    "min_samples": 1,
    "metric": 'euclidean',
    "cluster_selection_method": 'eom',
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Bag-of-words settings: English stop words removed, unigrams and bigrams kept.
vectorizer_settings = {
    "stop_words": "english",
    "min_df": 1,
    "ngram_range": (1, 2),
}
vectorizer_model = CountVectorizer(**vectorizer_settings)

# --------------------------------------------------
# Process ONE story at a time
# --------------------------------------------------
# Stories with fewer chunks than this are marked "insufficient_data" and are
# not passed to the topic model.
MIN_CHUNKS_PER_STORY = 3


def _record_story_result(title, **fields):
    """Store per-story result fields on the first ``stories_df`` row titled ``title``.

    Args:
        title: Story title to look up in ``stories_df["title"]``.
        **fields: Mapping of column name -> value to write on that row.

    Returns:
        True if a row was updated; False if no story has this title, in which
        case a warning is printed and nothing is written (instead of raising
        IndexError and aborting the whole run).
    """
    matching_rows = stories_df.index[stories_df["title"] == title]
    if len(matching_rows) == 0:
        print(f"  ⚠️ No row titled {title!r} in stories_df; result not stored")
        return False

    story_idx = matching_rows[0]
    for column, value in fields.items():
        stories_df.loc[story_idx, column] = value
    return True


# groupby(sort=False) yields each title in order of first appearance together
# with its rows in one pass (no per-title re-filtering of the whole frame) and
# leaves out rows whose title is missing.
for title, story_chunks in chunks_df.groupby("title", sort=False):
    # Missing chunk texts cannot be embedded, so they are left out.
    chunks = story_chunks["chunk_text"].dropna().tolist()

    print(f"Processing: {title} ({len(chunks)} chunks)")

    # Skip if too few chunks
    if len(chunks) < MIN_CHUNKS_PER_STORY:
        _record_story_result(
            title,
            topic_id=-1,
            topic_keywords="insufficient_data",
            num_chunks=len(chunks),
        )
        print(f"  ⚠️ Skipped (too few chunks)")
        continue

    # Embedding and topic modelling share one try block so that a failure in
    # either step is recorded against this story and the loop carries on.
    try:
        # Embed chunks
        embeddings = embedding_model.encode(chunks, show_progress_bar=False)

        # Topic modeling
        topic_model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            language="english",
            calculate_probabilities=False,
            verbose=False,
            nr_topics="auto"
        )

        chunk_topics, _ = topic_model.fit_transform(chunks, embeddings)

        # Find dominant topic; -1 is the outlier/noise label and is ignored.
        topic_counts = Counter(t for t in chunk_topics if t != -1)

        if topic_counts:
            dominant_topic, topic_count = topic_counts.most_common(1)[0]
            # Share of this story's chunks assigned to the dominant topic.
            confidence = topic_count / len(chunks)

            # Get keywords for dominant topic (top five terms)
            top_words = topic_model.get_topic(dominant_topic)
            if top_words:
                keywords = ", ".join(word for word, _ in top_words[:5])
            else:
                keywords = "no_keywords"

            print(f"  ✅ Topic {dominant_topic}: {keywords} ({topic_count}/{len(chunks)} chunks, {confidence:.1%})")
        else:
            dominant_topic = -1
            keywords = "no_topic_found"
            confidence = 0.0
            print(f"  ⚠️ No topics found (all noise)")

    except Exception as e:
        # Deliberate best-effort: one failing story must not stop the others.
        print(f"  ❌ Error: {str(e)}")
        _record_story_result(
            title,
            topic_id=-1,
            topic_keywords="error",
            num_chunks=len(chunks),
        )
        continue

    # Store in STORIES dataframe (not chunks!)
    _record_story_result(
        title,
        topic_id=dominant_topic,
        topic_keywords=keywords,
        num_chunks=len(chunks),
        topic_confidence=confidence,
    )

# --------------------------------------------------
# Save ONLY the stories with topics
# --------------------------------------------------
# NOTE(review): this path is the same file the stories were loaded from, so the
# original input is overwritten with the topic columns added — confirm that is
# intended, or write to a separate output file.
stories_df.to_csv(r"C:\Users\diwas\Downloads\preprocessed_data.csv~1\preprocessed_data.csv", index=False)

# A story has a topic only when topic_id was actually assigned AND is not the
# -1 sentinel. Rows that were never processed still hold None, and None != -1
# evaluates True, so the missing-value check is required to avoid counting
# them as stories with topics.
has_topic = stories_df['topic_id'].notna() & (stories_df['topic_id'] != -1)

print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Total stories: {len(stories_df)}")
print(f"Stories with topics: {has_topic.sum()}")
print(f"\nTopic distribution:")
# value_counts() leaves out missing topic_id values by default.
print(stories_df['topic_id'].value_counts().sort_index())

print("\n" + "="*60)
print("SAMPLE RESULTS")
print("="*60)
print(stories_df[['title', 'topic_id', 'topic_keywords', 'num_chunks', 'topic_confidence']].head(10))

Processing: * PHILOSOPHY OF FURNITURE (21 chunks)
  ✅ Topic 0: aristocracy, taste, display, sole, readily (5/21 chunks, 23.8%)
Processing: A DESCENT INTO THE MAELSTRÖM (61 chunks)
  ✅ Topic 0: ström, slack, moskoe ström, moskoe, whirl (9/61 chunks, 14.8%)
Processing: A PREDICAMENT (33 chunks)
  ✅ Topic 0: pompey, diana, said, hole, city (26/33 chunks, 78.8%)
Processing: A TALE OF JERUSALEM (10 chunks)
  ✅ Topic 0: let, wall, day, city, phittim (3/10 chunks, 30.0%)
Processing: A TALE OF THE RAGGED MOUNTAINS (36 chunks)
  ✅ Topic 0: came, felt, tree, astonishment, soul (9/36 chunks, 25.0%)
Processing: BERENICE (27 chunks)
  ✅ Topic 0: est, disorder, nature, horizon rainbow, trance (5/27 chunks, 18.5%)
Processing: DIDDLING (37 chunks)
  ✅ Topic 0: diddler, diddle, brandy water, brandy, water (8/37 chunks, 21.6%)
Processing: ELEONORA (19 chunks)
  ✅ Topic 0: valley, love, loveliness, maiden, eleonora (4/19 chunks, 21.1%)
Processing: FOUR BEASTS IN ONE—THE HOMO-CAMELEOPARD (25 chunks)
  ✅ T

In [5]:
pd.

Unnamed: 0,title,chunk_id,chunk_text,token_count,topic_model,chunk_topic
0,* PHILOSOPHY OF FURNITURE,0,"In the internal decoration, if not in the exte...",128,0,0
1,* PHILOSOPHY OF FURNITURE,1,In Spain they are all curtains—a nation of han...,148,0,0
2,* PHILOSOPHY OF FURNITURE,2,"To speak less abstractly. In England, for exam...",128,0,0
3,* PHILOSOPHY OF FURNITURE,3,"The people will imitate the nobles, and the re...",156,0,0
4,* PHILOSOPHY OF FURNITURE,4,There could be nothing more directly offensive...,184,0,2
...,...,...,...,...,...,...
2847,X-ING A PARAGRAPH,15,"The true reason, perhaps, is that x is rather ...",145,0,3
2848,X-ING A PARAGRAPH,16,Next morning the population of Nopolis were ta...,276,0,0
2849,X-ING A PARAGRAPH,17,The first definite idea entertained by the pop...,122,0,2
2850,X-ING A PARAGRAPH,18,One gentleman thought the whole an X-ellent jo...,133,0,4


In [8]:
# Write the stories frame, including its topic columns, to a CSV in the
# current working directory; the row index is not written.
topics_csv_path = "preprocessed_data_with_topics.csv"
stories_df.to_csv(topics_csv_path, index=False)