In [2]:
from sentence_transformers import SentenceTransformer
import json
from tqdm.notebook import tqdm
import nltk
# Make sure the Punkt sentence-tokenizer data is installed before
# nltk.sent_tokenize is used later in the notebook.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError, so this is
    # the exception that must trigger the download.
    # NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
    # 'punkt' -- confirm against the installed NLTK version.
    print("Downloading NLTK's 'punkt' model for sentence tokenization...")
    nltk.download('punkt')
import numpy as np
import pandas as pd

In [3]:
# setup
# Read the corpus first: it is cheap, so a wrong path fails before the large
# embedding model is downloaded and loaded.
# Forward slashes keep this relative path valid on Windows and on POSIX systems.
with open('00_prep/cleaned_texts/all_processed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", device='cuda')

# Corpus layout as used throughout this notebook: `data` is a list of books,
# book['content'] is a list of chapters, and each chapter is a list of
# paragraph strings.
num_books = len(data)
num_chapters = sum(len(book['content']) for book in data)
num_paragraphs = sum(len(chapter) for book in data for chapter in book['content'])
print(f"Loaded {num_books} books, {num_chapters} chapters, and {num_paragraphs} paragraphs.")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 4 files:  25%|██▌       | 1/4 [18:14<54:44, 1094.82s/it]


OSError: Qwen/Qwen3-Embedding-8B does not appear to have a file named model-00002-of-00004.safetensors. Checkout 'https://huggingface.co/Qwen/Qwen3-Embedding-8B/tree/main'for available files.

In [None]:

# Requires the 'model' object created in the setup cell.

# The first module of the SentenceTransformer pipeline (index 0) exposes the
# underlying Hugging Face model and its tokenizer.
underlying_transformer = model[0].auto_model
tokenizer = model[0].tokenizer

# Maximum sequence length as reported by the tokenizer.
# NOTE(review): the limit actually applied during model.encode() is presumably
# model.max_seq_length -- confirm whether that is the value wanted here.
max_seq_length = tokenizer.model_max_length

# Read the checkpoint name from the underlying transformer; the
# SentenceTransformer wrapper itself has no `.model` attribute.
print(f"The maximum sequence length for '{underlying_transformer.name_or_path}' is: {max_seq_length} tokens")

In [None]:
# EMBEDDING SENTENCE LEVEL

# Split every paragraph into sentences. Two structures are produced:
#   all_sentences_flat - every sentence of the corpus, in reading order
#   sentence_counts    - sentence_counts[book][chapter][paragraph] = number of
#                        sentences found in that paragraph (0 when none)
all_sentences_flat = []
sentence_counts = []
for book in tqdm(data, desc='Processing books'):
    book_sentence_counts = []
    for chapter in book['content']:
        # Tokenize each paragraph exactly once, then derive both outputs from it.
        tokenized_paragraphs = [nltk.sent_tokenize(paragraph) for paragraph in chapter]
        for sentences in tokenized_paragraphs:
            all_sentences_flat += sentences
        chapter_sentence_counts = [len(sentences) for sentences in tokenized_paragraphs]
        book_sentence_counts.append(chapter_sentence_counts)
    sentence_counts.append(book_sentence_counts)

print(f'total:{len(all_sentences_flat)}')

# Embed all sentences in a single batched call. The aggregation step below
# slices this result by sentence position, so it relies on the rows following
# the order of all_sentences_flat.
sentence_embeddings_flat = model.encode(all_sentences_flat, batch_size=128, show_progress_bar=True)

# Roll the flat sentence embeddings up into paragraph, chapter and book
# embeddings by averaging at each level.
#   paragraph_embeddings_aggregated[book][chapter] -> list of paragraph vectors
#   chapter_embeddings_aggregated[book]            -> list of chapter vectors
#   book_embeddings_aggregated                     -> list of book vectors
# Paragraphs without sentences, chapters without paragraph vectors and books
# without chapter vectors contribute no vector at their level, so these lists
# can be shorter than the corresponding lists in `data`.
sentence_embeddings_hierarchical = []  # NOTE: declared but never filled in this cell
paragraph_embeddings_aggregated = []
chapter_embeddings_aggregated = []
book_embeddings_aggregated = []

# Cursor into sentence_embeddings_flat; advances by each paragraph's sentence count.
current_sentence_idx = 0
for i, book in enumerate(tqdm(data,desc='agg embeddings')):
    book_chapter_embeddings = []
    book_paragraph_embeddings = []

    for j, chapter in enumerate(book['content']):
        chapter_paragraph_embeddings = []

        for k, paragraph in enumerate(chapter):
            num_sentences = sentence_counts[i][j][k]

            if num_sentences > 0:
                paragraph_s_embeddings = sentence_embeddings_flat[current_sentence_idx:current_sentence_idx+num_sentences]

                # paragraph = mean of its sentence vectors
                paragraph_agg_embedding = np.mean(paragraph_s_embeddings, axis= 0)
                chapter_paragraph_embeddings.append(paragraph_agg_embedding)
                current_sentence_idx += num_sentences

        # chapter = mean of its paragraph vectors
        if chapter_paragraph_embeddings:
            chapter_agg_embedding = np.mean(chapter_paragraph_embeddings, axis=0)
            book_chapter_embeddings.append(chapter_agg_embedding)

        book_paragraph_embeddings.append(chapter_paragraph_embeddings)

    # book = mean of its chapter vectors.
    # This section must run once per book, i.e. inside the outer loop;
    # otherwise only the last book would be recorded.
    if book_chapter_embeddings:
        book_agg_embedding = np.mean(book_chapter_embeddings, axis=0)
        book_embeddings_aggregated.append(book_agg_embedding)

    paragraph_embeddings_aggregated.append(book_paragraph_embeddings)
    chapter_embeddings_aggregated.append(book_chapter_embeddings)

# Every embedded sentence must have been consumed exactly once; a mismatch
# means sentence_counts and sentence_embeddings_flat come from different runs.
assert current_sentence_idx == len(sentence_embeddings_flat), (
    f"Consumed {current_sentence_idx} sentence embeddings, "
    f"but {len(sentence_embeddings_flat)} were computed."
)

print("\nAggregation Complete!")
print(f"Shape of one book embedding (aggregated): {book_embeddings_aggregated[0].shape}")
print(f"Shape of one chapter embedding (aggregated): {chapter_embeddings_aggregated[0][0].shape}")
print(f"Shape of one paragraph embedding (aggregated): {paragraph_embeddings_aggregated[0][0][0].shape}")

In [None]:
# --- Paragraph / Chunked Embeddings (Direct) --- NO SENTENCE LEVEL
print("\n--- Method 2: Direct Paragraph (Chunk) Embedding ---")
# Forward slashes keep this relative path valid on Windows and on POSIX systems.
with open('00_prep/cleaned_texts/all_processed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Reuse the model from the setup cell when it is already loaded. Building a
# second instance while the first is still referenced would keep two copies of
# the 8B model alive at once. The fallback keeps this cell runnable on its own.
if 'model' not in globals():
    model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", device='cuda')

num_books = len(data)
num_chapters = sum(len(book['content']) for book in data)
num_paragraphs = sum(len(chapter) for book in data for chapter in book['content'])
print(f"Loaded {num_books} books, {num_chapters} chapters, and {num_paragraphs} paragraphs.")

# Collect every non-blank paragraph of the corpus in reading order.
# Whitespace-only paragraphs are dropped here and must be dropped the same way
# wherever these embeddings are mapped back onto the book/chapter structure.
all_paragraphs_flat = []
for book in data:
    for chapter in book['content']:
        all_paragraphs_flat.extend(paragraph for paragraph in chapter if paragraph.strip())

print(f"Total paragraphs/chunks to embed: {len(all_paragraphs_flat)}")

# Embed whole paragraphs directly (no sentence splitting).
print("Generating direct embeddings for all paragraphs...")
paragraph_embeddings_direct_flat = model.encode(all_paragraphs_flat, batch_size=16, show_progress_bar=True)

# Rebuild the book -> chapter hierarchy on top of the flat paragraph
# embeddings and average upwards (paragraphs -> chapter -> book).
print("\nAggregating direct paragraph embeddings...")
paragraph_embeddings_direct = []      # per book: one slice of paragraph vectors per non-empty chapter
chapter_embeddings_from_direct = []   # per book: one mean vector per non-empty chapter
book_embeddings_from_direct = []      # one mean vector per book that has at least one chapter vector

# Cursor into paragraph_embeddings_direct_flat.
current_paragraph_idx = 0
for book in tqdm(data, desc="Aggregating Direct Paragraphs"):
    book_chapter_embs, book_para_embs = [], []
    for chapter in book['content']:
        # Count paragraphs with the same non-blank filter that was used when
        # the flat list was built, so the slices line up.
        num_paragraphs_in_chapter = sum(1 for p in chapter if p.strip())
        if num_paragraphs_in_chapter == 0:
            continue  # nothing was embedded for this chapter

        chunk_end = current_paragraph_idx + num_paragraphs_in_chapter
        chapter_para_embs = paragraph_embeddings_direct_flat[current_paragraph_idx:chunk_end]

        # --- CHAPTER EMBEDDING (from direct chunks) ---
        chapter_agg_emb = np.mean(chapter_para_embs, axis=0)
        book_chapter_embs.append(chapter_agg_emb)
        book_para_embs.append(chapter_para_embs)
        current_paragraph_idx = chunk_end

    # --- BOOK EMBEDDING (from direct chunks) ---
    if book_chapter_embs:
        book_agg_emb = np.mean(book_chapter_embs, axis=0)
        book_embeddings_from_direct.append(book_agg_emb)

    chapter_embeddings_from_direct.append(book_chapter_embs)
    paragraph_embeddings_direct.append(book_para_embs)

print("\nDirect Embedding and Aggregation Complete!")
print(f"Shape of one book embedding (from direct): {book_embeddings_from_direct[0].shape}")
print(f"Shape of one chapter embedding (from direct): {chapter_embeddings_from_direct[0][0].shape}")
print(f"Shape of one paragraph embedding (direct): {paragraph_embeddings_direct[0][0][0].shape}")

In [None]:
#df - > arrow
# Requires these variables from the embedding cells:
# - data: the original JSON data
# - paragraph_embeddings_direct_flat: one embedding row per non-blank paragraph

print("Step 1: Creating the Master DataFrame...")

# Flatten the book -> chapter -> paragraph structure into one record per paragraph.
records = []
for i, book in enumerate(data):
    book_title = book['meta'].get('Original Title', f'book_{i}')
    book_author = book['meta'].get('Original Writer')  # None when the metadata has no writer
    for j, chapter in enumerate(book['content']):
        for k, paragraph in enumerate(chapter):
            # Skip blank paragraphs with the same filter that was used when the
            # flat embedding list was built, so rows and embeddings stay 1-to-1.
            if paragraph.strip():
                records.append({
                    "book_title": book_title,
                    "book_author": book_author,
                    "book_index": i,
                    "chapter_index": j,
                    "paragraph_index": k,
                    "text": paragraph
                })

# Create the DataFrame
df = pd.DataFrame(records)

# Attach the pre-computed embeddings; the row count must match exactly,
# otherwise texts and vectors would be misaligned.
assert len(df) == len(paragraph_embeddings_direct_flat), (
    f"{len(df)} paragraph records but "
    f"{len(paragraph_embeddings_direct_flat)} embeddings; re-run the embedding cell."
)
df['embedding'] = list(paragraph_embeddings_direct_flat)

print("DataFrame created successfully.")
print(df.head())
print(f"\nShape of the DataFrame: {df.shape}")

In [None]:
# FINE GRAINED topic modeling of main cluster...
from sklearn.feature_extraction.text import CountVectorizer

# Placeholders: replace with the documents of the cluster under inspection and
# their matching embeddings before running this cell.
docs_subset = [...]
embeddings_subset = [...]
logger.info("Starting fine-grained sub-clustering with sensitive parameters...")

# --- "Microscope" BERTopic model: small neighbourhoods, small clusters ---

# 1. Dimensionality reduction tuned for local structure
sensitive_umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# 1b. Clustering; 'leaf' selection is used here to obtain fine-grained topics
sensitive_hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='cosine',
    cluster_selection_method='leaf',
    prediction_data=True,
)

# 2. Assemble the topic model from the two components above
sub_topic_model = BERTopic(
    umap_model=sensitive_umap_model,
    hdbscan_model=sensitive_hdbscan_model,
    language="english",
    verbose=True,
)

# 3. Fit on the subset, reusing the pre-computed embeddings
sub_topics, _ = sub_topic_model.fit_transform(docs_subset, embeddings=embeddings_subset)

# 4. Inspect the resulting topics
print(sub_topic_model.get_topic_info())

# --- Iterate and refine ---
# Too many tiny topics: raise min_cluster_size a little (e.g. 15 or 20).
# Topics still too broad: lower n_neighbors a little (e.g. 5).
# Tuning interactively like this is much cheaper than re-running the whole Optuna search.






### Variant: the same sub-clustering setup, with a custom vectorizer
# TODO: open question from the original notebook -- decide whether this variant
# replaces the model fitted above.
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer without a stop-word list; frequency thresholds do the filtering.
# NOTE(review): BERTopic presumably fits the vectorizer on per-topic
# concatenated text, so min_df / max_df would count topics rather than
# paragraphs -- confirm against the BERTopic vectorizer documentation before
# relying on these thresholds.
custom_vectorizer = CountVectorizer(
    stop_words=None, # No stop-word list
    min_df=5,        # Term must occur in at least 5 documents seen by the vectorizer
    max_df=0.8       # Drop terms occurring in more than 80% of those documents
)

# Rebuild the model with the same sensitive UMAP/HDBSCAN components plus the
# custom vectorizer, so the fine-grained clustering settings are kept.
# This replaces the fitted `sub_topic_model` with an unfitted one; call
# fit_transform again to use it.
sub_topic_model = BERTopic(
    umap_model=sensitive_umap_model,
    hdbscan_model=sensitive_hdbscan_model,
    vectorizer_model=custom_vectorizer,
    language="english",
    verbose=True
)