In [2]:
from sentence_transformers import SentenceTransformer
import json
from tqdm.notebook import tqdm
import nltk
# Make sure the NLTK 'punkt' sentence-tokenizer data is installed, fetching
# it on first use.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # nltk.data.find reports a missing resource by raising LookupError, so
    # that is the exception that has to trigger the download.
    print("Downloading NLTK's 'punkt' model for sentence tokenization...")
    nltk.download('punkt')
import numpy as np
import pandas as pd

In [2]:
# Load the pre-processed corpus and flatten it into a single list of
# paragraphs. The JSON is iterated as: books -> book['content'] (chapters)
# -> chapter (paragraphs).
try:
    with open(r'00_prep\cleaned_texts\all_processed.json','r',encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError as err:
    # Calling exit() inside a notebook shuts the kernel down without saying
    # why; re-raise with a message so the missing input is obvious.
    raise FileNotFoundError(
        "Corpus file '00_prep\\cleaned_texts\\all_processed.json' was not found; "
        "check the working directory and that the preprocessing step has been run."
    ) from err

# One entry per paragraph, in book -> chapter -> paragraph order.
end_data = [
    para
    for book in data
    for chapter in book['content']
    for para in chapter
]


In [4]:
# Sanity check: print a marker for every flattened entry that is not a str.
for item in end_data:
    if isinstance(item, str):
        continue
    print('uhoh')

In [None]:
from sentence_transformers import SentenceTransformer
from pprint import pprint
# Instantiate the embedding model; its tokenizer is reused for the length
# statistics below.
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)
tokenizer = model.tokenizer

# Token count of every paragraph, in ascending order.
lens_tokens = sorted(len(tokenizer.encode(text)) for text in end_data)

# Number of paragraphs whose token count is 384 or more.
counter = sum(1 for n_tokens in lens_tokens if n_tokens >= 384)
counter

Token indices sequence length is longer than the specified maximum sequence length for this model (389 > 384). Running this sequence through the model will result in indexing errors


3585

Loaded 43 books, 1655 chapters, and 20016 paragraphs.


In [None]:
# setup
# Load the embedding model on the GPU (device is hard-coded to 'cuda', so
# this cell needs a CUDA-capable machine) and re-read the corpus.
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name, device = 'cuda')

with open(r'00_prep\cleaned_texts\all_processed.json', 'r',encoding='utf-8') as f:
    data = json.load(f)


# Corpus size summary: books -> chapters (book['content']) -> paragraphs.
num_books = len(data)
num_chapters = sum(len(book['content']) for book in data)
num_paragraphs = sum(len(chapter) for book in data for chapter in book['content']) 
print(f"Loaded {num_books} books, {num_chapters} chapters, and {num_paragraphs} paragraphs.")

# First module of the SentenceTransformer pipeline: the wrapped transformer
# and the tokenizer that feeds it.
underlying_transformer = model[0].auto_model
tokenizer = model[0].tokenizer

# Get the maximum sequence length from the tokenizer's configuration.
# NOTE(review): this reads the tokenizer's limit; confirm it matches
# model.max_seq_length if the truncation length actually used matters.
max_seq_length = tokenizer.model_max_length
# The message previously had an empty slot where the model name belongs.
print(f"The maximum sequence length for {model_name} is: {max_seq_length} tokens")

The maximum sequence length for  is: 384 tokens


In [None]:
# Sentence partitioning.
# all_sentences_flat: every sentence of the corpus in reading order.
# sentence_counts[book][chapter][paragraph]: number of sentences that the
# paragraph contributed (0 when the tokenizer returned nothing).
all_sentences_flat = []
sentence_counts = []
for book in tqdm(data, desc='Processing books'):
    counts_for_book = []
    for chapter in book['content']:
        split_paragraphs = [nltk.sent_tokenize(paragraph) for paragraph in chapter]
        for sents in split_paragraphs:
            # extending with an empty list is a no-op, so no special case needed
            all_sentences_flat.extend(sents)
        counts_for_book.append([len(sents) for sents in split_paragraphs])
    sentence_counts.append(counts_for_book)

print(f'total:{len(all_sentences_flat)}')

# Count the sentences whose token sequence exceeds the model limit.
counter = sum(
    1
    for sentence in all_sentences_flat
    if len(tokenizer.encode(sentence)) > max_seq_length
)
print(f'sents longer than {max_seq_length}: {counter}')


Processing books:   0%|          | 0/43 [00:00<?, ?it/s]

total:130122


Token indices sequence length is longer than the specified maximum sequence length for this model (399 > 384). Running this sequence through the model will result in indexing errors


sents longer then 384: 5


---

Sentence-level embedding and saving

In [None]:
# EMBEDDING SENTENCE LEVEL
# Every sentence is embedded once; higher levels are mean-pooled upwards:
# sentences -> paragraph, paragraphs -> chapter, chapters -> book.
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

with open(r'00_prep\cleaned_texts\all_processed.json', 'r',encoding='utf-8') as f:
    data = json.load(f)

# partitioning
# sentence_counts[book][chapter][paragraph] records how many sentences each
# paragraph produced, so the flat embedding array can be sliced back into
# paragraphs after encoding.
all_sentences_flat = []
sentence_counts = []
for book in tqdm(data, desc='Processing books'):
    book_sentence_counts = []
    for chapter in book['content']:
        chapter_sentence_counts = []
        for paragraph in chapter:
            sentences = nltk.sent_tokenize(paragraph)
            if sentences:
                all_sentences_flat.extend(sentences)
                chapter_sentence_counts.append(len(sentences))
            else:
                chapter_sentence_counts.append(0)
        book_sentence_counts.append(chapter_sentence_counts)
    sentence_counts.append(book_sentence_counts)

print(f'total:{len(all_sentences_flat)}')

# embed all sentences in one batched pass (row i belongs to all_sentences_flat[i])
sentence_embeddings_flat = model.encode(
    all_sentences_flat,
    show_progress_bar=True,
    batch_size=128,
)

sentence_embeddings_hierarchical = []
paragraph_embeddings_aggregated = []
chapter_embeddings_aggregated = []
book_embeddings_aggregated = []

# Cursor into sentence_embeddings_flat; advances by each paragraph's sentence count.
current_sentence_idx = 0
for i, book in enumerate(tqdm(data,desc='agg embeddings')):
    book_chapter_embeddings = []
    book_paragraph_embeddings = []

    for j, chapter in enumerate(book['content']):
        chapter_paragraph_embeddings = []

        for k, paragraph in enumerate(chapter):
            num_sentences = sentence_counts[i][j][k]

            # Paragraphs without sentences are skipped, so positions inside
            # chapter_paragraph_embeddings need not equal the paragraph index k.
            if num_sentences > 0:
                paragraph_s_embeddings = sentence_embeddings_flat[current_sentence_idx:current_sentence_idx+num_sentences]

                # paragraph
                paragraph_agg_embedding = np.mean(paragraph_s_embeddings, axis= 0)
                chapter_paragraph_embeddings.append(paragraph_agg_embedding)
                current_sentence_idx += num_sentences

        # chapter (only chapters with at least one embedded paragraph get a vector)
        if chapter_paragraph_embeddings:
            chapter_agg_embedding = np.mean(chapter_paragraph_embeddings, axis=0)
            book_chapter_embeddings.append(chapter_agg_embedding)

        book_paragraph_embeddings.append(chapter_paragraph_embeddings)

    # book
    # This section must run once per book, i.e. inside the outer loop;
    # previously it was dedented and only the last book was recorded.
    if book_chapter_embeddings:
        book_agg_embedding = np.mean(book_chapter_embeddings, axis=0)
        book_embeddings_aggregated.append(book_agg_embedding)

    paragraph_embeddings_aggregated.append(book_paragraph_embeddings)
    chapter_embeddings_aggregated.append(book_chapter_embeddings)

# Every sentence embedding must have been consumed exactly once.
assert current_sentence_idx == len(sentence_embeddings_flat), \
    "Sentence cursor does not match the number of embeddings"

print("\nAggregation Complete!")
print(f"Shape of one book embedding (aggregated): {book_embeddings_aggregated[0].shape}")
print(f"Shape of one chapter embedding (aggregated): {chapter_embeddings_aggregated[0][0].shape}")
print(f"Shape of one paragraph embedding (aggregated): {paragraph_embeddings_aggregated[0][0][0].shape}")

Processing books:   0%|          | 0/43 [00:00<?, ?it/s]

total:130122


Batches:   0%|          | 0/1017 [00:00<?, ?it/s]

agg embeddings:   0%|          | 0/43 [00:00<?, ?it/s]


Aggregation Complete!
Shape of one book embedding (aggregated): (768,)
Shape of one chapter embedding (aggregated): (768,)
Shape of one paragraph embedding (aggregated): (768,)


In [12]:
import pandas as pd
import numpy as np
import nltk
import json
from tqdm import tqdm

# --- PREREQUISITES ---
# This code assumes you have ALREADY run your embedding generation and have these variables:
# - data: The original JSON data loaded from the file.
# - sentence_embeddings_flat: The numpy array of embeddings.
# - all_sentences_flat: The list of actual sentence strings that were encoded. (Optional but good for validation)

# --- 1. Create the Master DataFrame ---
print("Step 1: Creating the Master DataFrame for sentences...")

# One record per sentence. The walk over `data` and the sentence splitter
# must be the same ones used to build `all_sentences_flat`; otherwise the
# rows will not line up with `sentence_embeddings_flat`. If a different
# splitter was used when encoding, use that splitter here as well.
records = []
for book_idx, book in enumerate(tqdm(data, desc="Building DataFrame")):
    meta = book['meta']
    # Metadata shared by every sentence of this book.
    book_fields = {
        "book_title": meta.get('Original Title', f'book_{book_idx}'),
        "author": meta.get('Original Writer'),
        "year": meta.get('Publication Year (Original)'),
        "book_index": book_idx,
    }
    for chapter_idx, chapter in enumerate(book['content']):
        for paragraph_idx, paragraph in enumerate(chapter):
            records.extend(
                {
                    **book_fields,
                    "chapter_index": chapter_idx,
                    "paragraph_index": paragraph_idx,
                    "sentence_index_in_para": sentence_idx,  # position inside the paragraph
                    "text": sentence,
                }
                for sentence_idx, sentence in enumerate(nltk.sent_tokenize(paragraph))
            )

df_sentences = pd.DataFrame(records)

# --- 2. Add Embeddings and Validate ---
print("\nStep 2: Adding embeddings and validating...")

# Rows and embeddings are paired purely by position, so the counts have to
# be identical before the column is attached.
n_rows = len(df_sentences)
n_embeddings = len(sentence_embeddings_flat)
print(f"Number of rows in DataFrame: {n_rows}")
print(f"Number of embeddings:        {n_embeddings}")

assert n_rows == n_embeddings, "Mismatch between DataFrame rows and number of embeddings!"

# list(...) turns the 2-D array into one vector per row for the new column.
df_sentences['embedding'] = list(sentence_embeddings_flat)

print("\nDataFrame with sentence embeddings created successfully.")
print(df_sentences.head())
print(f"\nShape of the DataFrame: {df_sentences.shape}")
first_embedding = df_sentences['embedding'].iloc[0]
print(f"Data type of the 'embedding' column: {type(first_embedding)}")


print("\nStep 3: Saving the DataFrame to Parquet format...")

# Destination file, relative to the current working directory.
output_path = 'sentence_embeddings_with_metadata.parquet'
df_sentences.to_parquet(output_path, index=False)

print(f"DataFrame successfully saved to {output_path}")

# --- How to load it back later ---
# df_loaded = pd.read_parquet(output_path)
# print("\nSuccessfully loaded the DataFrame from Parquet:")
# print(df_loaded.head())
# print(f"Embedding type after loading: {type(df_loaded['embedding'].iloc[0])}")

Step 1: Creating the Master DataFrame for sentences...


Building DataFrame: 100%|██████████| 43/43 [00:03<00:00, 12.13it/s]



Step 2: Adding embeddings and validating...
Number of rows in DataFrame: 130122
Number of embeddings:        130122

DataFrame with sentence embeddings created successfully.
                                       book_title            author  year  \
0  A System of Logic, Ratiocinative and Inductive  John Stuart Mill  1843   
1  A System of Logic, Ratiocinative and Inductive  John Stuart Mill  1843   
2  A System of Logic, Ratiocinative and Inductive  John Stuart Mill  1843   
3  A System of Logic, Ratiocinative and Inductive  John Stuart Mill  1843   
4  A System of Logic, Ratiocinative and Inductive  John Stuart Mill  1843   

   book_index  chapter_index  paragraph_index  sentence_index_in_para  \
0           0              0                0                       0   
1           0              0                0                       1   
2           0              0                1                       0   
3           0              0                1                       1 

------

In [None]:
# Build a paragraph-level DataFrame: metadata + text + embedding.
# Requires these variables to exist already:
# - data: The original JSON data
# - paragraph_embeddings_direct_flat: The numpy array of paragraph embeddings
# NOTE: paragraph_embeddings_direct_flat is produced by the "Direct Paragraph
# (Chunk) Embedding" cell, which appears further down in this notebook; that
# cell has to be run before this one.

print("Step 1: Creating the Master DataFrame...")

# Flatten the data structure into a list of dictionaries
records = []
for i, book in enumerate(data):
    book_title = book['meta'].get('Original Title', f'book_{i}')
    book_author = book['meta'].get('Original Writer')
    for j, chapter in enumerate(book['content']):
        for k, paragraph in enumerate(chapter):
            # Only non-blank paragraphs get a record; the embedding cell
            # filters with the same paragraph.strip() test, which keeps the
            # records 1-to-1 with the flat embeddings.
            if paragraph.strip():
                records.append({
                    "book_title": book_title,
                    "book_index": i,
                    "author": book_author,  # the comma here was missing (SyntaxError)
                    "chapter_index": j,
                    "paragraph_index": k,
                    "text": paragraph
                })

# Create the DataFrame
df = pd.DataFrame(records)

# Add the pre-computed embeddings as a new column.
# Rows and embeddings are paired by position, so the lengths must match.
assert len(df) == len(paragraph_embeddings_direct_flat), \
    "Mismatch between DataFrame rows and number of paragraph embeddings!"
df['embedding'] = list(paragraph_embeddings_direct_flat)

print("DataFrame created successfully.")
print(df.head())
print(f"\nShape of the DataFrame: {df.shape}")


In [None]:
# --- Paragraph / Chunked Embeddings (Direct) --- NO SENTENCE LEVEL
# Each non-blank paragraph is embedded as a whole; chapter and book vectors
# are then mean-pooled from those paragraph vectors.
print("\n--- Method 2: Direct Paragraph (Chunk) Embedding ---")
with open(r'00_prep\cleaned_texts\all_processed.json', 'r',encoding='utf-8') as f:
    data = json.load(f)

model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", device = 'cuda')

# Corpus size summary (counts include blank paragraphs).
num_books = len(data)
num_chapters = 0
num_paragraphs = 0
for book in data:
    num_chapters += len(book['content'])
    for chapter in book['content']:
        num_paragraphs += len(chapter)
print(f"Loaded {num_books} books, {num_chapters} chapters, and {num_paragraphs} paragraphs.")

# Flat list of the paragraphs that will be embedded: blank ones are dropped.
all_paragraphs_flat = []
for book in data:
    for chapter in book['content']:
        for paragraph in chapter:
            if paragraph.strip():
                all_paragraphs_flat.append(paragraph)

print(f"Total paragraphs/chunks to embed: {len(all_paragraphs_flat)}")

print("Generating direct embeddings for all paragraphs...")
paragraph_embeddings_direct_flat = model.encode(
    all_paragraphs_flat,
    batch_size=16,
    show_progress_bar=True,
)

# Rebuild the book -> chapter -> paragraph hierarchy from the flat array and
# mean-pool upwards.
print("\nAggregating direct paragraph embeddings...")
paragraph_embeddings_direct = []
chapter_embeddings_from_direct = []
book_embeddings_from_direct = []

# Cursor into paragraph_embeddings_direct_flat.
current_paragraph_idx = 0
for book in tqdm(data, desc="Aggregating Direct Paragraphs"):
    chapter_means = []
    chapter_slices = []
    for chapter in book['content']:
        # Same non-blank filter as above, so the counts match the flat array.
        kept = sum(1 for p in chapter if p.strip())
        if kept <= 0:
            # chapters with nothing embedded contribute no entry at all
            continue

        stop = current_paragraph_idx + kept
        chapter_slice = paragraph_embeddings_direct_flat[current_paragraph_idx:stop]

        # chapter vector = mean of its paragraph vectors
        chapter_means.append(np.mean(chapter_slice, axis=0))
        chapter_slices.append(chapter_slice)
        current_paragraph_idx += kept

    # book vector = mean of its chapter vectors (skipped for empty books)
    if chapter_means:
        book_embeddings_from_direct.append(np.mean(chapter_means, axis=0))

    chapter_embeddings_from_direct.append(chapter_means)
    paragraph_embeddings_direct.append(chapter_slices)

print("\nDirect Embedding and Aggregation Complete!")
print(f"Shape of one book embedding (from direct): {book_embeddings_from_direct[0].shape}")
print(f"Shape of one chapter embedding (from direct): {chapter_embeddings_from_direct[0][0].shape}")
print(f"Shape of one paragraph embedding (direct): {paragraph_embeddings_direct[0][0][0].shape}")

In [None]:
# Build a paragraph-level DataFrame: metadata + text + embedding.
# Requires these variables to exist already:
# - data: The original JSON data
# - paragraph_embeddings_direct_flat: The numpy array of paragraph embeddings

print("Step 1: Creating the Master DataFrame...")

# Flatten the data structure into a list of dictionaries
records = []
for i, book in enumerate(data):
    book_title = book['meta'].get('Original Title', f'book_{i}')
    book_author = book['meta'].get('Original Writer')
    for j, chapter in enumerate(book['content']):
        for k, paragraph in enumerate(chapter):
            # Only non-blank paragraphs get a record, so that the records
            # stay 1-to-1 with the flat embeddings (the embeddings are
            # expected to have been computed for non-blank paragraphs only).
            if paragraph.strip():
                records.append({
                    "book_title": book_title,
                    "book_index": i,
                    "author": book_author,  # the comma here was missing (SyntaxError)
                    "chapter_index": j,
                    "paragraph_index": k,
                    "text": paragraph
                })

# Create the DataFrame
df = pd.DataFrame(records)

# Add the pre-computed embeddings as a new column.
# Rows and embeddings are paired by position, so the lengths must match.
assert len(df) == len(paragraph_embeddings_direct_flat), \
    "Mismatch between DataFrame rows and number of paragraph embeddings!"
df['embedding'] = list(paragraph_embeddings_direct_flat)

print("DataFrame created successfully.")
print(df.head())
print(f"\nShape of the DataFrame: {df.shape}")

In [None]:
# Fine-grained topic modelling restricted to the main cluster.
# docs_subset / embeddings_subset are placeholders to be filled with the
# documents of that cluster and their embeddings.
docs_subset = [...]
embeddings_subset = [...]
from sklearn.feature_extraction.text import CountVectorizer
logger.info("Starting fine-grained sub-clustering with sensitive parameters...")

# --- "Microscope" BERTopic model: small neighbourhoods, small clusters ---

# Dimensionality reduction with a small neighbourhood; seeded via random_state.
sensitive_umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering; 'leaf' selection is used here for fine-grained topics.
sensitive_hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='cosine',
    cluster_selection_method='leaf',
    prediction_data=True,
)

# Assemble the topic model from the two components above.
sub_topic_model = BERTopic(
    language="english",
    verbose=True,
    umap_model=sensitive_umap_model,
    hdbscan_model=sensitive_hdbscan_model,
)

# Fit on the subset, reusing the pre-computed embeddings; the second return
# value is not used.
sub_topics, _ = sub_topic_model.fit_transform(
    docs_subset,
    embeddings=embeddings_subset,
)

# Overview table of the topics that were found.
print(sub_topic_model.get_topic_info())

# --- Iterate and Refine ---
# If you get too many tiny topics, slightly increase min_cluster_size (e.g., to 15 or 20).
# If the topics are still too broad, slightly decrease n_neighbors (e.g., to 5).
# This interactive process is much more effective than re-running the whole Optuna search.






# Unfinished sketch / open question: topic representation with a custom vectorizer.
# NOTE(review): CountVectorizer is already imported earlier in this cell.
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer without a fixed stop-word list; very rare and very common terms
# are pruned by document frequency instead.
# "Documents" are whatever is passed to BERTopic (presumably paragraphs here — confirm).
custom_vectorizer = CountVectorizer(
    stop_words=None, # no built-in stop-word list is applied
    min_df=5,        # keep a term only if it occurs in at least 5 documents
    max_df=0.8       # drop terms occurring in more than 80% of documents; with stop_words=None this is what removes words like 'the', 'is'
)

# Template for plugging the vectorizer into the topic model.
# NOTE(review): this rebinds sub_topic_model, so the instance fitted above is
# replaced by a new, unfitted one; as written it is also built without the
# custom UMAP/HDBSCAN models, which would have to be passed again here.
sub_topic_model = BERTopic(
    # ... (other arguments elided)
    vectorizer_model=custom_vectorizer,
    # ... (other arguments elided)
)