# Final Project: NeuroDump - thoughts organizer

## I. Environment setup

### 1.1. Environment

In [1]:
#!pip install -r requirements.txt --- UPDATE THIS AT THE END

### 1.2. Tools and Libraries

In [34]:
import warnings
warnings.filterwarnings('ignore')
import os
import re
import nltk
from transformers import AutoTokenizer
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from pathlib import Path
from IPython.display import Markdown, display
import hdbscan
import numpy as np
import json
from collections import defaultdict
import subprocess
import ast

nltk.download('punkt')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


### 1.3. Custom functions

In [36]:
# Config paths
FOLDER_PATH = "./data/mock_notes/"          #files
LOG_PATH = "./data/processed_files.txt"     #list of processed files
PERSIST_DIR = "./data/chroma_db"            #embeddings database (store vectorized chunks)
CLUSTER_DIR = "./data/cluster_dir/"         #clusters database
SUBNOTES = "./data/outputs/SUBNOTES"        #subnotes formatted (chunks list)
LLM_RECLUSTER = "./data/outputs/LLM_recluster"
CLUSTER_NAMES_DICT = "./data/outputs/LLM_recluster/cluster_names_dict.json"

# Create directories if they don't exist
os.makedirs(FOLDER_PATH, exist_ok=True)
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)
os.makedirs(CLUSTER_DIR, exist_ok=True)
os.makedirs(SUBNOTES, exist_ok=True)
os.makedirs(LLM_RECLUSTER, exist_ok=True)

In [37]:
# Initialize embedding model
HF_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [38]:
# Generic Save JSON file 
def save_json(data, folder_path, filename):
    """Save data as JSON to specified folder and filename."""
    out_path = os.path.join(folder_path, filename)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\n=== Saved JSON to: {out_path}")
    return out_path

In [40]:
def extract_base_filename(filename):
    """Extract base filename without extension and suffixes."""
    # Remove .txt extension if present
    if filename.endswith('.txt'):
        filename = filename[:-4]
    
    # Remove _clusters suffix if present
    if filename.endswith('_clusters'):
        filename = filename[:-9]
    
    return filename

# II. Craft pipeline

### 1. Input unstructured document
Incremental version. It will process only new files.

In [8]:
def get_processed_files():
    """Load list of already processed files from log."""
    if os.path.exists(LOG_PATH):
        with open(LOG_PATH, "r") as f:
            return set(line.strip() for line in f)
    return set()


def get_new_files():
    """Get list of new .txt files that haven't been processed yet."""
    if not os.path.exists(FOLDER_PATH):
        return []
    
    all_files = [f for f in os.listdir(FOLDER_PATH) if f.endswith(".txt")]
    processed_files = get_processed_files()
    return [f for f in all_files if f not in processed_files]

### 2. Split document into chunks of tokens

In [9]:
def count_tokens(text):
    """Count tokens in text using the embedding model's tokenizer."""
    return len(tokenizer.tokenize(text))

def is_code_like(sentence):
    """Detect if a sentence contains code-like patterns."""
    code_keywords = ["from ", "import ", "def ", "return ", "class ", "for ", "if ", "else", "while ", "try:", "except", "print("]
    code_symbols = ["=", "()", "[]", "{}", "->", "::"]
    sentence_lower = sentence.strip().lower()
    return (
        any(sentence_lower.startswith(k) for k in code_keywords) or
        any(sym in sentence for sym in code_symbols)
    )


# ### CHUNKING LOGIC
def chunk_text(text, source_name, min_tokens=40, max_tokens=100, inspect=False):
    """Split text into semantic chunks with token limits."""
    raw_blocks = re.split(r"\n\s*\n", text.strip())                    # Split by double newlines
    chunks, current_chunk, current_tokens = [], [], 0                  # Initialize chunking variables
    metadata = []                                                      # Store metadata for each chunk

    for block in raw_blocks:                                           # Process each text block
        sentences = nltk.sent_tokenize(block.strip())                  # Split block into sentences

        for sentence in sentences:                                     # Process each sentence
            sentence = sentence.strip()
            tokens = count_tokens(sentence)                            # Count tokens in sentence

            # Handle code-like sentences separately
            if is_code_like(sentence):                                 # Check if sentence contains code
                if current_chunk:                                      # Save current chunk if exists
                    chunks.append(" ".join(current_chunk))
                    metadata.append({"source": source_name})
                    current_chunk, current_tokens = [], 0
                chunks.append(sentence)                                # Add code as separate chunk
                metadata.append({"source": source_name})
                continue

            # Handle conversation transitions
            if any(sentence.lower().startswith(w) for w in ["btw", "wait", "oh", "then", "also", "now", "next"]):
                if current_chunk:                                      # Break chunk at transition words
                    chunks.append(" ".join(current_chunk))
                    metadata.append({"source": source_name})
                    current_chunk, current_tokens = [], 0

            # Check if adding sentence would exceed max tokens
            if current_tokens + tokens > max_tokens and current_tokens >= min_tokens:
                chunks.append(" ".join(current_chunk))                # Save current chunk
                metadata.append({"source": source_name})
                current_chunk, current_tokens = [], 0                 # Reset for new chunk

            current_chunk.append(sentence)                            # Add sentence to current chunk
            current_tokens += tokens                                  # Update token count

        # Save any remaining chunk after block processing
        if current_chunk:
            chunks.append(" ".join(current_chunk))
            metadata.append({"source": source_name})
            current_chunk, current_tokens = [], 0

    # if inspect:                                                       # Optional: print chunks for debugging
    #     for i, chunk in enumerate(chunks):
    #         print(f"\n=== Chunks ===")
    #         print(f"\nChunk {i+1} ({count_tokens(chunk)} tokens):\n{chunk}")

    return chunks, metadata

### 3. Embeddings and save vectors in Chroma DB

In [10]:
# ### VECTOR STORE MANAGEMENT
class VectorStoreManager:
    def __init__(self):
        self.persist_dir = PERSIST_DIR                              # Store database directory path
        self.embeddings = HF_embeddings                          # Store embedding model
        self.vectorstore = self._load_existing()                    # Load existing DB or set to None

    def _load_existing(self):
        """Load existing Chroma database if it exists."""
        try:
            if os.path.exists(self.persist_dir) and os.listdir(self.persist_dir):
                print(f"\n=== Loading existing Chroma DB from {self.persist_dir}")
                return Chroma(
                    persist_directory=self.persist_dir,
                    embedding_function=self.embeddings,
                    collection_name="default"
                )
        except Exception as e:
            print(f"\n=== Error loading existing Chroma DB: {e}")
        
        print("\n=== No existing Chroma DB found. Will create new one.")
        return None

    def add_chunks(self, chunks, metadata):
        """Add text chunks to vector store with embeddings."""
        if not self.vectorstore:                                     # Create new database if none exists
            print(f"\n=== Creating new Chroma DB at {self.persist_dir}")
            self.vectorstore = Chroma.from_texts(
                texts=chunks,
                metadatas=metadata,
                embedding=self.embeddings,
                persist_directory=self.persist_dir,
                collection_name="default"
            )
        else:                                                        # Add to existing database
            self.vectorstore.add_texts(texts=chunks, metadatas=metadata)


    def get_chunks_with_embeddings(self, fname):
        """Retrieve chunks, embeddings, and metadata for a specific file."""
        results = self.vectorstore.get(
            where={"source": fname},
            include=["documents", "embeddings", "metadatas"]
        )
        chunks = results.get('documents', [])               # Extract text chunks
        embeddings_list = results.get('embeddings', [])     # Exttact embeddings
        metadatas = results.get('metadatas', [])            # Extract metadata
    
        return chunks, embeddings_list, metadatas        

### 4. Clusterize embeddings with HDBSCAN

In [11]:
# ### CLUSTERING LOGIC
class ChunkClusterizer:
    def __init__(self, chunks, chunk_embeddings, metadata, fname):
        self.chunks = chunks                                         # Store text chunks
        self.embeddings = np.array(chunk_embeddings)                 # Convert embeddings to numpy array for ML operations
        self.metadata = metadata                                     # Save metadata       
        self.fname = fname
        self.labels = []                                             # Initialize cluster labels
        self.clusters = defaultdict(list)                            # cluster_id -> list of dicts {chunk, metadata}


    def cluster_chunks(self, min_cluster_size=2, min_samples=1):
        """Cluster chunks using HDBSCAN algorithm."""
        
        # Handle edge cases
        if len(self.chunks) < min_cluster_size:                      # Not enough chunks to cluster
            print(f"\n=== Not enough chunks to cluster ({len(self.chunks)} < {min_cluster_size}) ===")
            self.labels = [-1] * len(self.chunks)                   # Assign all to noise
            return self.labels

        # Ensure embeddings are 2D
        if self.embeddings.ndim == 1:                               # Reshape 1D embeddings to 2D
            self.embeddings = self.embeddings.reshape(1, -1)
            
        #print(f"\n===> Clustering {len(self.chunks)} chunks with embeddings shape: {self.embeddings.shape}")
            

        # Perform clustering
        clusterizer = hdbscan.HDBSCAN(                                     # Initialize HDBSCAN clustering algorithm
            min_cluster_size=min_cluster_size,                             # Minimum number of points in a cluster
            min_samples=min_samples,                                       # Minimum samples in neighborhood for core point
            metric='euclidean'                                             # Distance metric for clustering
        )
        self.labels = clusterizer.fit_predict(self.embeddings)             # Fit model and predict cluster labels = generate cluster labels

        # Display results
        num_clusters = len(set(self.labels)) - (1 if -1 in self.labels else 0)  # Count clusters (exclude noise)
        print(f"\n=== Found {num_clusters} clusters")
        print("\n=== Cluster distribution ===")
        unique, counts = np.unique(self.labels, return_counts=True)
        for label, count in zip(unique, counts):
            cname = "Noise" if label == -1 else f"Cluster {label}"
            print(f"{cname}: {count} chunks")    
                    
        # Show cluster contents
        for cluster_id in sorted(set(self.labels)):
            cluster_name = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"
            print(f"\n=== {cluster_name} ===")

            for i, label in enumerate(self.labels):
                if label == cluster_id:
                    # Save the chunk into the cluster dictionary with global ID
                    self.clusters[str(cluster_id)].append({
                        "chunk_id": i,                      # Global ID
                        "chunk": self.chunks[i],
                        "metadata": self.metadata[i]
                    })

                    # Display clearly: Chunk [global id]
                    display(Markdown(f"**Chunk {i}:** {self.chunks[i]}"))
            
        return self.labels                                   # Return cluster labels
        
    def save_clusters(self):
        filename = self.fname.replace(".txt", "_clusters.json")
        return save_json(self.clusters, CLUSTER_DIR, filename)

### 5. Read notes clustered by HDBSCAN

In [12]:
# 5. Read notes clustered by HDBSCAN (Updated to work with any cluster file)
def read_note_clusters(clusters_file_path):
    """Read HDBSCAN clusters file from any path"""
    with open(clusters_file_path, "r", encoding="utf-8") as f:
        note_clusters = f.read()
    return note_clusters

### 6. Define system prompts and initiate LLM

#### 6.1. Prompt: recluster and generate cluster names

In [13]:
SYSTEM_PROMPT_RECLUSTER = """
You are a smart assistant helping organize fragmented personal notes.

The notes were previously split into chunks and clustered semantically. Each original note has its own clusters, saved as JSON files inside: ./data/cluster_dir/. Your task is to refine those clusters and give them a name.

1. REORGANIZE CLUSTERS PER FILE:
- You may SPLIT or MERGE clusters WITHIN EACH FILE to better reflect topic boundaries.
- Keep a balance when creating the new clusters. The topic should NOT be too granular (e.g., splitting KMeans into 3 separate clusters), but also not too broad (e.g., grouping all machine learning together).
- The goal is to create meaningful, reusable sub-notes by topic. Each cluster should contain all content from the original note that belongs to that topic.
- For example, all KMeans content from a note should be together, not scattered across separate clusters.
- Imagine someone searching "KMeans": everything relevant from the note should be in a single rewritten cluster.

2. CLUSTER REORGANIZATION OUTPUT:
2.1. REASONING:
- provide a short reasoning section explaining what you split, merged, or reassigned and why. Format it as normal text.
2.2. 
- Then output A CLEAR JSON OBJECT, with label exactly like this:

NOTE_RECLUSTERED:
{ ... }

2.3.
- Then output a SECOND JSON OBJECT, with label exactly like this:

CLUSTER_NAMES:
{ ... }


- NOTE_RECLUSTERED contains the new clustering structure, following the same format as the input cluster JSON.

- CLUSTER_NAMES is a dictionary mapping new cluster IDs created during reclustering (e.g., "cluster_0") to topic names.

3. FORMATTING RULES (STRICT):
- Keep the reasoning at the top.
- After reasoning, print CLUSTER_NAMES: followed by the RAW JSON object (NO code blocks, NO quotes, NO markdown).
- Then print NOTE_RECLUSTERED: followed by the RAW JSON (same rules).
- DO NOT use triple backticks, ```json, or any Markdown formatting. Just the plain, raw JSON object after the tag.
- Make sure CLUSTER_NAMES and NOTE_RECLUSTERED appear exactly like shown. They are anchors for parsing.
- CLUSTER_NAMES topic names must be FOLDER-SAFE (use underscores, no spaces or special chars).
DO NOT use placeholder text like '...' inside JSON.
DO NOT include comments like `// ...` inside JSON.
Make sure all JSON is valid and can be parsed with json.loads().
- Example:

CLUSTER_NAMES:
{ "cluster_0": "unsupervised_learning", ... }

NOTE_RECLUSTERED:
{ "cluster_0": [ ... ], ... }

You will receive a single note cluster JSON. Analyze and return the result using the format above.
"""

#### 6.2. Initiate LLM

In [19]:
# Initiate Ollama
def query_ollama(model: str, prompt: str) -> str:
    """Initiate Ollama"""
    process = subprocess.Popen(
        ["ollama", "run", model],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    stdout, stderr = process.communicate(input=prompt)
    if stderr:
        print("OLLAMA STDERR:", stderr)
    return stdout

### 7. Extract response and save CLUSTER_NAMES and NOTE_x_RECLUSTERED

# FUNCTION WHEN I TRY TO SAVE WITH A SPECIFIC NAME + UPDATE THE EXISTIING CLUSTER_NAMES_DICT 

In [60]:
def parse_response_and_save(response_text, fname):
    """Parse LLM response and save RECLUSTERED output + update CLUSTER_NAMES_DICT.json"""
    
    base_fname = extract_base_filename(os.path.basename(fname))

    # ==== Extract CLUSTER_NAMES ====
    cluster_names_match = re.search(r'CLUSTER_NAMES:\s*(\{.*?\})', response_text, re.DOTALL)
    if cluster_names_match:
        cluster_names_str = cluster_names_match.group(1)
        try:
            cluster_names = json.loads(cluster_names_str)
        except json.JSONDecodeError:
            cluster_names = parse_complex_json(cluster_names_str)
    else:
        cluster_names = {}

    # ==== Extract NOTE_RECLUSTERED ====
    note_reclustered_match = re.search(r'NOTE_RECLUSTERED:\s*(\{.*?\})', response_text, re.DOTALL)
    if note_reclustered_match:
        note_reclustered_str = note_reclustered_match.group(1)
        try:
            note_reclustered = json.loads(note_reclustered_str)
        except json.JSONDecodeError:
            note_reclustered = parse_complex_json(note_reclustered_str)
    else:
        note_reclustered = {}

    # ==== Save NOTE_RECLUSTERED ====
    reclustered_path = os.path.join(LLM_RECLUSTER, f"{base_fname}_RECLUSTERED.json")
    with open(reclustered_path, 'w', encoding='utf-8') as f:
        json.dump(note_reclustered, f, indent=2, ensure_ascii=False)

    # ==== Update or create CLUSTER_NAMES_DICT.json ====
    if os.path.exists(CLUSTER_NAMES_DICT):
        with open(CLUSTER_NAMES_DICT, 'r', encoding='utf-8') as f:
            clusters_NAMES_dict = json.load(f)
    else:
        clusters_NAMES_dict = {}

    # Add new entries (overwrite if repeated)
    clusters_NAMES_dict.update(cluster_names)

    with open(CLUSTER_NAMES_DICT, 'w', encoding='utf-8') as f:
        json.dump(clusters_NAMES_dict, f, indent=2, ensure_ascii=False)

    print("Files saved:")
    print(f"  - {reclustered_path}")
    print(f"  - {CLUSTER_NAMES_DICT}")

    return reclustered_path, note_reclustered, CLUSTER_NAMES_DICT



def parse_complex_json(json_str):
    """Safer fallback parser."""
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        try:
            return ast.literal_eval(json_str)
        except:
            return {}

# test my function

In [63]:

response_test = """
REASONING:
The original clusters seem to be dividing the content based on specific techniques or models rather than topics, making it difficult for a search on a topic like "Decision Trees" or "Unsupervised Learning". To make these clusters more meaningful and reusable, I have merged the initial three clusters (0, 1, 2) which focus on Decision Trees, their properties, and related topics. Similarly, the next two clusters (3, 4) are grouped together as they discuss tree-related concepts like pruning and reducing variance. The final cluster (5) seems to stand alone, discussing regularization. The two remaining clusters (6, "unassigned") focus on Ridge and Lasso regression models and are merged into one, as both deal with the same topic of Regularization.
CLUSTER_NAMES:
{
  "cluster_0": "decision_trees",
  "cluster_1": "tree_related",
  "cluster_2": "regularization"
}
NOTE_RECLUSTERED:
{
  "cluster_0": [
    {
      "chunk_id": 0,
      "chunk": "lecture today was fast af. started with trees. entropy vs gini impurity — diff metrics to decide best split.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 1,
      "chunk": "both OK. CART = binary tree = each node has 2 splits.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 2,
      "chunk": "sklearn uses this.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 3,
      "chunk": "trees = interpretability good, but unstable to data changes.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 4,
      "chunk": "pruning = way to fix.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 5,
      "chunk": "early stopping or post-prune.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 6,
      "chunk": "reduces variance.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 7,
      "chunk": "RandomForest = multiple trees on bootstrapped samples + rand subset of features per split.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 8,
      "chunk": "hierarchical = dendrograms.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 9,
      "chunk": "trees don’t need scaling? bcz splits based on order not value. contrast w/ reg models.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 10,
      "chunk": "then prof jumped into regression.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ],
  "cluster_1": [
    {
      "chunk_id": 15,
      "chunk": "ElasticNet = mix of both — good if features correlated\ncode ex:\nfrom sklearn.linear_model import ElasticNet model = ElasticNet(alpha=0.1, l1_ratio=0.5) model.fit(X, y)\nimportant: scale features before fitting regularized models — otherwise magnitudes skew the penalty.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 17,
      "chunk": "k-means = most used.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 18,
      "chunk": "init centers, assign pts, recalc, repeat. problem: sensitive to init. use k-means++. elbow method not always clear. silhouette score better maybe. PCA + k-means often combined for vis. t-SNE only for viz — not for modeling. clusters in t-SNE are sometimes fake.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ],
  "cluster_2": [
    {
      "chunk_id": 19,
      "chunk": "DBSCAN = cluster via density.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 20,
      "chunk": "can detect noise. great for shape-agnostic clusters. params hard to tune tho.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 21,
      "chunk": "bagging helps = ensemble.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 16,
      "chunk": "StandardScaler or RobustScaler if outliers. tune α via cross-val — use GridSearchCV. early stopping or post-prune.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 12,
      "chunk": "Ridge = L2 norm = Σ(w²).",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 13,
      "chunk": "Lasso = L1 norm = Σ|w|.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 14,
      "chunk": "Ridge keeps all weights ≠ 0, Lasso can zero out → sparse. Lasso good for feature selection.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ]
}
"""


# my function save the cluster_names_dict , but the reclustered.json is empty.

In [None]:

reclustered_path, note_reclustered, CLUSTER_NAMES_DICT = parse_response_and_save(response_text=response_test, fname = "note_3_dt_r_ul_clusters.json")

Files saved:
  - ./data/outputs/LLM_recluster/note_3_dt_r_ul_clusters.json_RECLUSTERED.json
  - ./data/outputs/LLM_recluster/cluster_names_dict.json


# FUNCTION THAT WORKS:

In [67]:
import json
import re

def parse_response(response_text):

    
    # Extract CLUSTER_NAMES section
    cluster_names_match = re.search(r'CLUSTER_NAMES:\s*(\{[^}]*\})', response_text, re.DOTALL)
    if cluster_names_match:
        cluster_names_str = cluster_names_match.group(1)
        try:
            cluster_names = json.loads(cluster_names_str)
        except json.JSONDecodeError:
            # Handle potential formatting issues
            cluster_names = eval(cluster_names_str)
    else:
        cluster_names = {}
    
    # Extract NOTE_RECLUSTERED section
    note_reclustered_match = re.search(r'NOTE_RECLUSTERED:\s*(\{.*\})', response_text, re.DOTALL)
    if note_reclustered_match:
        note_reclustered_str = note_reclustered_match.group(1)
        try:
            note_reclustered = json.loads(note_reclustered_str)
        except json.JSONDecodeError:
            # Handle potential formatting issues - more complex parsing needed
            note_reclustered = parse_complex_json(note_reclustered_str)
    else:
        note_reclustered = {}
    
    return cluster_names, note_reclustered

def save_to_files(cluster_names, note_reclustered, output_dir="./"):
    """
    Save CLUSTER_NAMES and NOTE_RECLUSTERED to separate JSON files.
    

    """
    import os
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Save CLUSTER_NAMES
    cluster_names_file = os.path.join(output_dir, "cluster_names.json")
    with open(cluster_names_file, 'w', encoding='utf-8') as f:
        json.dump(cluster_names, f, indent=2, ensure_ascii=False)
    
    # Save NOTE_RECLUSTERED
    note_reclustered_file = os.path.join(output_dir, "note_reclustered.json")
    with open(note_reclustered_file, 'w', encoding='utf-8') as f:
        json.dump(note_reclustered, f, indent=2, ensure_ascii=False)
    
    print(f"Files saved:")
    print(f"  - {cluster_names_file}")
    print(f"  - {note_reclustered_file}")

def parse_complex_json(json_str):
    """
    Helper function to parse complex JSON with potential formatting issues.
    """
    try:
        # Try direct JSON parsing first
        return json.loads(json_str)
    except json.JSONDecodeError:
        # If that fails, try eval (be careful with this in production)
        try:
            return eval(json_str)
        except:
            # If all else fails, return empty dict
            return {}

# Example usage:
if __name__ == "__main__":
    # Your response text
    response = """
REASONING:
The original clusters seem to be dividing the content based on specific techniques or models rather than topics, making it difficult for a search on a topic like "Decision Trees" or "Unsupervised Learning". To make these clusters more meaningful and reusable, I have merged the initial three clusters (0, 1, 2) which focus on Decision Trees, their properties, and related topics. Similarly, the next two clusters (3, 4) are grouped together as they discuss tree-related concepts like pruning and reducing variance. The final cluster (5) seems to stand alone, discussing regularization. The two remaining clusters (6, "unassigned") focus on Ridge and Lasso regression models and are merged into one, as both deal with the same topic of Regularization.

CLUSTER_NAMES:
{
  "cluster_0": "decision_trees",
  "cluster_1": "tree_related",
  "cluster_2": "regularization"
}

NOTE_RECLUSTERED:
{
  "cluster_0": [
    {
      "chunk_id": 0,
      "chunk": "lecture today was fast af. started with trees. entropy vs gini impurity — diff metrics to decide best split.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 1,
      "chunk": "both OK. CART = binary tree = each node has 2 splits.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ],
  "cluster_1": [
    {
      "chunk_id": 15,
      "chunk": "ElasticNet = mix of both — good if features correlated",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ],
  "cluster_2": [
    {
      "chunk_id": 19,
      "chunk": "DBSCAN = cluster via density.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ]
}
"""
    
    # Parse the response
    cluster_names, note_reclustered = parse_response(response)
    
    # Save to separate files
    save_to_files(cluster_names, note_reclustered)
    
    print("CLUSTER_NAMES:")
    print(json.dumps(cluster_names, indent=2))
    print("\nNOTE_RECLUSTERED:")
    print(json.dumps(note_reclustered, indent=2))

Files saved:
  - ./cluster_names.json
  - ./note_reclustered.json
CLUSTER_NAMES:
{
  "cluster_0": "decision_trees",
  "cluster_1": "tree_related",
  "cluster_2": "regularization"
}

NOTE_RECLUSTERED:
{
  "cluster_0": [
    {
      "chunk_id": 0,
      "chunk": "lecture today was fast af. started with trees. entropy vs gini impurity \u2014 diff metrics to decide best split.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    },
    {
      "chunk_id": 1,
      "chunk": "both OK. CART = binary tree = each node has 2 splits.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ],
  "cluster_1": [
    {
      "chunk_id": 15,
      "chunk": "ElasticNet = mix of both \u2014 good if features correlated",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ],
  "cluster_2": [
    {
      "chunk_id": 19,
      "chunk": "DBSCAN = cluster via density.",
      "metadata": {
        "source": "note_3_dt_r_ul.txt"
      }
    }
  ]


### 8. Generate sub-notes from each cluster

In [16]:
# def split_clusters_to_formatted_subnotes(reclustered_file_path=LLM_RECLUSTER, cluster_names_dict=CLUSTER_NAMES_DICT, output_folder=SUBNOTES):
#     """Split reclustered JSON file by cluster and save as formatted text files with original filename"""
    
#     # Get original filename base
#     original_fname = get_original_filename_from_clusters(reclustered_file_path)
    
#     # Load the JSON file
#     with open(reclustered_file_path, 'r', encoding='utf-8') as f:
#         data = json.load(f)
    
#     formatted_subnotes = []
    
#     # Process each cluster
#     for cluster_id, chunks in data.items():
#         # Get cluster name from cluster_names_dict
#         cluster_name = cluster_names_dict.get(cluster_id, cluster_id)
        
#         # Create the output file path with original filename preserved
#         output_filename = f"{original_fname}_{cluster_name}.txt"
#         output_file = os.path.join(output_folder, output_filename)
        
#         # Format the content with simple bullet points
#         with open(output_file, 'w', encoding='utf-8') as f:
#             f.write(f"Topic: {cluster_name}\n")
#             f.write("="*50 + "\n\n")
            
#             # Write each chunk as a simple bullet point
#             for chunk_data in chunks:
#                 chunk_text = chunk_data.get('chunk', '').strip()
#                 if chunk_text:  # Only write non-empty chunks
#                     f.write(f"• {chunk_text}\n")
            
#             f.write(f"\n[Total: {len(chunks)} notes]\n")

#         formatted_subnotes.append(output_file)
#         print(f"Created {output_file} with {len(chunks)} chunks")
    
#     print(f"\nAll formatted cluster files saved to '{output_folder}' folder")
#     return formatted_subnotes

# Processing pipeline

In [None]:
# ### MAIN PROCESSING PIPELINE
def process_file(fname):
    """Process a single file through the complete pipeline."""
    print(f"\n\n=============== PROCESSING FILE: {fname} ===============")

    # Step 1: Read file
    with open(os.path.join(FOLDER_PATH, fname), "r", encoding="utf-8") as f:
        text = f.read()

    # Step 2: Create chunks
    chunks, metadata = chunk_text(text, fname, inspect=True)        # Split text into chunks
    print(f"\n=== Created {len(chunks)} chunks")

    # Step 3: Store in vector database
    vector_manager = VectorStoreManager()    # Initialize vector store
    vector_manager.add_chunks(chunks, metadata)                     # Add chunks with embeddings
    print(f"\n=== Stored {len(chunks)} chunks in Chroma DB")
    
    # Step 3.1: Mark as processed
    with open(LOG_PATH, "a") as log:                                # Log processed file
        log.write(fname + "\n")

    # Step 4: Cluster chunks
    print(f"\n=== CLUSTERING CHUNKS FROM {fname}")
    chunks, embeddings, metadatas = vector_manager.get_chunks_with_embeddings(fname) # Retrieve stored data

    clusterizer = ChunkClusterizer(chunks, embeddings, metadatas, fname)             # Initialize clustering
    clusterizer.cluster_chunks()                                            # Perform clustering
    clusterizer.save_clusters()                                             # Save clusters

    # STEP 5: Read clustered JSON file
    clusters_path = os.path.join(CLUSTER_DIR, fname.replace(".txt", "_clusters.json"))
    note_clusters = read_note_clusters(clusters_path)

    # STEP 6: Query LLM with SYSTEM_PROMPT_RECLUSTER
    model = "mistral"
    prompt = SYSTEM_PROMPT_RECLUSTER + f"\n\nHERE IS THE CLUSTER DATA FROM {fname}:\n" + note_clusters + "\n\nPLEASE PERFORM THE RECLUSTERING AND CLUSTER NAMING TASKS."
    response = query_ollama(model, prompt)

    # STEP 6.1: Display reasoning(Markdown)
    reasoning_part = response.split("CLUSTER_NAMES:")[0]
    display(Markdown(reasoning_part.strip()))

    # STEP 7: Parse and save JSON outputs
    cluster_names_dict_file, note_reclustered, note_reclustered_file = parse_response_and_save(response_text=response, fname=fname)



def process_all_new():
    """Process all new files in the input directory."""
    new_files = get_new_files()                                     # Get list of unprocessed files
    
    if not new_files:                                               # Exit early if no new files
        print("\n=== No new files to process.")
        return
    
    print(f"\n=== Found {len(new_files)} new files: {new_files}")
    
    for fname in new_files:                                         # Process each new file
        process_file(fname)
    
    print(f"\n=== Completed processing {len(new_files)} files")




### FOR LATER: "MAIN GUARD" OR "ENTRY POINT PATTERN" 
# EXECUTION 
#if __name__ == "__main__":
#    process_all_new()                                               # Run the complete pipeline

#### Option 1: Process all new files

In [21]:
process_all_new()


=== Found 2 new files: ['note_1_ul.txt', 'note_3_dt_r_ul.txt']



=== Created 20 chunks

=== Loading existing Chroma DB from ./data/chroma_db

=== Stored 20 chunks in Chroma DB

=== CLUSTERING CHUNKS FROM note_1_ul.txt

=== Found 4 clusters

=== Cluster distribution ===
Noise: 2 chunks
Cluster 0: 3 chunks
Cluster 1: 3 chunks
Cluster 2: 9 chunks
Cluster 3: 3 chunks

=== Noise ===


**Chunk 0:** unsup = no labels.

**Chunk 3:** elbow method = plot inertia vs k — look for bend, but not always obvious.


=== Cluster 0 ===


**Chunk 12:** tSNE/UMAP for 2D plot = better for human eye but not for modeling.

**Chunk 13:** PCA = linear, tSNE = non-linear.

**Chunk 14:** tSNE distorts structure globally. good for pattern discovery.


=== Cluster 1 ===


**Chunk 9:** dendrogram = tree of merges.

**Chunk 10:** can "cut" tree at diff levels = diff num clusters.

**Chunk 17:** if data has diff density or shapes → fails.


=== Cluster 2 ===


**Chunk 1:** kmeans = simplest one but still used a lot.

**Chunk 5:** alt metric = silhouette score — between -1 and 1. close to 1 = well-clustered.

**Chunk 6:** clustering ≠ classification. labels are not known. use cases: market segmentation, gene expr clustering, anomaly detection (esp dbscan). DBSCAN better for weird shapes, dense clusters — uses eps + min_samples. tricky to tune tho. forms clusters based on density, noisy pts marked as outliers (label -1).

**Chunk 7:** sklearn DBSCAN ex: from sklearn.cluster import DBSCAN model = DBSCAN(eps=0.5, min_samples=5) model.fit(X)
hierarchical clustering = agglomerative or divisive — we focus on bottom-up (agglomerative).

**Chunk 11:** linkage: single, complete, avg. sklearn has AgglomerativeClustering. before clustering, can reduce dim (PCA) for speed + viz.

**Chunk 15:** spectral clustering = build similarity graph → Laplacian → eigenvectors → k-means in lower-dim eigenspace.

**Chunk 16:** nice when structure is graphy, not spherical. most clustering algos rely on distance metric — Euclidean default. alt: cosine sim (for text), manhattan, etc. eval: hard bcz no true label. silhouette best for most. DB index too. compare within/between cluster distance. can also visualize clusters to judge quality. problem: k-means assumes spherical clusters, equal size. not true for real-world.

**Chunk 18:** open Q: how to know if clusters mean anything in real world? scaling is essential — StandardScaler or MinMaxScaler from sklearn.

**Chunk 19:** pipeline ex:
from sklearn.pipeline import make_pipeline pipe = make_pipeline(StandardScaler(), KMeans(n_clusters=3)) pipe.fit(X)


=== Cluster 3 ===


**Chunk 2:** init k centroids randomly (k-means++ better), assign pts, recalc centroids, repeat. converge when no pt changes. but result depends on init + scale. scale important! feature w bigger range dominates dist calc — always standardize first.

**Chunk 4:** inertia = sum of dist² to centroid.

**Chunk 8:** start w all pts as indiv cluster, merge closest pairs step by step.


=== Saved JSON to: ./data/cluster_dir/note_1_ul_clusters.json
OLLAMA STDERR: [?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?20

Reasoning:
The original cluster splits the note by the order of appearance, but some topics are not separated appropriately. For example, all KMeans content is scattered across different clusters instead of being grouped together. I will merge and split clusters to create meaningful sub-notes by topic.


=== Saved JSON to: ./data/outputs/LLM_recluster/CLUSTER_NAMES_DICT.json

=== Saved JSON to: ./data/outputs/LLM_recluster/note_1_ul_RECLUSTERED.json
Files saved:
  - ./data/outputs/LLM_recluster/CLUSTER_NAMES_DICT.json
  - ./data/outputs/LLM_recluster/note_1_ul_RECLUSTERED.json


ValueError: too many values to unpack (expected 2)

#### Option 2: Process 1 file

In [None]:
#process_file(".txt")