# Final Project: NeuroDump - thoughts organizer

## 1. Environment setup

### 1.1. Environment

In [1]:
#!pip install -r requirements.txt --- UPDATE THIS AT THE END

### 1.2. Tools and Libraries

### 1.3. Custom functions

# ### IMPORT

In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
import re
import nltk
from transformers import AutoTokenizer
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from pathlib import Path
from IPython.display import Markdown, display
import hdbscan
import numpy as np

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 2. Craft pipeline

   1. Input unstructured document file
      1. Config paths
      2. Check if the input is already processed or is a new file
      3. Only continue if it's a new file
   2. Split document into chunks of tokens
   3. Embed all chunks
      1. sentence-transformers (all-MiniLM-L6-v2)
      2. save embeddings into Chroba DB
      3. keep metada to track original document file for each chunk
   4. Cluster the embeddings by semantic similarity (based on all existing vectors in Chroma DB, not only from new file)
   5. Label each cluster based on existing labels in Chroma DB, if no match, create a new label
      1. maybe prompt to LLM to generate labels?
   6. Rewrite each labeled-cluster to connect all the chunks and create a coherent sub-document 
      1. use LLM for that.
      2. output should be like: sub-document 1 (label 1), sub-document 2 (label 2)
   7. Store each sub-document in correspondent label as a new file
      1. For test, generate a new folder locally and save the sub-document as .txt file in the folder with the correspondent label
      2. Use Notion API, the sub-document should be a new file in a Notion database, with the correpondent Label and metadata
   8. Add more documents in bulk and generate insights for simple EDA
       1.  Number of labels, sub-documents in each label, etc.

### 1. Input unstructured document
Incremental version. It will process only new files.

In [3]:
# Config paths
FOLDER_PATH = "./data/mock_notes/"          #files
LOG_PATH = "./data/processed_files.txt"     #list of processed files
PERSIST_DIR = "./data/chroma_db"            #embeddings database (store vectorized chunks)

# Create directories if they don't exist
os.makedirs(FOLDER_PATH, exist_ok=True)
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)

In [4]:
# Initialize embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

#### 1.2. Check if the input is already processed or is a new file

In [5]:
def get_processed_files():
    """Load list of already processed files from log."""
    if os.path.exists(LOG_PATH):
        with open(LOG_PATH, "r") as f:
            return set(line.strip() for line in f)
    return set()


def get_new_files():
    """Get list of new .txt files that haven't been processed yet."""
    if not os.path.exists(FOLDER_PATH):
        return []
    
    all_files = [f for f in os.listdir(FOLDER_PATH) if f.endswith(".txt")]
    processed_files = get_processed_files()
    return [f for f in all_files if f not in processed_files]

### 2. Split document into chunks of tokens

In [6]:
def count_tokens(text):
    """Count tokens in text using the embedding model's tokenizer."""
    return len(tokenizer.tokenize(text))

def is_code_like(sentence):
    """Detect if a sentence contains code-like patterns."""
    code_keywords = ["from ", "import ", "def ", "return ", "class ", "for ", "if ", "else", "while ", "try:", "except", "print("]
    code_symbols = ["=", "()", "[]", "{}", "->", "::"]
    sentence_lower = sentence.strip().lower()
    return (
        any(sentence_lower.startswith(k) for k in code_keywords) or
        any(sym in sentence for sym in code_symbols)
    )


# ### CHUNKING LOGIC
def chunk_text(text, source_name, min_tokens=40, max_tokens=100, inspect=False):
    """Split text into semantic chunks with token limits."""
    raw_blocks = re.split(r"\n\s*\n", text.strip())                    # Split by double newlines
    chunks, current_chunk, current_tokens = [], [], 0                  # Initialize chunking variables
    metadata = []                                                      # Store metadata for each chunk

    for block in raw_blocks:                                           # Process each text block
        sentences = nltk.sent_tokenize(block.strip())                  # Split block into sentences

        for sentence in sentences:                                     # Process each sentence
            sentence = sentence.strip()
            tokens = count_tokens(sentence)                            # Count tokens in sentence

            # Handle code-like sentences separately
            if is_code_like(sentence):                                 # Check if sentence contains code
                if current_chunk:                                      # Save current chunk if exists
                    chunks.append(" ".join(current_chunk))
                    metadata.append({"source": source_name})
                    current_chunk, current_tokens = [], 0
                chunks.append(sentence)                                # Add code as separate chunk
                metadata.append({"source": source_name})
                continue

            # Handle conversation transitions
            if any(sentence.lower().startswith(w) for w in ["btw", "wait", "oh", "then", "also", "now", "next"]):
                if current_chunk:                                      # Break chunk at transition words
                    chunks.append(" ".join(current_chunk))
                    metadata.append({"source": source_name})
                    current_chunk, current_tokens = [], 0

            # Check if adding sentence would exceed max tokens
            if current_tokens + tokens > max_tokens and current_tokens >= min_tokens:
                chunks.append(" ".join(current_chunk))                # Save current chunk
                metadata.append({"source": source_name})
                current_chunk, current_tokens = [], 0                 # Reset for new chunk

            current_chunk.append(sentence)                            # Add sentence to current chunk
            current_tokens += tokens                                  # Update token count

        # Save any remaining chunk after block processing
        if current_chunk:
            chunks.append(" ".join(current_chunk))
            metadata.append({"source": source_name})
            current_chunk, current_tokens = [], 0

    if inspect:                                                       # Optional: print chunks for debugging
        for i, chunk in enumerate(chunks):
            print(f"\nChunk {i+1} ({count_tokens(chunk)} tokens):\n{chunk}")

    return chunks, metadata

### 3. Embed all chunks

#### 3.1. Initialize sentence-transformer model

In [7]:
#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

#### 3.2. Save embeddings into Chroma DB

In [8]:
# ### VECTOR STORE MANAGEMENT
class VectorStoreManager:
    def __init__(self, persist_dir, embeddings):
        self.persist_dir = persist_dir                                # Store database directory path
        self.embeddings = embeddings                                  # Store embedding model
        self.vectorstore = self._load_existing()                     # Load existing DB or set to None

    def _load_existing(self):
        """Load existing Chroma database if it exists."""
        try:
            if os.path.exists(self.persist_dir) and os.listdir(self.persist_dir):
                print(f"Loading existing Chroma DB from {self.persist_dir}")
                return Chroma(
                    persist_directory=self.persist_dir,
                    embedding_function=self.embeddings,
                    collection_name="default"
                )
        except Exception as e:
            print(f"Error loading existing Chroma DB: {e}")
        
        print("No existing Chroma DB found. Will create new one.")
        return None

    def add_chunks(self, chunks, metadata):
        """Add text chunks to vector store with embeddings."""
        if not self.vectorstore:                                     # Create new database if none exists
            print(f"Creating new Chroma DB at {self.persist_dir}")
            self.vectorstore = Chroma.from_texts(
                texts=chunks,
                metadatas=metadata,
                embedding=self.embeddings,
                persist_directory=self.persist_dir,
                collection_name="default"
            )
        else:                                                        # Add to existing database
            self.vectorstore.add_texts(texts=chunks, metadatas=metadata)

    def get_chunks_with_embeddings(self, fname):
        """Retrieve chunks and embeddings for a specific file."""
        results = self.vectorstore.get(                              # Query database for file chunks
            where={"source": fname},
            include=["documents", "embeddings"]
        )
        
        chunks = results.get('documents', [])                       # Extract text chunks
        embeddings_list = results.get('embeddings', [])             # Extract embeddings
        
        # Fallback: compute embeddings if not stored
        if not embeddings_list:                                      # Generate embeddings if missing
            print(f"Computing embeddings for {fname}...")
            embeddings_list = [self.embeddings.embed_query(chunk) for chunk in chunks]
        
        return chunks, embeddings_list

### 4. Cluster the embeddings by semantic similarity

In [9]:
# ### CLUSTERING LOGIC
class ChunkClusterizer:
    def __init__(self, chunks, chunk_embeddings):
        self.chunks = chunks                                         # Store text chunks
        self.embeddings = np.array(chunk_embeddings)                 # Convert embeddings to numpy array for ML operations
        self.labels = []                                             # Initialize cluster labels

    def cluster_chunks(self, min_cluster_size=2, min_samples=1):
        """Cluster chunks using HDBSCAN algorithm."""
        print("Clustering chunks with HDBSCAN...")
        
        # Handle edge cases
        if len(self.chunks) < min_cluster_size:                      # Not enough chunks to cluster
            print(f"Not enough chunks to cluster ({len(self.chunks)} < {min_cluster_size})")
            self.labels = [-1] * len(self.chunks)                   # Assign all to noise
            return self.labels

        # Ensure embeddings are 2D
        if self.embeddings.ndim == 1:                               # Reshape 1D embeddings to 2D
            self.embeddings = self.embeddings.reshape(1, -1)
            
        print(f"Clustering {len(self.chunks)} chunks with embeddings shape: {self.embeddings.shape}")
            

        # Perform clustering
        clusterizer = hdbscan.HDBSCAN(                                     # Initialize HDBSCAN clustering algorithm
            min_cluster_size=min_cluster_size,                             # Minimum number of points in a cluster
            min_samples=min_samples,                                       # Minimum samples in neighborhood for core point
            metric='euclidean'                                             # Distance metric for clustering
        )
        self.labels = clusterizer.fit_predict(self.embeddings)             # Fit model and predict cluster labels = generate cluster labels

        # Display results
        num_clusters = len(set(self.labels)) - (1 if -1 in self.labels else 0)  # Count clusters (exclude noise)
        print(f"Found {num_clusters} clusters")
        print(f"Cluster distribution: {dict(zip(*np.unique(self.labels, return_counts=True)))}")
            
        # Show cluster contents
        for cluster_id in sorted(set(self.labels)):                 # Iterate through each cluster
            cluster_name = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"
            chunk_count = sum(1 for label in self.labels if label == cluster_id)
            print(f"\n--- {cluster_name} ({chunk_count} chunks) ---")
            
            for i, label in enumerate(self.labels):                 # Show chunks in this cluster
                if label == cluster_id:
                    preview = self.chunks[i][:200] + ('...' if len(self.chunks[i]) > 200 else '')
                    print(f"Chunk {i+1}: {preview}\n")
            
        return self.labels                                                # Return cluster labels

### PROCESSING PIPELINE - steps 1 to 4 for now...

In [10]:
# ### MAIN PROCESSING PIPELINE
def process_file(fname):
    """Process a single file through the complete pipeline."""
    print(f"\n=== Processing file: {fname} ===")

    # Step 1: Read file
    with open(os.path.join(FOLDER_PATH, fname), "r", encoding="utf-8") as f:
        text = f.read()

    # Step 2: Create chunks
    chunks, metadata = chunk_text(text, fname, inspect=True)        # Split text into chunks
    print(f"Created {len(chunks)} chunks")

    # Step 3: Store in vector database
    vector_manager = VectorStoreManager(PERSIST_DIR, embeddings)    # Initialize vector store
    vector_manager.add_chunks(chunks, metadata)                     # Add chunks with embeddings
    print(f"Stored {len(chunks)} chunks in vector database")
    
    # Step 4: Mark as processed
    with open(LOG_PATH, "a") as log:                                # Log processed file
        log.write(fname + "\n")

    # Step 5: Cluster chunks
    print(f"\n--- Clustering chunks from {fname} ---")
    new_chunks, new_embeddings = vector_manager.get_chunks_with_embeddings(fname)  # Retrieve stored data
    
    clusterizer = ChunkClusterizer(new_chunks, new_embeddings)      # Initialize clustering
    clusterizer.cluster_chunks()                                    # Perform clustering
    


def process_all_new():
    """Process all new files in the input directory."""
    new_files = get_new_files()                                     # Get list of unprocessed files
    
    if not new_files:                                               # Exit early if no new files
        print("No new files to process.")
        return
    
    print(f"Found {len(new_files)} new files: {new_files}")
    
    for fname in new_files:                                         # Process each new file
        process_file(fname)
    
    print(f"\n=== Completed processing {len(new_files)} files ===")




### FOR LATER: "MAIN GUARD" OR "ENTRY POINT PATTERN" 
# EXECUTION 
#if __name__ == "__main__":
#    process_all_new()                                               # Run the complete pipeline

### Option 1: Process all new files

In [11]:
process_all_new()

Found 3 new files: ['note-3-dt-r-ul.txt', 'note-1-dt-r.txt', 'note-2-ul.txt']

=== Processing file: note-3-dt-r-ul.txt ===

Chunk 1 (26 tokens):
lecture today was fast af. started with trees. entropy vs gini impurity — diff metrics to decide best split.

Chunk 2 (14 tokens):
both OK. CART = binary tree = each node has 2 splits.

Chunk 3 (6 tokens):
sklearn uses this.

Chunk 4 (47 tokens):
code: from sklearn.tree import DecisionTreeClassifier model = DecisionTreeClassifier(criterion='entropy', max_depth=4)
trees prone to overfit — esp if depth unbounded.

Chunk 5 (8 tokens):
pruning = way to fix.

Chunk 6 (8 tokens):
early stopping or post-prune.

Chunk 7 (6 tokens):
bagging helps = ensemble.

Chunk 8 (19 tokens):
RandomForest = multiple trees on bootstrapped samples + rand subset of features per split.

Chunk 9 (3 tokens):
reduces variance.

Chunk 10 (12 tokens):
trees = interpretability good, but unstable to data changes.

Chunk 11 (6 tokens):
then prof jumped into regression.

Chunk 

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### Option 2: Process 1 file

In [None]:
#process_file("note-1-dt-r.txt")

## Continue pipeline...
   5. Label each cluster based on existing labels in Chroma DB, if no match, create a new label
      1. maybe prompt to LLM to generate labels?
   6. Rewrite each labeled-cluster to connect all the chunks and create a coherent sub-document 
      1. use LLM for that.
      2. output should be like: sub-document 1 (label 1), sub-document 2 (label 2)
   7. Store each sub-document in correspondent label as a new file
      1. For test, generate a new folder locally and save the sub-document as .txt file in the folder with the correspondent label
      2. Use Notion API, the sub-document should be a new file in a Notion database, with the correpondent Label and metadata
   8. Add more documents in bulk and generate insights for simple EDA
       1.  Number of labels, sub-documents in each label, etc.

In [None]:
# 5. Label clusters with classifier

In [None]:
# 6. Rewrite selected outputs with [llama 3.2]

In [None]:
#    7. Store each sub-document in correspondent label as a new file
#      1. For test, generate a new folder locally and save the sub-document as .txt file in the folder with the correspondent label


In [None]:
# 8.    8. Add more documents in bulk and generate insights for simple EDA
