# Final Project: NeuroDump - thoughts organizer

## 1. Environment setup

### 1.1. Environment

In [1]:
#!pip install -r requirements.txt --- UPDATE THIS AT THE END

### 1.2. Tools and Libraries

### 1.3. Custom functions

# ### IMPORT

In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
import re
import nltk
from transformers import AutoTokenizer
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from pathlib import Path
from IPython.display import Markdown, display
import hdbscan
import numpy as np

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 2. Craft pipeline

   1. Input unstructured document file
      1. Config paths
      2. Check if the input is already processed or is a new file
      3. Only continue if it's a new file
   2. Split document into chunks of tokens
   3. Embed all chunks
      1. sentence-transformers (all-MiniLM-L6-v2)
      2. save embeddings into Chroba DB
      3. keep metada to track original document file for each chunk
   4. Cluster the embeddings by semantic similarity (based on all existing vectors in Chroma DB, not only from new file)
   5. Label each cluster based on existing labels in Chroma DB, if no match, create a new label
      1. maybe prompt to LLM to generate labels?
   6. Rewrite each labeled-cluster to connect all the chunks and create a coherent sub-document 
      1. use LLM for that.
      2. output should be like: sub-document 1 (label 1), sub-document 2 (label 2)
   7. Store each sub-document in correspondent label as a new file
      1. For test, generate a new folder locally and save the sub-document as .txt file in the folder with the correspondent label
      2. Use Notion API, the sub-document should be a new file in a Notion database, with the correpondent Label and metadata
   8. Add more documents in bulk and generate insights for simple EDA
       1.  Number of labels, sub-documents in each label, etc.

### 1. Input unstructured document
Incremental version. It will process only new files.

In [4]:
# Config paths

FOLDER_PATH = "./data/mock_notes/"          #files
LOG_PATH = "./data/processed_files.txt"     #list of processed files
PERSIST_DIR = "./data/chroma_db"            #embeddings database (store vectorized chunks)

# Create directories if they don't exist
os.makedirs(FOLDER_PATH, exist_ok=True)
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)

#### 1.2. Check if the input is already processed or is a new file

In [5]:
def get_processed_files():
    if os.path.exists(LOG_PATH):
        with open(LOG_PATH, "r") as f:
            return set(line.strip() for line in f)
    return set()


def get_new_files():
    if not os.path.exists(FOLDER_PATH):
        print(f"Folder {FOLDER_PATH} does not exist.")
        return []
    all_files = [f for f in os.listdir(FOLDER_PATH) if f.endswith(".txt")]
    processed_files = get_processed_files()
    return [f for f in all_files if f not in processed_files]

### 2. Split document into chunks of tokens

In [6]:
# Tokenizer & chunk helpers
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def count_tokens(text):
    return len(tokenizer.tokenize(text))

def is_code_like(sentence):
    code_keywords = ["from ", "import ", "def ", "return ", "class ", "for ", "if ", "else", "while ", "try:", "except", "print("]
    code_symbols = ["=", "()", "[]", "{}", "->", "::"]
    sentence_lower = sentence.strip().lower()
    return (
        any(sentence_lower.startswith(k) for k in code_keywords)
        or any(sym in sentence for sym in code_symbols)
    )


# Generate chunks
def chunk_text(text, source_name, min_tokens=40, max_tokens=100, inspect=False):
    raw_blocks = re.split(r"\n\s*\n", text.strip())
    chunks, current_chunk, current_tokens = [], [], 0
    metadata = []

    for block in raw_blocks:
        sentences = nltk.sent_tokenize(block.strip())

        for sentence in sentences:
            sentence = sentence.strip()
            tokens = count_tokens(sentence)

            if is_code_like(sentence):
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    metadata.append({"source": source_name})
                    current_chunk, current_tokens = [], 0
                chunks.append(sentence)
                metadata.append({"source": source_name})
                continue

            if any(sentence.lower().startswith(w) for w in ["btw", "wait", "oh", "then", "also", "now", "next"]):
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    metadata.append({"source": source_name})
                    current_chunk, current_tokens = [], 0

            if current_tokens + tokens > max_tokens and current_tokens >= min_tokens:
                chunks.append(" ".join(current_chunk))
                metadata.append({"source": source_name})
                current_chunk, current_tokens = [], 0

            current_chunk.append(sentence)
            current_tokens += tokens

        if current_chunk:
            chunks.append(" ".join(current_chunk))
            metadata.append({"source": source_name})
            current_chunk, current_tokens = [], 0

    if inspect:
        for i, chunk in enumerate(chunks):
            print(f"\nChunk {i+1} ({count_tokens(chunk)} tokens):\n{chunk}")

    return chunks, metadata

### 3. Embed all chunks

#### 3.1. Initialize sentence-transformer model

In [7]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

#### 3.2. Save embeddings into Chroma DB

In [8]:
class VectorStoreManager:
    def __init__(self, persist_dir, embeddings):
        self.persist_dir = persist_dir
        self.embeddings = embeddings
        self.vectorstore = self._load_existing()

    def _load_existing(self):
        """Attempts to load an existing Chroma DB from the persist directory."""
        try:
            if os.path.exists(self.persist_dir) and os.listdir(self.persist_dir):
                print(f"Loading existing Chroma DB from {self.persist_dir}")
                return Chroma(
                    persist_directory=self.persist_dir,
                    embedding_function=self.embeddings,
                    collection_name="default"  # Add this line
                )
        except Exception as e:
            print(f"Error loading existing Chroma DB: {e}")
            print("Will create a new one.")
        
        print("\n-> No existing Chroma DB found. A new one will be created when chunks are added.")
        return None

    def add_chunks(self, chunks, metadata):
        """Adds chunks to the vector store, creating it if it doesn't exist."""
        if not chunks:
            print("No chunks to add.")
            return

        if not self.vectorstore:
            print(f"\n-> Creating new Chroma DB at {self.persist_dir}")
            self.vectorstore = Chroma.from_texts(
                texts=chunks,
                metadatas=metadata,
                embedding=self.embeddings,  # Changed from embedding_function
                persist_directory=self.persist_dir,
                collection_name="default"   # Add this line
            )
        else:
            self.vectorstore.add_texts(texts=chunks, metadatas=metadata)

    def get_chunks_by_file(self, fname):
        """Retrieves all chunks associated with a specific source file."""
        if not self.vectorstore:
            print("Vectorstore is not initialized. Cannot retrieve chunks.")
            return []
        results = self.vectorstore.get(where={"source": fname})
        return results.get('documents', []) # Safely get documents

    def get_all_chunks(self):
        """Retrieves all chunks from the vector store."""
        if not self.vectorstore:
            print("Vectorstore is not initialized. Cannot retrieve chunks.")
            return []
        results = self.vectorstore.get(include=["documents"])
        return results.get('documents', [])
    
    def get_chunks_with_embeddings(self, fname):
        """Retrieves chunks and their embeddings for a specific file."""
        if not self.vectorstore:
            print("Vectorstore is not initialized. Cannot retrieve chunks.")
            return [], []
        
        try:
            results = self.vectorstore.get(
                where={"source": fname},
                include=["documents", "embeddings"]
            )
            chunks = results.get('documents', [])
            embeddings_list = results.get('embeddings', [])
            
            if not chunks:
                print(f"No chunks found for file: {fname}")
                return [], []
            
            if embeddings_list is None or len(embeddings_list) == 0:
                print(f"No embeddings found for file: {fname}. Computing embeddings...")
                # Fallback: compute embeddings manually if not stored
                embeddings_list = [self.embeddings.embed_query(chunk) for chunk in chunks]
            
            return chunks, embeddings_list
        except Exception as e:
            print(f"Error retrieving chunks with embeddings for file {fname}: {e}")
            return [], []

### 4. Cluster the embeddings by semantic similarity

In [9]:
class ChunkClusterizer:
    def __init__(self, chunks, chunk_embeddings):
        self.chunks = chunks
        self.embeddings = np.array(chunk_embeddings) if chunk_embeddings else None
        self.labels = []

    def cluster_chunks(self, min_cluster_size=2, min_samples=1):
        print("Clustering chunks with HDBSCAN...")
        
        if not self.chunks:
            print("No chunks to cluster.")
            return []
            
        if len(self.chunks) < min_cluster_size:
            print(f"Not enough chunks to cluster. Need at least {min_cluster_size}, but have {len(self.chunks)}.")
            self.labels = [-1] * len(self.chunks)
            return self.labels

        # Fix: Check numpy array properly
        if self.embeddings is None or self.embeddings.size == 0:
            print("No embeddings available for clustering.")
            self.labels = [-1] * len(self.chunks)
            return self.labels

        try:
            # Ensure embeddings is a 2D array
            if self.embeddings.ndim == 1:
                self.embeddings = self.embeddings.reshape(1, -1)
            
            print(f"Clustering {len(self.chunks)} chunks with embeddings shape: {self.embeddings.shape}")
            
            clusterizer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size, 
                min_samples=min_samples,
                metric='euclidean'
            )
            self.labels = clusterizer.fit_predict(self.embeddings)

            num_clusters = len(set(self.labels)) - (1 if -1 in self.labels else 0)
            print(f"\nFound {num_clusters} clusters in the document.")
            print(f"Cluster distribution: {dict(zip(*np.unique(self.labels, return_counts=True)))}")
            
            for cluster_id in sorted(set(self.labels)):
                cluster_name = "Noise (ungrouped chunks)" if cluster_id == -1 else f"Cluster {cluster_id}"
                chunk_count = sum(1 for label in self.labels if label == cluster_id)
                print(f"\n--- {cluster_name} ({chunk_count} chunks) ---")
                
                for i, label in enumerate(self.labels):
                    if label == cluster_id:
                        preview = self.chunks[i][:200] + ('...' if len(self.chunks[i]) > 200 else '')
                        print(f"Chunk {i+1}: {preview}\n")
            
            return self.labels
            
        except Exception as e:
            print(f"Error during clustering: {e}")
            self.labels = [-1] * len(self.chunks)
            return self.labels

### Entry point to run steps 1 to 4

In [10]:
def process_file(fname):
    if fname in get_processed_files():
        print(f"Skipping {fname} (already processed).")
        return

    print(f"\nProcessing new file: {fname}")
    # 1. Read text
    with open(os.path.join(FOLDER_PATH, fname), "r", encoding="utf-8") as f:
        text = f.read()

    # 2. Create chunks
    chunks, metadata = chunk_text(text, fname, inspect=True)

    # 3. Initialize vector manager
    vector_manager = VectorStoreManager(PERSIST_DIR, embeddings)
    vector_manager.add_chunks(chunks, metadata)

    # 4. Mark file as processed 
    with open(LOG_PATH, "a") as log:
        log.write(fname + "\n")

    print(f"\nAdded {len(chunks)} chunks from {fname} to Chroma DB.")

    # 5. Cluster chunks from new file
    print(f"\n--- Starting Clustering Process for '{fname}' ---")

    # Retrieve the chunks AND embeddings for the file just processed
    new_chunks, new_embeddings = vector_manager.get_chunks_with_embeddings(fname)

    
    if new_chunks and new_embeddings and a.any(len(emb) > 0 for emb in new_embeddings):
        # Pass the chunks and their pre-computed embeddings to the clusterizer
        clusterizer = ChunkClusterizer(new_chunks, new_embeddings)
        clusterizer.cluster_chunks()
    else:
        print(f"Could not retrieve chunks for {fname} to cluster.")    
    


def process_all_new():
    new_files = get_new_files()
    if not new_files:
        print("No new files to process.")
        return
    
    print(f"Found {len(new_files)} new files to process: {new_files}")
    
    for f in new_files:
        process_file(f)
    print("\nAll new files processed.")

### Option 1: Process all new files

In [11]:
process_all_new()

Found 3 new files to process: ['note-3-dt-r-ul.txt', 'note-1-dt-r.txt', 'note-2-ul.txt']

Processing new file: note-3-dt-r-ul.txt

Chunk 1 (26 tokens):
lecture today was fast af. started with trees. entropy vs gini impurity — diff metrics to decide best split.

Chunk 2 (14 tokens):
both OK. CART = binary tree = each node has 2 splits.

Chunk 3 (6 tokens):
sklearn uses this.

Chunk 4 (47 tokens):
code: from sklearn.tree import DecisionTreeClassifier model = DecisionTreeClassifier(criterion='entropy', max_depth=4)
trees prone to overfit — esp if depth unbounded.

Chunk 5 (8 tokens):
pruning = way to fix.

Chunk 6 (8 tokens):
early stopping or post-prune.

Chunk 7 (6 tokens):
bagging helps = ensemble.

Chunk 8 (19 tokens):
RandomForest = multiple trees on bootstrapped samples + rand subset of features per split.

Chunk 9 (3 tokens):
reduces variance.

Chunk 10 (12 tokens):
trees = interpretability good, but unstable to data changes.

Chunk 11 (6 tokens):
then prof jumped into regression.


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### Option 2: Process 1 file

In [None]:
#process_file("note-1-dt-r.txt")

## Continue pipeline...
   5. Label each cluster based on existing labels in Chroma DB, if no match, create a new label
      1. maybe prompt to LLM to generate labels?
   6. Rewrite each labeled-cluster to connect all the chunks and create a coherent sub-document 
      1. use LLM for that.
      2. output should be like: sub-document 1 (label 1), sub-document 2 (label 2)
   7. Store each sub-document in correspondent label as a new file
      1. For test, generate a new folder locally and save the sub-document as .txt file in the folder with the correspondent label
      2. Use Notion API, the sub-document should be a new file in a Notion database, with the correpondent Label and metadata
   8. Add more documents in bulk and generate insights for simple EDA
       1.  Number of labels, sub-documents in each label, etc.

In [None]:
# 5. Label clusters with classifier

In [None]:
# 6. Rewrite selected outputs with [llama 3.2]

In [None]:
#    7. Store each sub-document in correspondent label as a new file
#      1. For test, generate a new folder locally and save the sub-document as .txt file in the folder with the correspondent label


In [None]:
# 8.    8. Add more documents in bulk and generate insights for simple EDA
