Let's tinker with collections a bit:
1. Build a collection.
2. Add docs (let's try to add these without IDs).
3. Try to retrieve docs.


In [48]:
from chromadb import PersistentClient
# Open the persistent Chroma store and print whatever collections it reports.
DB_PATH = "./.chroma_db"
db = PersistentClient(path=DB_PATH)
existing_collections = db.list_collections()
print(existing_collections)

['Protocols']


In [21]:
#fetch filenames
from docx import Document
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from chromadb import PersistentClient


DB_PATH = "./.chroma_db"

# Chunks are measured with len(), i.e. in characters: up to 1000 per chunk,
# with 200 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

db = PersistentClient(path=DB_PATH)

# Start from a clean 'Protocols' collection on every run.
# Depending on the chromadb release, list_collections() yields either plain
# names or Collection objects; comparing a string against Collection objects
# is always False, so normalise every entry to its name first.
existing_collection_names = {
    getattr(collection, "name", collection)
    for collection in db.list_collections()
}
if 'Protocols' in existing_collection_names:
    db.delete_collection(name='Protocols')

protocol_collection = db.get_or_create_collection(name='Protocols')


def _read_protocol_text(file_path):
    """Return the text content of ``file_path``, or ``None`` if it cannot be read.

    ``.docx`` files (any letter case) are read with python-docx and their
    paragraphs joined with newlines; everything else is read as UTF-8 text.
    Failures are reported on stdout rather than raised.
    """
    try:
        if file_path.lower().endswith(".docx"):
            document = Document(file_path)
            return "\n".join(paragraph.text for paragraph in document.paragraphs)
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error opening {file_path}: {e}")
        return None


def open_text_files(directory, collection=None, splitter=None):
    """Chunk every readable file under ``directory`` and add it to a collection.

    Args:
        directory: Root folder; it is walked recursively.
        collection: Object with an ``add(documents=, ids=, metadatas=)``
            method. Defaults to the module-level ``protocol_collection``.
        splitter: Object with a ``split_text(text)`` method. Defaults to the
            module-level ``text_splitter``.

    Returns:
        None. Each chunk is stored with metadata ``{'path', 'chunk'}``.

    Notes:
        * A file that cannot be read is skipped. Previously the loop carried
          on with ``content`` either unbound or still holding the previous
          file's text, which was then indexed under the wrong path.
        * Files that produce no chunks are skipped instead of calling
          ``add`` with empty lists.
        * Chunk ids are built from the path relative to ``directory``
          (extension included) so that same-named files in different folders,
          or with different extensions, no longer share ids.
    """
    if collection is None:
        collection = protocol_collection
    if splitter is None:
        splitter = text_splitter

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)

            content = _read_protocol_text(file_path)
            if content is None:
                continue

            chunks = splitter.split_text(content)
            if not chunks:
                continue

            id_prefix = os.path.relpath(file_path, directory)
            collection.add(
                documents=list(chunks),
                ids=[f"{id_prefix}_{i}" for i in range(len(chunks))],
                metadatas=[{'path': file_path, 'chunk': i} for i in range(len(chunks))],
            )
                              
           
                

# Walk protocols/ recursively and add each file's chunks to protocol_collection.
open_text_files("protocols/")


            

In [46]:
# Query the collection with a natural-language request, asking for up to ten results.
query_text = "fetch me the western blot protocol"
results = protocol_collection.query(query_texts=[query_text], n_results=10)


In [47]:
from collections import Counter

def most_frequent(List):
    """Return the item that occurs most often in ``List``.

    Args:
        List: Iterable of hashable items. The parameter name is kept for
            backward compatibility.

    Returns:
        The most common item, or ``None`` when ``List`` is empty (previously
        an empty input raised ``IndexError``). Ties go to the item that was
        encountered first, per ``Counter.most_common``.
    """
    occurrence_count = Counter(List)
    if not occurrence_count:
        return None
    return occurrence_count.most_common(1)[0][0]
  
# Collect the source path of each returned chunk, then report the path seen most often.
matched_paths = [metadata['path'] for metadata in results['metadatas'][0]]
print(most_frequent(matched_paths))

protocols/archive\western-blot-protocol.md


In [43]:
# Source path of every chunk returned for the first query, in result order.
[metadata['path'] for metadata in results['metadatas'][0]]

['protocols/archive\\sub\\Protocol 1.docx',
 'protocols/dna-extraction-protocol.md',
 'protocols/southern-blot-protocol.md',
 'protocols/southern-blot-protocol.md',
 'protocols/dna-extraction-protocol.md',
 'protocols/dna-extraction-protocol.md',
 'protocols/southern-blot-protocol.md',
 'protocols/protein-isolation-protocol.md',
 'protocols/protein-isolation-protocol.md',
 'protocols/northern-blot-protocol.md']

In [None]:
from chromadb import PersistentClient, EmbeddingFunction, Embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List

# Model identifier handed to CustomEmbeddingClass (and from there to HuggingFaceEmbedding) below.
MODEL_NAME = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
# Storage path given to PersistentClient below.
DB_PATH = "./.chroma_db"

class Protocol:
    """Simple record pairing a ``path`` string with a ``text`` string."""

    def __init__(self, path: str, text: str):
        # Plain attribute storage; no validation or I/O happens here.
        self.text = text
        self.path = path

class CustomEmbeddingClass(EmbeddingFunction):
    """Chroma embedding function backed by a ``HuggingFaceEmbedding`` model."""

    def __init__(self, model_name):
        """Load the embedding model.

        Args:
            model_name: Model identifier passed straight to
                ``HuggingFaceEmbedding``.
        """
        self.embedding_model = HuggingFaceEmbedding(model_name)

    def __call__(self, input: List[str]) -> Embeddings:
        """Embed every text in ``input`` and return one vector per text.

        The parameter has to be called ``input``: Chroma checks that an
        embedding function's ``__call__`` signature is ``(self, input)`` and
        invokes it with ``input=...``, so any other parameter name is
        rejected as soon as the function is attached to a collection.

        Args:
            input: Texts to embed.

        Returns:
            A list with the result of ``get_text_embedding`` for each text,
            in input order.
        """
        return [self.embedding_model.get_text_embedding(text) for text in input]
    




# Open the persistent Chroma store located at DB_PATH.
db = PersistentClient(path=DB_PATH)

# Instantiating the class constructs the HuggingFaceEmbedding model for MODEL_NAME.
custom_embedding_function = CustomEmbeddingClass(MODEL_NAME)

# Disabled for now: would create or open an 'SOPs' collection that uses the custom embedding function.
#collection = db.get_or_create_collection(name='SOPs', embedding_function=custom_embedding_function)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:   0%|                                                                 | 0/2 [00:00<?, ?it/s]

In [None]:
import os
import json
from pathlib import Path

def documents_to_json(folder_path):
    """
    Collect every ``.txt`` / ``.md`` file below a folder into a dictionary and
    dump that dictionary to ``documents.json`` inside the same folder.

    Args:
        folder_path (str): Folder to scan (subfolders included)

    Returns:
        dict: Mapping of path-relative-to-folder -> file content, or None when
        the folder could not be processed. Files that fail to read are
        reported on stdout and left out of the mapping.
    """
    root_dir = Path(folder_path)
    allowed_suffixes = {'.txt', '.md'}
    collected = {}

    try:
        for candidate in root_dir.rglob('*'):
            # Guard clauses: only regular files with a recognised extension.
            if not candidate.is_file():
                continue
            if candidate.suffix.lower() not in allowed_suffixes:
                continue

            try:
                key = str(candidate.relative_to(root_dir))
                collected[key] = candidate.read_text(encoding='utf-8')
            except Exception as e:
                print(f"Error reading file {candidate}: {str(e)}")

        # Serialise once everything is gathered and write it next to the sources.
        (root_dir / 'documents.json').write_text(
            json.dumps(collected, indent=2),
            encoding='utf-8',
        )
        return collected

    except Exception as e:
        print(f"Error processing folder: {str(e)}")
        return None

# Example usage
if __name__ == "__main__":
    # Replace with your folder path
    folder_path = "./documents"
    result = documents_to_json(folder_path)

    # documents_to_json returns None only on failure; an empty dict means the
    # folder was processed but held no matching files, which is still a
    # success (a plain truthiness test would misreport it as a failure).
    if result is not None:
        print(f"Successfully processed {len(result)} documents")
        print(f"Output saved to {folder_path}/documents.json")
    else:
        print("Failed to process documents")