In [31]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
import gradio as gr
import chromadb
import os
import cachetools
from typing import Iterator
import time
import logging
from collections import Counter


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
from langchain_core.embeddings import FakeEmbeddings
from langchain_community.document_loaders import DirectoryLoader
import os 
embeddings = FakeEmbeddings(size=4096)

In [25]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [26]:
import chromadb

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embeddings,
)

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3


In [27]:
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
client = chromadb.HttpClient(
    host="localhost",
    port=8000,
    ssl=False,
    headers=None,
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)
client.get_or_create_collection("collection_name")

vector_store = Chroma(
    client=client,
    collection_name="collection_name",
    embedding_function=embeddings,
    # persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [32]:

data_dir = "data"
persist_directory = './my_chroma_data'
# Function to list all files in the data directory
def get_all_file_paths(directory, file_extension=".pdf"):
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(file_extension):
                file_paths.append(os.path.join(root, file))
    return file_paths

# Function to check if the documents are already loaded in Chroma
def documents_already_loaded(vector, file_paths):
    existing_metadata = vector.get()["metadatas"]
    existing_files = {metadata['source'].replace("/", "\\") for metadata in existing_metadata if 'source' in metadata}
    return all(file.replace("/", "\\") in existing_files for file in file_paths)

# Get the list of files in the data directory
files_in_data_dir = get_all_file_paths(data_dir)

# Check if Chroma has already loaded all the files
if os.path.exists(persist_directory):
    # Load existing Chroma vector store
    print("Loading existing Chroma vector store...")
    vector = Chroma(client=client,persist_directory=persist_directory, embedding_function=HuggingFaceEmbeddings())

    # Check if all files are already loaded in Chroma
    if documents_already_loaded(vector, files_in_data_dir):
        print("All files are already loaded in Chroma. Skipping loading process.")
    else:
        print("Not all files are loaded in Chroma. Loading remaining files.")
         # Get list of files that are not yet loaded
        existing_metadata = vector.get()["metadatas"]
        existing_files = {metadata['source'].replace("/", "\\") for metadata in existing_metadata if 'source' in metadata}
        new_files = [file for file in files_in_data_dir if file.replace("/", "\\") not in existing_files]
        
        # Load and process new files
        loader = DirectoryLoader(data_dir, glob="**/*.pdf", use_multithreading=True)
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        documents = text_splitter.split_documents(docs)
        vector.add_documents(documents)
else:
    # Load all files from the data directory
    print("No existing vector store found. Processing documents...")
    loader = DirectoryLoader(data_dir, glob="**/*.pdf", use_multithreading=True)
    docs = loader.load()
    # text_splitter = SemanticChunker(HuggingFaceEmbeddings())
    # Use a simpler text splitter for better performance
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.split_documents(docs)
    
    # Instantiate the embedding model
    embeddings = HuggingFaceEmbeddings()
    
    # Create the Chroma vector store
    vector = Chroma.from_documents(client=client,documents=documents, embedding=embeddings, persist_directory=persist_directory)

No existing vector store found. Processing documents...




In [28]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)


TypeError: Type is not JSON serializable: numpy.float64

In [13]:
updated_document_1 = Document(
    page_content="I had chocolate chip pancakes and fried eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

updated_document_2 = Document(
    page_content="The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.",
    metadata={"source": "news"},
    id=2,
)

vector_store.update_document(document_id=uuids[0], document=updated_document_1)
# You can also update multiple documents at once
vector_store.update_documents(
    ids=uuids[:2], documents=[updated_document_1, updated_document_2]
)

In [14]:
vector_store.delete(ids=uuids[-1])

In [15]:
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* Wow! That was an amazing movie. I can't wait to see it again. [{'source': 'tweet'}]
