In [1]:
%pip install -qU "langchain-chroma>=0.1.2"

Note: you may need to restart the kernel to use updated packages.


https://python.langchain.com/docs/integrations/vectorstores/chroma/

In [2]:
from utils import tokenize
from nltk.stem import PorterStemmer
import joblib
import constants
import numpy as np
from typing import *
from langchain_chroma import Chroma
import gensim.downloader as api
from tqdm import tqdm



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuliagoryachev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yuliagoryachev/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
#simple embeddings function
model = api.load("word2vec-google-news-300")
punctuation_map = constants.punctuation_map
model_size = 300

In [4]:
class Embeddings:
    def __init__(self):
        self.punctuation_map = constants.punctuation_map
        self.model_size = 300

    def embed_documents(self, texts: List[str]) -> list:
        print('texts: ', texts)
        res = []
        for text in tqdm(texts):
            tokenized = tokenize(text, punctuation_map=punctuation_map, stemmer=PorterStemmer(), junk_punctuations=True)
            res.append(self.average_pooling(tokenized))
        return res
    
    def embed_query(self, query: str) -> list:
        tokenized = tokenize(query, punctuation_map=punctuation_map, stemmer=PorterStemmer(), junk_punctuations=True)
        return self.average_pooling(tokenized)

    def average_pooling(self, tokens: List[str]):
        embed = np.zeros((1, self.model_size))
        size = len(tokens)
        for t in tokens:
            if t in model:
                embed+=model[t]
        res = embed/max(size, 1)
        return res.tolist()[0]

In [5]:
emb = Embeddings()

In [6]:
# emb.embed_documents(["hello world"])

In [7]:

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=emb,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [8]:
from uuid import uuid4
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "search"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "search"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "search"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "search"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "search"},
    id=5,
)


documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

texts:  ['I had chocolate chip pancakes and scrambled eggs for breakfast this morning.', 'The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.', 'Building an exciting new project with LangChain - come check it out!', 'Robbers broke into the city bank and stole $1 million in cash.', "Wow! That was an amazing movie. I can't wait to see it again."]


100%|██████████| 5/5 [00:00<00:00, 4314.24it/s]


['d1b763a6-d6ca-4978-8920-d7a2dd339b9c',
 'eed9cb66-022e-47f8-817c-9cc19b685855',
 '62679535-6e32-4046-a4a9-35d1f2d763c7',
 'e9d67e6d-57fa-442d-a81d-e53eae6d9edc',
 'e318d458-0a72-45c7-819b-48a385a2ba4b']

In [9]:
results = vector_store.similarity_search(
    "What did I have for breakfast this morning?",
    k=2,
    filter={"source": "search"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")



* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'search'}]
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'search'}]
