In [1]:
!pip -q install chromadb langchain openai tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

## Load and split documents

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
!unzip -q sample_data/articles.zip -d sample_data/import

In [6]:
loader = DirectoryLoader("./sample_data/import/articles", glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()


In [7]:
len(documents) # we only have 10 docuemnts

10

In [8]:
documents[3].metadata

{'source': 'sample_data/import/articles/bad_speeling.txt'}

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

In [10]:
len(chunks)

155

## Use chromadb collections

In [11]:
chunks[0].page_content

'OpenAI may be synonymous with machine learning now and Google is doing its best to pick itself up off the floor, but both may soon face a new threat: rapidly multiplying open source projects that push the state of the art and leave the deep-pocketed but unwieldy corporations in their dust. This Zerg-like threat may not be an existential one, but it will certainly keep the dominant players on the defensive.'

In [13]:
import chromadb

In [14]:
client = chromadb.Client() # default in memory
collection = client.create_collection('articles')

In [15]:
collection.add(
    ids=[str(i) for i in range(0, len(chunks))],
    documents=[c.page_content for c in chunks],
    metadatas=[c.metadata for c in chunks],
) # using chroma build in embedding all-MiniLM-L6-v2

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:06<00:00, 12.1MiB/s]


In [16]:
query = 'what is Cherry'
collection.query(
    query_texts=query,
    n_results=2)

{'ids': [['20', '15']],
 'distances': [[0.6593900918960571, 0.8436243534088135]],
 'metadatas': [[{'source': 'sample_data/import/articles/cherry.txt'},
   {'source': 'sample_data/import/articles/cherry.txt'}]],
 'embeddings': None,
 'documents': [['In terms of the future, Cherry is focused on expanding its user base, integrating with online retailers and introducing personalized shopping recommendations in order to “cherry-pick” the best products for its users. The company is also exploring opportunities to partner with brands for exclusive deals and promotions.\n\nCherry is available on iOS and Android.',
   'Meet Cherry, an AI shopping assistant that helps you discover products using screenshots or images\nAisha Malik@aiishamalik1 / 2:18 PM GMT+1•March 21, 2024\n Comment\nCherry app displayed on smartphone screens\nImage Credits: Cherry\nA new app from a startup called Cherry is aiming to transform the online shopping experience with its AI assistant that allows users to discover pro

## Use different embedding function


In [17]:
from langchain.embeddings import OpenAIEmbeddings

In [18]:
embedding_vectors = OpenAIEmbeddings(model="text-embedding-3-large").embed_documents([c.page_content for c in chunks])

  warn_deprecated(


In [50]:
#client.delete_collection('articles_with_embeddings')

In [19]:
collection_embed = client.create_collection('articles_with_embeddings')

In [20]:
collection_embed.add(
    embeddings=embedding_vectors,
    ids=[str(i) for i in range(0, len(chunks))],
    documents=[c.page_content for c in chunks],
    metadatas=[c.metadata for c in chunks],
)

In [21]:
query = 'what is Cherry'
collection.query(
    query_texts=query,
    n_results=2)

{'ids': [['20', '15']],
 'distances': [[0.6593900918960571, 0.8436243534088135]],
 'metadatas': [[{'source': 'sample_data/import/articles/cherry.txt'},
   {'source': 'sample_data/import/articles/cherry.txt'}]],
 'embeddings': None,
 'documents': [['In terms of the future, Cherry is focused on expanding its user base, integrating with online retailers and introducing personalized shopping recommendations in order to “cherry-pick” the best products for its users. The company is also exploring opportunities to partner with brands for exclusive deals and promotions.\n\nCherry is available on iOS and Android.',
   'Meet Cherry, an AI shopping assistant that helps you discover products using screenshots or images\nAisha Malik@aiishamalik1 / 2:18 PM GMT+1•March 21, 2024\n Comment\nCherry app displayed on smartphone screens\nImage Credits: Cherry\nA new app from a startup called Cherry is aiming to transform the online shopping experience with its AI assistant that allows users to discover pro

## with langchain Chroma

In [22]:
from langchain.vectorstores import Chroma
# langchain offers different vector dbs https://python.langchain.com/docs/modules/data_connection/vectorstores/

In [23]:
embedding_function = OpenAIEmbeddings() # use default

In [24]:
vector_db = Chroma.from_documents(
    documents = chunks,
    embedding = embedding_function, # embedding function
    persist_directory = 'storage' # saves as sqlite3 into folder storage
)

In [25]:
vector_db.similarity_search(query='What is a Cherry', k=2)

[Document(page_content='Meet Cherry, an AI shopping assistant that helps you discover products using screenshots or images\nAisha Malik@aiishamalik1 / 2:18 PM GMT+1•March 21, 2024\n Comment\nCherry app displayed on smartphone screens\nImage Credits: Cherry\nA new app from a startup called Cherry is aiming to transform the online shopping experience with its AI assistant that allows users to discover products across the internet using just a screenshot or image. Cherry helps you find products that you’ve come across while scrolling through social media or have seen in real life.', metadata={'source': 'sample_data/import/articles/cherry.txt'}),
 Document(page_content='In terms of the future, Cherry is focused on expanding its user base, integrating with online retailers and introducing personalized shopping recommendations in order to “cherry-pick” the best products for its users. The company is also exploring opportunities to partner with brands for exclusive deals and promotions.\n\nCh

## Vector storage as retriever and chain with GPT

use the vector db as document retriever and combine with LLM

In [None]:
!pip install -U langchain langchainhub
# pip install -U langchain langchain-community
from langchain_community.chat_models import ChatOpenAI

In [54]:
retriever = vector_db.as_retriever()

In [81]:
llm = ChatOpenAI(temperature=0.1,
                 model_name="gpt-3.5-turbo",
                 api_key=userdata.get('openai_api_key'), # default is gpt-3.5 currently
                )

                    search_kwargs was transferred to model_kwargs.
                    Please confirm that search_kwargs is what you intended.


In [82]:
from langchain import hub
from langchain_community.chat_models import ChatOpenAI

In [66]:
from langchain.chains import create_retrieval_chain
#https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html#langchain.chains.retrieval.create_retrieval_chain


In [58]:
from langchain.chains.combine_documents import create_stuff_documents_chain
# https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html#langchain.chains.combine_documents.stuff.create_stuff_documents_chain
# This chain takes a list of documents and formats them all into a prompt,
# then passes that prompt to an LLM. It passes ALL documents, so you should make sure it fits within the context window the LLM you are using.

In [69]:
# with the help of llm creates natural language on inforamtion basis of the feeded documents (= context)
prompt = retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
llm_promt_chain = create_stuff_documents_chain(
    llm, prompt
)
vectordb_llm_chain = create_retrieval_chain(retriever, llm_promt_chain)

result = vectordb_llm_chain.invoke({"input": "What is a Cherry"})

In [87]:
result.get('answer')



'Cherry is an AI shopping assistant app developed by a startup that helps users discover products across the internet using screenshots or images. It allows users to find products they have seen on social media or in real life, sort results by price, bookmark products, and look at their image search history. Cherry is available on both iOS and Android platforms.'