In [1]:
# Retrieval of data -- from vector databases and other sources -- for integration with LLM workflows.
# Retrieval matters for applications that fetch data to be reasoned over as part of model inference,
# as in the case of retrieval-augmented generation (RAG).

In [None]:
# Requires: langchain, langchain-chroma, langchain-groq, langchain-huggingface, python-dotenv

In [21]:
# Load variables from a local .env file into the process environment.
import os
from dotenv import load_dotenv

load_dotenv()

# os.environ only accepts str values, so assigning os.getenv("HF_TOKEN") directly
# raises a TypeError when the token is missing. Only export it when it is set.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
else:
    print("HF_TOKEN is not set; continuing without a Hugging Face token.")
# HF is Hugging Face. The Hugging Face Transformers library is an open-source library
# that provides a vast array of pre-trained models, primarily focused on NLP.
# It is built on PyTorch and TensorFlow, making it versatile and powerful.



In [11]:
# Sample documents used to populate the vector store.
from langchain_core.documents import Document

# (page_content, source) pairs; each pair becomes one Document below, in this order.
_pet_facts = [
    ("Dogs are great companions, known for their loyalty and friendliness.", "mammal-pets-doc"),
    ("Cats are independent pets that often enjoy their own space.", "mammal-pets-doc"),
    ("Goldfish are popular pets for beginners, requiring relatively simple care.", "fish-pets-doc"),
    ("Parrots are intelligent birds capable of mimicking human speech.", "bird-pets-doc"),
    ("Rabbits are social animals that need plenty of space to hop around.", "mammal-pets-doc"),
]

sample_documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in _pet_facts
]

# The metadata attribute can capture information about the source of the document,
# its relationship to other documents, and other information.
# Note that an individual Document object often represents a chunk of a larger document.



In [10]:
# To convert text into vectors we need an embedding model.
from langchain_huggingface import HuggingFaceEmbeddings
text_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# all-MiniLM-L6-v2 is a sentence-transformers model: it maps sentences and paragraphs
# to a 384-dimensional dense vector space.
# The vectors can be used for tasks like clustering or semantic search.



  from tqdm.autonotebook import tqdm, trange







In [12]:
# Embed the sample documents and store the resulting vectors in a Chroma vector store.

from langchain_chroma import Chroma

# Give every document a stable id. Without explicit ids, re-running this cell in the
# same kernel can add the same documents to the default collection again, which then
# shows up as duplicate hits in later searches. With fixed ids a re-run overwrites
# the existing entries instead.
document_ids = [f"sample-doc-{i}" for i in range(len(sample_documents))]

vectorstore = Chroma.from_documents(
    documents=sample_documents,
    embedding=text_embeddings,
    ids=document_ids,
)

# Once the documents are embedded and stored, the store can be queried, for example:
vectorstore.similarity_search_with_score("cat")  # the score is a distance: lower is better

[(Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
  0.9351056814193726),
 (Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
  1.574089765548706),
 (Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
  1.595690131187439),
 (Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.'),
  1.66579270362854)]

In [17]:
# Embed the query ourselves, then search the store with the raw vector.
# Reuse text_embeddings (the embedder the store was built with) rather than
# constructing a second HuggingFaceEmbeddings instance, which loads the model again.
dog_vector = text_embeddings.embed_query("dogs")
print(len(dog_vector))
# The query is converted to a 384-dimensional vector; the store can be searched by vector as well.
print(vectorstore.similarity_search_by_vector(dog_vector))
vectorstore.similarity_search_by_vector_with_relevance_scores(dog_vector)



384
[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'), Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'), Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'), Document(metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care.')]


[(Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
  1.043582797050476),
 (Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
  1.3967427015304565),
 (Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
  1.5169434547424316),
 (Document(metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care.'),
  1.5877840518951416)]

### Retrievers
#### A LangChain vector store is not a subclass of LangChain's Runnable, so it cannot be used directly in an LCEL chain (LLM chain).
#### We convert the vector store into a retriever so that it can be integrated with a chain.


In [20]:
# Wrap the vector store in a retriever.
# search_type can be "similarity" or "mmr" (maximal marginal relevance);
# search_kwargs={"k": 2} keeps only the top 2 matches from the database per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Run several queries in one call; the result holds one list of documents per query.
retriever.batch(["dogs", "cat"])

[[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
  Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')],
 [Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
  Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.')]]

#### Integrating with llm model

In [22]:

# Use Groq as the LLM provider.
from langchain_groq import ChatGroq

# The API key is read from the environment (GROQ_API_KEY).
groq_api_key = os.getenv("GROQ_API_KEY")
llm = ChatGroq(
    model="llama3-8b-8192",
    groq_api_key=groq_api_key,
)


In [27]:
# Basic RAG (Retrieval-Augmented Generation)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# RunnablePassthrough passes its input through unchanged (or with additional keys).

# Prompt template: {question} is the user's question, {context} is the retrieved material.
message = """
Answer this question using the provided context only.
{question}
Context:
{context}
"""
prompt = ChatPromptTemplate.from_messages([("human",message)])

chain = {"context":retriever,"question":RunnablePassthrough()} | prompt | llm
# Here the {context} in the message is supplied by the retriever and the {question} by the user;
# both are passed to the prompt, and the filled-in prompt is then passed to the LLM.

In [33]:
# The prompt tells the LLM to answer from the provided context only, and that context
# comes from the retriever, which is backed by the vector store built from the sample documents.
response = chain.invoke("tell me about cats")
print(response.content)
# Owls are not covered by the sample documents.
response_1 = chain.invoke("tell me about owls")
print(response_1.content)

According to the provided context, cats are independent pets that often enjoy their own space.
I apologize, but there is no information about owls in the provided context. The context only mentions parrots and cats, with no mention of owls.
