In [1]:
#!pip install langchain 
#!pip install langchain-community
#!pip install langchain-chroma langchain-openai
#!pip install langchain-cohere
#!pip install unstructured
#!pip install python-magic-bin==0.4.14

In [1]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings




In [2]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment, then read the Cohere key from it.
load_dotenv()
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# Fail fast with a clear message. The key is not passed explicitly to the chat
# model below, so a missing value would otherwise only surface later.
# NOTE(review): presumably ChatCohere picks the key up from the environment — confirm.
if not COHERE_API_KEY:
    raise RuntimeError(
        "COHERE_API_KEY is not set. Add it to your environment or to a .env file "
        "before running this notebook."
    )

In [3]:
from langchain_cohere import ChatCohere
# Chat model that generates the final answer at the end of the RAG chain.
llm = ChatCohere(
    model="command-r",
)

In [4]:
from langchain_community.document_loaders import DirectoryLoader
# Build a loader over the sample data folder; show_progress=True displays a
# progress bar while files are being read.
loader = DirectoryLoader(
    'sample_data_files',
    show_progress=True,
)
documents = loader.load()

# Show how many documents were loaded (last expression is the cell output).
len(documents)

100%|██████████| 1/1 [00:08<00:00,  8.65s/it]


1

#### Text Splitter

`RecursiveCharacterTextSplitter` is a LangChain text splitter that splits text recursively. It tries a list of separators in order (by default paragraph breaks, then line breaks, then spaces, and finally individual characters), moving on to the next separator only while the resulting chunks are still larger than the chunk size.

Chunk size is the maximum number of characters that a chunk can contain.
Chunk overlap is the number of characters that should overlap between two adjacent chunks.

The chunk size and chunk overlap parameters control the granularity of the text splitting. A smaller chunk size results in more, shorter chunks, while a larger chunk size results in fewer, longer chunks. A larger chunk overlap means adjacent chunks share more characters (and slightly more chunks are produced), while a smaller chunk overlap means adjacent chunks share fewer characters.

In [5]:
# Splitter settings: chunks of at most 512 characters, with 128 characters
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
)

# Break the loaded documents into chunks for embedding.
splits = text_splitter.split_documents(documents)

In [6]:
# Identifier of the sentence-transformers model used for embeddings.
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# NOTE(review): the recorded output of this cell looks like a warning pointing
# at this constructor call — presumably a deprecation notice for the
# langchain_community import; confirm and migrate if so.
embeddings = HuggingFaceEmbeddings(
    model_name=embeddings_model_name,
)

  embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
  from tqdm.autonotebook import tqdm, trange


In [7]:
# Embed every chunk and index it in a Chroma vector store.
# NOTE(review): no collection name or persist directory is given; re-running
# this cell in the same kernel may add the same chunks again — confirm.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)

In [8]:
# Expose the vector store as a retriever; no arguments are passed, so whatever
# search settings the store uses by default apply.
retriever = vectorstore.as_retriever()
# Pull the "rlm/rag-prompt" prompt from the LangChain hub.
# NOTE(review): presumably fetched over the network on every run — confirm.
prompt = hub.pull("rlm/rag-prompt")

In [9]:
def format_docs(docs):
    """Merge documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every item, in order, with one blank line
        between consecutive items. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

In [10]:
# Assemble the RAG pipeline:
#   1. build the prompt inputs — "context" is the retriever output run through
#      format_docs, "question" comes from RunnablePassthrough();
#   2. fill the prompt;
#   3. call the chat model;
#   4. parse the model output with StrOutputParser.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [12]:
# Run the whole chain on a sample question; the returned value is displayed as
# the cell output.
rag_chain.invoke("What is the data talking about?")

"The question is about the population of King's Landing which, according to Tyrion, is around one million people. This number is compared to the population of the entire North, which is significantly less. The scene then shifts the focus to interactions between several key characters in Game of Thrones, primarily Jon, Dany, and Tyrion, as they discuss the city and its inhabitants."