<a href="https://colab.research.google.com/github/clingala/Mini-RAG-Chatbot/blob/main/mini_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Quietly (-q) install the notebook's base dependencies; no versions are pinned in this command.
!pip install -q langchain chromadb pypdf sentence-transformers openai tiktoken



In [None]:
# Install langchain and chromadb at exact versions; pypdf and
# sentence-transformers are left unpinned in this command.
!pip install -q langchain==0.1.16 chromadb==0.4.24 pypdf sentence-transformers


In [None]:
# Remove the OpenTelemetry SDK, proto and OTLP-HTTP exporter packages (-y skips the confirmation prompt).
# NOTE(review): presumably a workaround for a dependency conflict — confirm it is still required.
!pip uninstall -y opentelemetry-sdk opentelemetry-proto opentelemetry-exporter-otlp-proto-http


In [None]:
import warnings
# Install a blanket "ignore" filter so Python warnings are suppressed from here on.
warnings.filterwarnings('ignore')
# Replace numpy and packaging: uninstall whatever is present, then install the exact pins.
# NOTE(review): if numpy was already imported in this kernel, a runtime restart is
# probably needed before the pinned version takes effect — confirm.
!pip uninstall -y numpy packaging
!pip install numpy==1.26.4 packaging==23.2

In [None]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Embedding function built on the "all-MiniLM-L6-v2" model.
embeddings = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Create the Chroma client with its default settings (in-memory by default);
# it is kept in `chroma_client` so a vector store can be attached to it.
chroma_client = chromadb.Client()

In [None]:
# Expose the "my_documents" collection of `chroma_client` as a LangChain vector
# store that embeds text with `embeddings`. If a collection with that name
# already exists, it will be loaded rather than created.
vectorstore = Chroma(
    client=chroma_client,
    collection_name="my_documents",
    embedding_function=embeddings,
)

In [None]:
# Remove the OpenTelemetry SDK, proto and OTLP-HTTP exporter packages again (-y skips the prompt).
# NOTE(review): this repeats an identical uninstall issued earlier in the notebook — confirm both are needed.
!pip uninstall -y opentelemetry-sdk opentelemetry-proto opentelemetry-exporter-otlp-proto-http

In [None]:
# Quietly install the split LangChain packages plus the vector store, PDF and
# embedding dependencies. Nothing is pinned in this command.
!pip install -q \
langchain-community \
langchain-core \
chromadb \
pypdf \
sentence-transformers


In [None]:
# Smoke test: each import raises immediately if the corresponding package is
# missing or broken, so the message below prints only when all three load.
import langchain
import chromadb
import sentence_transformers

print("Environment ready.")


In [None]:
# Colab-specific module; this cell only works inside Google Colab.
from google.colab import files

# Ask the user to upload a file. The result is later read through
# `uploaded.keys()`, so it is presumably keyed by filename — confirm.
uploaded = files.upload()


In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fail with a clear message when the upload step produced nothing (e.g. the
# dialog was cancelled) instead of an opaque IndexError on `[0]`.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell and choose a PDF first.")

# Use the first uploaded file; any additional uploads are ignored.
pdf_path = next(iter(uploaded))

# Read the PDF into LangChain documents.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split into chunks of at most 800 characters, with 150 characters of overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)

chunks = text_splitter.split_documents(documents)

print("Total chunks:", len(chunks))


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorise text: all-MiniLM-L6-v2 from sentence-transformers.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



In [None]:
from langchain.vectorstores import Chroma

# Build a Chroma vector store from `chunks`, embedding them with `embedding_model`.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model)


In [None]:
query = "What is the main contribution of this paper?"

# Retrieve the three chunks most similar to the query.
docs = vectorstore.similarity_search(query, k=3)

# Print a preview (first 500 characters) of each hit, numbered from 1.
for rank, doc in enumerate(docs, start=1):
    print(f"\n--- Chunk {rank} ---\n")
    print(doc.page_content[:500])


In [None]:
import os
from getpass import getpass

# Prompt for the key without echoing it and store it in this process's
# environment under OPENAI_API_KEY, so the secret never appears in the notebook.
os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API Key: ")


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# temperature=0 keeps the model's answers as repeatable as possible.
llm = ChatOpenAI(temperature=0)

# Question-answering chain that retrieves context from the vector store
# and passes it to the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

# `Chain.run` is deprecated; `invoke` takes the input under the "query" key
# and returns a dict whose "result" entry holds the answer text.
response = qa_chain.invoke({"query": query})["result"]
print(response)


In [None]:
# Quietly install the LangChain integration package for Groq (unpinned).
!pip install -q langchain-groq

In [None]:
# Remove the LangChain, LangSmith and LangGraph packages (-y skips the prompt)
# so they can be reinstalled at chosen versions.
!pip uninstall -y langchain langchain-core langchain-community langsmith langgraph langgraph-prebuilt langgraph-checkpoint


In [None]:
# Reinstall the LangChain packages at exact 0.2.x versions. langchain-groq,
# chromadb, pypdf and sentence-transformers are left unpinned in this command.
!pip install -q \
langchain==0.2.14 \
langchain-community==0.2.12 \
langchain-core==0.2.43 \
langchain-groq \
chromadb \
pypdf \
sentence-transformers


In [None]:
# Smoke test: both imports must succeed before the confirmation is printed.
import langchain
import langchain_core
print("Stable environment ready")


In [None]:
# Remove every LangChain-related package listed below, including the OpenAI and
# Groq integrations (-y skips the confirmation prompt).
!pip uninstall -y \
langchain \
langchain-core \
langchain-community \
langchain-openai \
langchain-groq \
langsmith \
langgraph \
langgraph-prebuilt \
langgraph-checkpoint


In [None]:
# Install the LangChain packages, langchain-groq and chromadb at exact versions.
# pypdf and sentence-transformers are still unpinned in this command.
!pip install -q \
langchain==0.2.14 \
langchain-core==0.2.43 \
langchain-community==0.2.12 \
langchain-groq==0.1.6 \
chromadb==0.4.24 \
pypdf \
sentence-transformers


In [None]:
# Smoke test: all three imports must succeed before the confirmation is printed.
import langchain
import langchain_core
import langchain_community

# The check mark was stored as mojibake ("âœ…": UTF-8 bytes decoded as
# cp1252); restored to the intended character.
print("LangChain fixed and ready ✅")


In [None]:
# Colab-specific module; this cell only works inside Google Colab.
from google.colab import files
# Ask the user to upload a file. The result is later read through
# `uploaded.keys()`, so it is presumably keyed by filename — confirm.
uploaded = files.upload()


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Fail with a clear message when the upload step produced nothing (e.g. the
# dialog was cancelled) instead of an opaque IndexError on `[0]`.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell and choose a PDF first.")

# Use the first uploaded file; any additional uploads are ignored.
pdf_path = next(iter(uploaded))

# Read the PDF into LangChain documents.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split into chunks of at most 800 characters, with 150 characters of overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)

chunks = text_splitter.split_documents(documents)

print("Total chunks:", len(chunks))


In [None]:
# Sanity-check the splitter output: container type and number of chunks...
print(type(chunks))
print(len(chunks))

# ...then the metadata and the first 300 characters of the first chunk.
first_chunk = chunks[0]
print(first_chunk.metadata)
print(first_chunk.page_content[:300])


In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model: all-MiniLM-L6-v2 from sentence-transformers.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed one throwaway sentence to confirm the model works, and report the vector length.
test_vec = embedding_model.embed_query("test sentence")
print("Embedding size:", len(test_vec))


In [None]:
from langchain_community.vectorstores import Chroma

# Build a Chroma vector store from `chunks`, embedding them with `embedding_model`.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model)

print("Vectorstore created successfully")


In [None]:
query = "What is the main contribution of this paper?"

# Retrieve the three chunks most similar to the query.
docs = vectorstore.similarity_search(query, k=3)
print("Retrieved:", len(docs), "documents")

# Show the top hit's metadata and the first 400 characters of its text.
top_hit = docs[0]
print(top_hit.metadata)
print(top_hit.page_content[:400])


In [None]:
import os
from getpass import getpass

# Prompt for the key without echoing it and store it in this process's
# environment under GROQ_API_KEY, so the secret never appears in the notebook.
os.environ["GROQ_API_KEY"] = getpass("Enter Groq API Key: ")


In [None]:
# ChatGroq has to be imported before it is used; without this import the cell
# raises NameError when the notebook is run top to bottom on a fresh kernel.
from langchain_groq import ChatGroq

# Groq-hosted chat model; temperature=0 keeps answers as repeatable as possible.
# NOTE(review): confirm "mixtral-8x7b-32768" is still served by Groq; this
# notebook later rebinds `llm` to a different model.
llm = ChatGroq(
    model_name="mixtral-8x7b-32768",
    temperature=0
)




In [None]:
import os
from groq import Groq

# Groq client authenticated with the key stored in the environment.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Print the id of every model returned for this account, one per line.
models = client.models.list()
for model_info in models.data:
    print(model_info.id)


In [None]:
from langchain_groq import ChatGroq

# Groq-hosted "llama-3.1-8b-instant" chat model; temperature=0 keeps answers as repeatable as possible.
llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0)


In [None]:
import os
from groq import Groq

# Read the API key from the environment and create the Groq client.
groq_api_key = os.environ["GROQ_API_KEY"]
client = Groq(api_key=groq_api_key)

# Fetch the model listing and print each model id on its own line.
models = client.models.list()
for entry in models.data:
    print(entry.id)


In [None]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain over the vector store, using `llm`;
# the retriever is configured with k=4.
qa_chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    llm=llm,
)

print("QA chain recreated successfully")


In [None]:
# Run the chain on `query`; the answer text is stored under the "result" key.
result = qa_chain.invoke({"query": query})
answer = result["result"]
print(answer)


In [None]:
import os

# Show the entries of the current working directory.
print(os.listdir("."))
