In [4]:
!pip install -U langchain langchain-community chromadb plotly scikit-learn numpy

Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting plotly
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)

In [7]:
import os
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objs as go
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from google.colab import userdata
from langchain_core.documents import Document

In [8]:
# Hugging Face checkpoint used to embed documents and queries.
# Chosen because it is lightweight and fast.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# Wrap the checkpoint in LangChain's embedding interface.
embedding = HuggingFaceEmbeddings(
    model_name=embedding_model,
)

print("Embedding model loaded successfully!")

Embedding model loaded successfully!


In [3]:
# Directory in which the Chroma vector store is persisted.
db_name = "/content/chroma_db"

In [4]:
# If a persisted Chroma store already exists at db_name, open it and drop its
# collection before the store is rebuilt further down.
if os.path.exists(db_name):
    # `embedding` is the HuggingFaceEmbeddings instance created in the embedding
    # cell; it must be the same name that cell defines, otherwise this branch
    # raises NameError whenever the directory is present.
    vectorstore = Chroma(persist_directory=db_name, embedding_function=embedding)
    vectorstore.delete_collection()

In [9]:
# Four toy documents, one per doc_type, built from (doc_type, text) pairs.
# Each becomes a Document whose metadata carries the doc_type label.
chunks = [
    Document(page_content=text, metadata={"doc_type": doc_type})
    for doc_type, text in (
        ("products", "Our products are top-notch."),
        ("employees", "Meet our amazing employees."),
        ("contracts", "The new contract terms are beneficial."),
        ("company", "Our company is growing rapidly."),
    )
]


In [11]:
# Embed the sample documents and store them in a Chroma collection persisted
# under db_name.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding, persist_directory=db_name)

# Report how many entries the underlying collection now holds.
# NOTE: `_collection` is a private attribute of the LangChain Chroma wrapper.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 4 documents


In [12]:
# Grab the underlying Chroma collection (private attribute of the wrapper).
collection = vectorstore._collection

# Fetch a single stored record, asking only for its embedding, and measure the
# length of that vector.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"The vectors have {dimensions} dimensions")

The vectors have 384 dimensions


In [18]:
# The notebook's import cell does not provide `pipeline` or
# `HuggingFacePipeline`, so this cell raised NameError on a fresh kernel.
# They are imported here so the cell is runnable on its own.
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

# Local Hugging Face text-generation pipeline; each call generates at most
# 50 new tokens.
llm_pipeline = pipeline(
    "text-generation",
    model="mosaicml/mpt-7b",
    device_map="auto",
    max_new_tokens=50,
)

# Adapt the transformers pipeline to LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=llm_pipeline)
print("Hugging Face LLM loaded successfully!")

# Retrieval-augmented QA: documents come from the Chroma vector store and the
# "stuff" chain type places the retrieved documents into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)



The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-openai package and should be used instead. To use it run `pip install -U :class:`~langchain-openai` and import as `from :class:`~langchain_openai import ChatOpenAI``.



ValidationError: 1 validation error for ChatOpenAI
  Value error, Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. [type=value_error, input_value={'model_name': 'gpt-3.5-t...ne, 'http_client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/value_error

In [None]:
# Question to put to the retrieval QA chain.
query = "What are the benefits of our new contract?"

# Run the chain on the question and show whatever it returns.
response = qa_chain.invoke(query)
print("Response from QA Chain:", response)