In [None]:
!pip uninstall numpy -y
!pip install numpy==1.26.4
!pip install -q langchain==0.1.16 langchain-openai==0.0.8 langchain-community==0.0.32 langchain_experimental sentence-transformers

In [1]:
import pandas as pd
import os
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

In [2]:
# Location of the sales report to index. Defaults to the Colab upload path;
# set the SALES_REPORT_CSV environment variable to point at a different file
# without editing the notebook.
csv_file_path = os.environ.get("SALES_REPORT_CSV", '/content/sales_report.csv')

In [3]:
import os
from getpass import getpass

from langchain_openai import ChatOpenAI

# SECURITY: an OpenRouter API key used to be hardcoded in this cell. Any key
# that has been committed or shared in a notebook must be treated as leaked:
# revoke it in the OpenRouter dashboard and issue a new one.
# The key is now taken from the OPENROUTER_API_KEY environment variable, with
# an interactive (non-echoing) prompt as a fallback, so it never lands in the
# saved notebook.
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY") or getpass("OpenRouter API key: ")

# Chat model served through OpenRouter's OpenAI-compatible endpoint.
llm = ChatOpenAI(
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=openrouter_api_key,
    model="google/gemma-3-27b-it:free",
    temperature=0.5,
    # max_tokens=100
)

In [5]:
# --- 3. Load and Process the CSV Data ---
print("\nLoading CSV data...")
# Read the CSV at `csv_file_path` into LangChain documents.
loader = CSVLoader(file_path=csv_file_path)
docs = loader.load()


print("\nSplitting documents into semantic chunks...")
# The semantic chunker is driven by a locally-run HuggingFace embedding model
# (no model name is passed, so the library default is used).
chunking_embeddings = HuggingFaceEmbeddings()
text_splitter = SemanticChunker(chunking_embeddings)
documents = text_splitter.split_documents(docs)
chunk_count = len(documents)
print(f"Created {chunk_count} chunks.")


Loading CSV data...

Splitting documents into semantic chunks...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Created 7 chunks.


In [7]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [8]:
# --- 4. Create Vector Store ---
print("\nCreating vector store with embeddings...")
# Embedding model used to index the chunks (library default model).
embedder = HuggingFaceEmbeddings()

# Build the FAISS index from the chunks, then wrap it in a retriever that
# returns the 3 most similar chunks for each query.
vector = FAISS.from_documents(documents=documents, embedding=embedder)
retriever_search_kwargs = {"k": 3}
retriever = vector.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)
print("Vector store created successfully.")


Creating vector store with embeddings...
Vector store created successfully.


In [9]:
# --- 5. Define the RAG Chain ---
print("\nSetting up the RAG chain...")
# Instructions that frame the model as a sales analyst answering strictly from
# the retrieved context; {context} and {question} are filled in per query.
prompt_template = """
1. You are a helpful sales analyst.
2. Use the following pieces of context from the sales report to answer the question at the end.
3. If you don't know the answer from the context, just say that you don't know. Don't try to make up an answer.
4. Provide a concise answer based only on the provided sales data.

Context:
{context}

Question: {question}

Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt_template)

# How each retrieved chunk is rendered before it is placed into {context}.
document_prompt = PromptTemplate(
    template="Row:\n{page_content}",
    input_variables=["page_content"],
)

# Step 1: prompt + model. Flip verbose to True for more detailed logs.
llm_chain = LLMChain(prompt=QA_CHAIN_PROMPT, llm=llm, verbose=False)

# Step 2: format the retrieved chunks with `document_prompt` and pass them to
# the LLM chain under the "context" variable.
combine_documents_chain = StuffDocumentsChain(
    document_prompt=document_prompt,
    document_variable_name="context",
    llm_chain=llm_chain,
)

# Step 3: retrieval + answering; source documents are returned with the answer.
qa_chain = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=combine_documents_chain,
    return_source_documents=True,
    verbose=False,
)
print("RAG chain is ready.")


Setting up the RAG chain...
RAG chain is ready.


In [None]:
# --- 6. Query the Sales Data ---
# NOTE(review): the retriever is configured with k=3, so an aggregate question
# like this one is answered from at most three retrieved chunks. Verify that
# those chunks cover every relevant row before trusting a "total".

query1 = "What was the total sale amount for the North region?"
print(f"\nQuestion: {query1}")
# Use `.invoke(...)`: calling the chain object directly (`qa_chain({...})`) is
# deprecated in LangChain 0.1.x and emits a deprecation warning.
result1 = qa_chain.invoke({"query": query1})
print("Answer:", result1["result"].strip())



In [None]:
# Second example question, run through the same retrieval-QA chain.
query2 = "how many laptops sold in south region"
print(f"\nQuestion: {query2}")
# Use `.invoke(...)`: calling the chain object directly (`qa_chain({...})`) is
# deprecated in LangChain 0.1.x and emits a deprecation warning.
result2 = qa_chain.invoke({"query": query2})
print("Answer:", result2["result"].strip())