## Loading File

In [1]:
!pip install --q unstructured langchain
!pip install --q "unstructured[all-docs]"


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\Nitro 5\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\Nitro 5\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema import Document

In [2]:
# CSV file to analyse (an absolute path on the author's machine).
local_path = "D:/Super AI SS4/AI-builder/Large-Language-Models-(LLMs)/try-it/RAG/data/TBL4-Online-Shopping-Dataset.csv/TBL4-Online-Shopping-Dataset.csv"

# Read only the first `nrows` rows of the file into the working DataFrame.
nrows = 200
df = pd.read_csv(filepath_or_buffer=local_path, nrows=nrows)

In [3]:
# Display the (rows, columns) shape of the loaded DataFrame.
df.shape

(200, 15)

## Vector Embeddings

In [4]:
# List the models known to the local Ollama installation.
!ollama list

NAME                   	ID          	SIZE  	MODIFIED    
nomic-embed-text:latest	0a109f422b47	274 MB	3 hours ago	
mistral:latest         	2ae6f6dd7a3d	4.1 GB	2 days ago 	
llama3:latest          	365c0bd3c000	4.7 GB	2 days ago 	


In [5]:
# Pull the nomic-embed-text model, which is the model name passed to OllamaEmbeddings below.
!ollama pull nomic-embed-text

pulling manifest
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                       

In [8]:
!pip install --q chromadb
!pip install --q langchain-text-splitters


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\Nitro 5\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: C:\Users\Nitro 5\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
# One Document per DataFrame row; its text is the row rendered as
# "column: value" pairs, one pair per line.
documents = [
    Document(
        page_content="\n".join(f"{name}: {value}" for name, value in record.items())
    )
    for _index, record in df.iterrows()
]

In [7]:
# Chunking parameters handed to the splitter.
CHUNK_SIZE = 100
CHUNK_OVERLAP = 10

# Break every row document into overlapping pieces before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(documents)

In [8]:
# Embed the chunks with the nomic-embed-text model served by Ollama (progress
# display enabled) and store them in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

OllamaEmbeddings: 100%|██████████| 886/886 [31:12<00:00,  2.11s/it]


## Retrieval

In [9]:
# Chat model served by Ollama: `local_model` is the model name handed to ChatOllama,
# and `llm` is the client object used by the retriever and the answer chain below.
local_model = "llama3"
llm = ChatOllama(model=local_model)

In [10]:
# Prompt passed to the multi-query retriever: it asks the model for five
# alternative phrasings of the user's question, separated by newlines.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [11]:
# Wrap the vector store's retriever in a MultiQueryRetriever driven by `llm`
# and the question-rewriting prompt defined above.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Answer-generation prompt: given the retrieved context and the question, the
# model is told to reply with a single pandas expression over `df` and nothing else.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
Your answer should be a single line pandas command that can be applied to the dataframe 'df' to answer the question.
Make sure to use the correct column names as they appear in the context.
Do not include any explanations or print statements, just the pandas command.

Example:
Question: What is the average age of customers?
Answer: df['Age'].mean()

Now, answer the given question:
"""

In [12]:
# Assemble the RAG pipeline: the incoming question is sent to the retriever to
# fill {context} and passed through unchanged to fill {question}; the formatted
# prompt then goes to the LLM and the reply is reduced to a plain string.
prompt = ChatPromptTemplate.from_template(template)
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [13]:
# Example usage: run one question through the chain and show the question
# next to the pandas command the model produced.
question = "What is the total number of customers?"
result = chain.invoke(question)
print(
    f"Question: {question}",
    f"Pandas command: {result}",
    sep="\n",
)

OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.26s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


Question: What is the total number of customers?
Pandas command: df['CustomerID'].nunique()


In [14]:
# SECURITY: `result` is free-form text generated by the LLM and eval() executes it
# as Python code. Read the printed command before running this cell, and do not run
# it on output produced from prompts or data you do not trust.

# Models often pad the command with whitespace or wrap it in backticks; remove both.
command = result.strip().strip("`").strip()

# Expose only `df` and `pd` as globals so the command cannot touch other notebook
# variables. This narrows the namespace but is NOT a sandbox: builtins are still available.
answer = eval(command, {"df": df, "pd": pd})
print(f"Answer: {answer}")

Answer: 21


In [None]:
# Interactive query: type a question about the dataset; the cell output is the
# text returned by the chain. The input box is labelled so it is clear what to enter.
user_question = input("Question: ")
chain.invoke(user_question)

In [26]:
# Clean up: delete the collection behind `vector_db` (created above with
# collection_name="local-rag").
# NOTE(review): the call is `delete_collection()` on this one store, so presumably only
# that collection is removed rather than every collection in the database — confirm.
vector_db.delete_collection()