## Load the CSV data

In [1]:
import pandas as pd
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import CSVLoader
import re
import ollama
import subprocess
from langchain.schema import Document

In [2]:
# Relative path to the CSV file that is indexed for retrieval.
data_file = '../rag_chain/output.csv'

In [4]:
# process csv file function
def process_csv_file(
    data: str,
    embedding_model: str = 'deepseek-r1',
    collection_name: str = 'amazon_collection',
    persist_directory=None,
):
    """Index a csv file in a Chroma vector store and return a retriever over it.

    The file is loaded with CSVLoader, the resulting documents are embedded
    with an Ollama model, and the embeddings are stored in a Chroma collection.

    args:
    * data:str - path to the csv file
    * embedding_model:str - name of the Ollama model used to embed the documents
    * collection_name:str - name of the Chroma collection the documents are stored in
    * persist_directory - optional directory passed to Chroma for on-disk storage;
      None (the default) is forwarded unchanged, matching the previous behaviour
      of not passing the argument at all

    return:
    * retriever object created from the vector store, or None when `data` is None

    """

    # Nothing to index without a file path; callers must handle the None result.
    if data is None:
        return None

    # create and load data into document objects
    docs = CSVLoader(data).load()

    # load the embedding model object
    # NOTE(review): the default 'deepseek-r1' is also the chat model used elsewhere
    # in this notebook; a dedicated embedding model may retrieve better - confirm.
    embeddings = OllamaEmbeddings(model=embedding_model)

    # create vector embeddings using chroma
    vectorstore = Chroma.from_documents(
        collection_name=collection_name,
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

    # create the retriever object
    return vectorstore.as_retriever()


In [5]:
# combine document function

def combine_docs(docs: list[Document]):
    """Concatenate the text of several documents into one string.

    The `page_content` of each document is taken in order and the pieces are
    separated by a blank line, giving a single block of text that can be
    placed into the model prompt.

    args:
    * docs: list[Document] - documents whose `page_content` is merged

    return:
    * str - the merged text (empty string when `docs` is empty)

    """
    separator = '\n\n'
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [6]:

def chat_model(question:str, context:str):
    """Ask the 'deepseek-r1' model a question together with retrieved context.

    The question and the context are merged into one prompt, sent as a single
    user message through ollama.chat(), and the reply text is returned with
    any <think>...</think> sections removed.

    args:
    * question:str - query
    * context:str - context from the retrieved document returned from the retriever

    return:
    * str - the reply text without <think> blocks, stripped of surrounding whitespace

    """

    # One user turn carries both the question and the retrieved context.
    user_message = {
        "role": "user",
        "content": f'Question: {question}\n\nContext:{context}',
    }

    reply = ollama.chat(
        model='deepseek-r1',
        messages=[user_message],
        stream=False,
    )
    raw_text = reply['message']['content']

    # DOTALL lets '.' match newlines, so multi-line <think> blocks are removed whole.
    think_block = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
    return think_block.sub("", raw_text).strip()

    

In [7]:
# defining the rag chain function
def rag_chain(question, retriever):
    """Answer a question using documents fetched by the retriever.

    args:
    * question - query passed to the retriever and then to the chat model
    * retriever - object whose `invoke(question)` returns the documents to use as context

    return:
    * the value returned by chat_model for the question and the merged context

    """
    # Retrieve, merge the retrieved documents into one context string, then ask the model.
    context = combine_docs(retriever.invoke(question))
    answer = chat_model(question, context)
    return answer

In [8]:
# Build the retriever from the csv file, then run one sample question through the chain.
retriever = process_csv_file(data_file)

result = rag_chain(
    question='what category has the cheapest product',
    retriever=retriever,
)

print(result)

The cheapest product among the given shoe options is **ID 38: Vibrant Runners: Bold Orange & Blue Sneakers** at $27.

**Answer:** The cheapest product is ID 38 with a price of \$27.
