In [1]:
import os
import warnings
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Dict
from langchain.chat_models import ChatOllama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain.output_parsers import PandasDataFrameOutputParser, PydanticOutputParser
from langchain.prompts import SystemMessagePromptTemplate, PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.document_loaders import CSVLoader


# Config
warnings.filterwarnings("ignore")

In [2]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
chat = ChatOllama(
    base_url="http://localhost:11434",
    model="mistral",
    callback_manager=callback_manager
)

In [3]:
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-V2")
loader = CSVLoader("./data/Kaa_store_product_data - Kaa_store_product_data.csv", encoding="utf-8")

In [4]:
documents = loader.load()[:500]

In [5]:
len(documents)

500

In [6]:
docsearch = Chroma(
    collection_name="docs_store",
    embedding_function=embedding_model,
    persist_directory="./docs-db",
)

In [7]:
import hashlib
import json
from langchain_core.documents import Document

def stable_hash(doc: Document) -> str:
    return hashlib.sha1(json.dumps(doc.metadata, sort_keys=True).encode()).hexdigest()

split_ids = list(map(stable_hash, documents))
docsearch.add_documents(documents, ids=split_ids)

['06a2ad2fd531276d465797c2ef16c247ecf264a5',
 '1017e8be9968b2ce1ba08bb23fbe9abed74ad611',
 '5764720b234f1e587edefd03367103aa97fbd4d6',
 'f76a766f62e8a196fb1dca7c2cbdb5c3cb62b3f3',
 'fe4eb7c146c46a3d26ffc982455c4d806d3e5ca1',
 '0867f7ea6d7bb326e51483efb363fcd2385e7c17',
 '1a3e43ee9cc4c657b8b476877157fc6951df5b0d',
 '0461eb50f81620475a01850bfefa4cee3d07cdeb',
 '088f3e6dd06bf47f677ac4d42e1369379faf97a6',
 'b859c55e693b76c0b0225665981e52e58fe64fd2',
 '308162197ff66c784bf50460024851facbff5d5e',
 'c8aad47529709b30086312cb57a38f11af4ab4cf',
 'bd9db85ed824d9c8e2264ff6e3170081983a8ad3',
 '43425019bcd1fee5744dffaefdca1dfe938c3897',
 '6c76abccf23c5a1c09fafc8ad9d6d8becbccdb1b',
 '75e0f7ea01eef89dfa61714a2f354b745768b331',
 '93fa38d55a64f0381da80edfd737d3bb4e7247c1',
 'a69961074817bb65a4e02abe0af182fb4cc1d3e1',
 'b251500e49b8efb1847583ffc87c82b0f8248f20',
 'd2687b2cd139473dfad7d2e54e19832dbb9fadde',
 '90185853f368ad2bfa09c30b65a4f6805653efa6',
 'b3eae98950efe0dae2ae66fda41f114b53e6aff8',
 '0c1785af

In [8]:
docsearch.persist()

In [9]:
import pandas as pd

response = docsearch.get(include=["metadatas", "documents", "embeddings"])
df = pd.DataFrame(
    {
        "id": response["ids"],
        "document": response["documents"],
        "embedding": response["embeddings"],
    }
)

In [10]:
df

Unnamed: 0,id,document,embedding
0,00140c72584745cf384d81b240f7eb68beeeedef,name: Gym Rings\ndescription: PS 7836 Gym Ring...,"[-0.05820460245013237, 0.030392680317163467, -..."
1,0033dcda2d184fb6d1af0bbc747ea6fb982ad0a1,name: adorne Under Cabinet Digital Music Kit -...,"[-0.03534001484513283, 0.013331648893654346, -..."
2,0040fccd71bede4a6151e71bac9142bfd727059a,name: 1 qt. #PPU11-07 Clary Sage Semi-Gloss En...,"[-0.04795233532786369, 0.03889307752251625, 0...."
3,006f1438a0340369ac53daa863a4fea97502372f,name: 5 gal. #MQ3-18 Ginger Sugar Hi-Gloss Ena...,"[-0.07589365541934967, 0.08348873257637024, 0...."
4,015c0cdb2104e72c7094ef485c1cda850a3ca69b,name: 1 gal. #680B-5 Strawberry Freeze Flat Lo...,"[-0.0312948115170002, 0.07809510827064514, 0.0..."
...,...,...,...
495,fdddeb40b6882bd81e8e5a79732213aa6c7c9186,name: 2.2-Ton Heavy-Duty Transmission Jack\nde...,"[-0.048424459993839264, -0.029662325978279114,..."
496,fe4eb7c146c46a3d26ffc982455c4d806d3e5ca1,name: Salzburg Itzling Light Blue 4' 0 x 4' 0 ...,"[0.007020460441708565, 0.026146333664655685, 0..."
497,fecc15acfd632c16cb60601bdda03cf87e47d122,name: Loft 42 in. Black and OnyxBench\ndescri...,"[-0.023166192695498466, 0.02913539856672287, 0..."
498,ff53d84c2ac7f8ed588d40cab4cd30a1bbedc039,name: Heather Gray 20 in. x 20 in. Tufted Squa...,"[0.00035136949736624956, 0.02545757405459881, ..."


In [24]:
class ChatBot:
    def __init__(self) -> None:
        self.system_template = """
        You are an AI assistant designed to provide detailed answers about products in a store.
        You can help users by providing information, answering questions, and offering recommendations based on the context of their inquiries.
        Your goal is to assist users in making informed decisions about the products they are interested in
        """
        self.human_template = """{question}"""
        self.memory = ConversationBufferMemory()
        
    def Chat(self, query: str) -> str:
        system_prompt = SystemMessagePromptTemplate.from_template(self.system_template)
        human_prompt = HumanMessagePromptTemplate.from_template(self.human_template)
        chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
        prompt = chat_prompt.format_prompt(question=query).to_messages()
        qa = RetrievalQA(
            llm=chat,
            chain_type='stuff',
            retriever=docsearch.as_retriever(),
        )
        response = chain(query)
        return response


In [25]:
bot = ChatBot()

In [26]:
res = bot.Chat("what are the Features of Kilim Rust 5 ft. x 8 ft. Area Rug")

ValidationError: 3 validation errors for RetrievalQA
combine_documents_chain
  field required (type=value_error.missing)
chain_type
  extra fields not permitted (type=value_error.extra)
llm
  extra fields not permitted (type=value_error.extra)