# RAG for FindAI

<img src="RAG_FindAI.jpg" alt="drawing" width="800"/>

In [2]:
### LLM
from langchain_ollama import ChatOllama

# Ollama model tag shared by both chat clients below.
local_llm = "llama3.2:3b-instruct-fp16"
# General-purpose chat client. temperature=0 is passed, presumably to keep
# generations as repeatable as the backend allows — confirm against ChatOllama docs.
llm = ChatOllama(model=local_llm, temperature=0)
# Same model and temperature, with format="json" passed so replies are requested as JSON.
llm_json_mode = ChatOllama(model=local_llm, temperature=0, format="json")

In [3]:
#### Search ####

import os
import getpass

def _set_env(var: str):
    """Make sure the environment variable ``var`` has a value.

    When ``var`` is already set to a non-empty value nothing happens.
    Otherwise the user is prompted (input hidden via ``getpass``) and the
    answer is stored in ``os.environ`` under ``var``.
    """
    already_configured = bool(os.environ.get(var))
    if already_configured:
        return
    prompt_text = f"{var}: "
    os.environ[var] = getpass.getpass(prompt_text)

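# NOTE: never paste API keys into the notebook (a key committed here must be rotated); supply TAVILY_API_KEY via the environment or the prompt below.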
# Prompt for the Tavily key only when it is not already present in the environment.
_set_env("TAVILY_API_KEY")
# NOTE(review): presumably read by the Hugging Face tokenizers library to control
# its parallelism — confirm this is the intended value.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [5]:
#### Vector Store ####

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_core.documents import Document
import pandas as pd

# Load the item catalogue; the CSV's first column becomes the DataFrame index (index_col=0).
df = pd.read_csv('DataSets/ItemDataWithAiDescription.csv', index_col=0)

def _first_image_url(raw_urls) -> str:
    """Extract the first image URL from an ``ImageUrls`` cell.

    The cell appears to hold the string form of a Python list, e.g.
    ``"['https://a.jpg', 'https://b.jpg']"``. It is parsed with
    ``ast.literal_eval`` so that single-element lists (``"['https://a.jpg']"``)
    no longer keep a trailing ``']`` as the old split/replace parsing did.

    Args:
        raw_urls: Raw cell value. Normally a string; a real list/tuple is also
            accepted, and any other value (e.g. NaN for a missing cell) is
            treated as "no image".

    Returns:
        The first URL, or an empty string when no URL is available.
    """
    import ast  # local import so this cell does not depend on another cell's imports

    if isinstance(raw_urls, (list, tuple)):
        return str(raw_urls[0]) if raw_urls else ''
    if not isinstance(raw_urls, str):
        # Missing values read by pandas arrive as float NaN, not as a string.
        return ''

    text = raw_urls.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return str(parsed[0]) if parsed else ''
    if isinstance(parsed, str):
        return parsed

    # Fallback for text that is not a valid Python literal: keep the original
    # split/replace behaviour, but also strip a dangling closing bracket.
    first = text.split("', ")[0].replace("['", "")
    if first.endswith("']"):
        first = first[:-2]
    return first


def get_document(row: pd.Series):
    """Convert one catalogue row into a LangChain ``Document``.

    Args:
        row: A row with the columns ``ProductName``, ``Brand``, ``Price``,
            ``Category``, ``ImageUrls`` and ``AIDescription``.

    Returns:
        A ``Document`` whose ``page_content`` is the row's ``AIDescription``
        and whose metadata carries the product name, brand, price, category
        and the first image URL (empty string when the row has none).

    Raises:
        KeyError: If one of the expected columns is missing from ``row``.
    """
    metadata = {
        'product_name': row['ProductName'],
        'brand': row['Brand'],
        'price': row['Price'],
        'category': row['Category'],
        'image_url': _first_image_url(row['ImageUrls']),
    }

    return Document(page_content=row['AIDescription'], metadata=metadata)

# Build one Document per catalogue row. The column is kept on `df` so each
# row stays aligned with its document.
df['document'] = df.apply(get_document, axis=1)

documents = df['document'].to_list()

# Add to vectorDB
vectorstore = SKLearnVectorStore.from_documents(
    documents=documents,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
)

# Create retriever
# The number of results has to be passed through `search_kwargs`; a bare
# `k=3` keyword is not a retriever setting, so the default result count
# would be used instead of 3.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [9]:
# Smoke-test the retriever with a free-text query; the cell output is the list of matching Documents.
retriever.invoke('White t shirt from Balenciaga. Tight fit')

[Document(metadata={'id': '6846c676-2d69-497e-b8a0-3745306de317', 'product_name': 'Balenciaga Technical Mesh Short Sleeve T-Shirt', 'brand': 'balenciaga', 'price': 425.0, 'category': 'T-Shirts', 'image_url': 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/1/8/18-07-24-LS_793288-4E3B3-1081_1_1.jpg'}, page_content="The Balenciaga Technical Mesh Short Sleeve T-Shirt is categorized under T-Shirts and showcases a modern activewear design. This item is crafted from a blend of materials, specifically 85% polyester and 15% elastane, which contributes to its lightweight and breathable quality.\n\nThe brand associated with this t-shirt is Balenciaga, known for its contemporary and innovative fashion. The primary color of the t-shirt is not specified, but it features a reflective activewear logo printed prominently at the front, enhancing visibility in low-light conditions, making it suitable for various activities. The design incorporates ribbed trims

In [8]:
### Query alignment
import json
from langchain_core.messages import HumanMessage, SystemMessage

# Product categories listed in the alignment prompt. The order is preserved
# because the list is joined verbatim into the prompt text.
categories = [
    "Hoodies & Sweats", "Trousers", "Coats & Jackets", "Shorts",
    "Bags", "T-Shirts", "Jewellery", "Belts",
    "Swimwear", "Shirts", "Accessories", "Hats",
    "Knitwear", "Socks", "Lifestyle", "Sweat Pants",
    "Polo Shirts", "Jeans", "Sunglasses", "Scarves & Gloves",
    "Wallets & Keychains", "Publications", "Sportswear", "Boots",
    "Perfume & Fragrance", "Sneakers", "Sandals & Slides", "Home Decoration",
    "Tableware", "Watches", "Underwear", "Loungewear",
    "Shoe Care & Accessories", "Soft Furnishings", "Lighting", "Storage & Organisers",
    "Glassware", "Home Fragrance", "Shoes", "Slippers",
    "Running Shoes",
]

# Example item description (for a hoodie) that is interpolated into the system prompt below as a formatting reference.
reference_description = "The item is categorized as a hoodie and falls under the category of Hoodies & Sweats. It is produced by the brand amiri, known for its blend of rock ‘n’ roll and Ivy League styles. The primary material of this hoodie is 100% French terry cotton, offering both warmth and comfort. \n\nIn terms of color, the main color is not specified; however, given the brand's aesthetic, it typically features muted or muted tones associated with casual streetwear. The hoodie may also have a core logo embossed on the chest, which is a notable design feature reflecting the label's bold graphic influence.\n\nThe fit is designed for comfort and practicality, featuring a fixed hood for additional warmth and coverage. It incorporates a kangaroo pocket, ideal for stashing small essentials or keeping hands warm. The ribbed trims add a touch of structure to the overall style, enhancing the hoodie’s casual yet refined appearance.\n\nThis hoodie represents a luxury item, appealing to individuals in the fashion-forward circles, like Hollywood celebrities. It is typically suitable for casual occasions, streetwear outings, or lounging, embodying a chic yet laid-back aesthetic that aligns with modern street style. Overall, this piece combines comfort with high-end design elements, making it a distinctive addition to any casual wardrobe."

# Prompt
# System prompt for the query-alignment step: it asks the model to expand a
# short user query into a full item description. The category list and the
# reference description are interpolated by the f-string.
# Fixes: removed a stray literal "f" that was emitted directly before the
# reference description, and corrected the misspelling of "hoodie" in the
# instruction text.
router_instructions = f"""You are an expert at creating a detailed item description from a user query about an item.

The structure of the detailed description should begin by identifying the type and category, then specify the material, brand, main color, and any secondary colors or patterns. Describe the fit and style and include any notable design features and whether it is a luxury item or not. Add the typical occasion or style it's intended for, and any other unique characteristics that can be gathered.

Here are the possible categories: {', '.join(categories)}

The description should be purely descriptive and objective. If there is any information that can't be assumed from the user query then leave it out of the description. The response should be in full sentences without bullet points. Here is a reference of how the description should look like for a hoodie: {reference_description}"""

# Test query_aligner
def _align_query(user_query: str):
    """Send one user query to ``llm`` together with the alignment system prompt.

    Returns whatever ``llm.invoke`` returns for the two-message conversation
    (system instructions followed by the user's query).
    """
    conversation = [
        SystemMessage(content=router_instructions),
        HumanMessage(content=user_query),
    ]
    return llm.invoke(conversation)


test_hoodie = _align_query(
    "I'm looking for a black hoodie from either Gucci or Yves Saint Laurent, oversized and no explicit brand names on the front of the hoodie."
)
test_trousers = _align_query(
    "Do you have wide legged trousers in plain navy or dark blue, that is under £200"
)
test_fragrance = _align_query(
    "Living room fragrances or candles. Wooden smells"
)

# Print the three generated descriptions in the same order as they were requested.
aligned_results = (test_hoodie, test_trousers, test_fragrance)
print(*(result.content for result in aligned_results))

ConnectError: [Errno 61] Connection refused

To search with filters on metadata (e.g., extracting the category and brand from the query and then restricting retrieval to items that match them), use the filtering approach described here: https://github.com/langchain-ai/langchain/discussions/18196

In [10]:
# Baseline: pass the raw user query straight to the retriever, without the query-alignment prompt.
retriever.invoke("I'm looking for a black hoodie from either Gucci or Yves Saint Laurent, oversized and no explicit brand names on the front of the hoodie.")

[Document(metadata={'id': '1c2acd14-42cc-4fdc-acf9-4c8bef1f5087', 'product_name': 'Saint Laurent Oversized Hoodie', 'brand': 'saint-laurent', 'price': 1185.0, 'category': 'Hoodies & Sweats', 'image_url': 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/0/5/05-08-2024-LB_799171-Y36SW-9950_1_1.jpg'}, page_content="The item is an **Oversized Hoodie**, categorized under **Hoodies & Sweats**. It is produced by the brand **Saint Laurent**, known for its luxury fashion offerings. \n\nThis hoodie is made entirely from **100% cotton**, which ensures a soft and comfortable feel against the skin, while also providing warmth due to its snug ribbed trims. The main color of the hoodie is a **pastel hue**, which offers a soft and stylish appearance, appealing to contemporary fashion sensibilities. Additionally, it features a **woven brand label** positioned by the hem, providing a subtle yet recognizable branding element that contrasts with the overall colo