#### **This entire ipynb was just me figuring out the best way to approach this assesment and creating the vector data base !!!**

### Initilaization and Creating the Embeddings

In [33]:
from pinecone import Pinecone
from pinecone import ServerlessSpec
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import os

from dotenv import load_dotenv

load_dotenv()

pinecone_api_key = os.getenv('pincone_key')
hugging_face_api = os.getenv('hugging_face_key')

# Initialize Pinecone client
pc = Pinecone(api_key = pinecone_api_key)

# Define the index name and embedding dimension (768 for BGE embeddings)
index_name = "product-embedding"
dimension = 1024  #rs

# Check if the index already exists, if not, create it
if index_name not in pc.list_indexes().names():

    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",  
        spec=ServerlessSpec(
            cloud="aws",  
            region="us-east-1"  
        ),
        deletion_protection="disabled"  
    )
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

model_name = "BAAI/bge-large-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to convert text to embedding (vector)
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
  
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding


df = pd.read_csv("products.csv")

embeddings = []
metadata = []

for index, row in df.iterrows():
    product_description = row['product_description']
    

    embedding = get_embedding(product_description)  
    
 
    product_metadata = {
        'product_code': row['product_code'],
        'product_price': row['product_price'],
        'product_unit': row['product_unit']
    }
    
    embeddings.append(embedding)
    metadata.append(product_metadata)
    
print("Embeddings and metadata prepared.")




Index 'product-embedding' created.
Embeddings and metadata prepared.


### Upserting 

In [34]:

index = pc.Index(index_name)


upsert_data = []

for i in range(len(embeddings)):
    # Each upsert item consists of a unique ID (use product_id), the vector, and metadata
    upsert_data.append((
        str(df['product_id'][i]),  # Unique ID (can be product_id)
        embeddings[i],             # The embedding vector
        metadata[i]                # The metadata dictionary
    ))

# Upsert the embeddings and metadata into Pinecone
index.upsert(vectors=upsert_data)
print("Embeddings and metadata have been upserted successfully.")


Embeddings and metadata have been upserted successfully.


### Testing 

In [44]:

sample_description = "Butter Milk"  

# Get the embedding (vector) of the sample description
query_vector = get_embedding(sample_description)

query_vector = query_vector.tolist()

# Perform the query to Pinecone
results = index.query(
    vector=query_vector,  
    top_k=5,              
    include_metadata=True 
)


print("Query results:", results)


Query results: {'matches': [{'id': '243',
              'metadata': {'product_code': 'BMILK',
                           'product_price': 2.5,
                           'product_unit': 'EACH'},
              'score': 0.932030916,
              'values': []},
             {'id': '308',
              'metadata': {'product_code': 'GD202',
                           'product_price': 0.0,
                           'product_unit': 'CASE'},
              'score': 0.893846929,
              'values': []},
             {'id': '293',
              'metadata': {'product_code': 'EVA',
                           'product_price': 2.0,
                           'product_unit': 'EACH'},
              'score': 0.88635546,
              'values': []},
             {'id': '280',
              'metadata': {'product_code': 'DM100',
                           'product_price': 1.45,
                           'product_unit': 'PCS'},
              'score': 0.881391823,
              'values': []},
        

### Llama Testing

In [50]:
from huggingface_hub import InferenceClient

# Initialize the client with your API key
client = InferenceClient(api_key = hugging_face_api)

# Define messages for the chat model
messages = [
    {
        "role": "user",
        "content": "What is the capital of France?"
    }
]

# Call the chat completion endpoint
try:
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=messages,
        max_tokens=500
    )
    
    # Extract and print the content
    answer = completion.choices[0].message.content  ]
    print("Answer:", answer)
except Exception as e:
    print(f"An error occurred: {e}")


Answer: The capital of France is Paris.


### Testing of LLAMA model 

In [3]:
from huggingface_hub import InferenceClient
import pandas as pd
import json  

client = InferenceClient(api_key=hugging_face_api)


user_input = """Hey, Here's what I need for tomorrow:
2lbs Saku tuna blocks
2 8/12 shrimp
2lbs canela
1 queso blanco
10 lbs brisket
1 Mirin
Konbu 5 lb
3 house firm tofu
1 pcs nori
1 bag dashi mushroom
1 case onion beer
2 yuzu green
3 flat lid 16/24oz
1 cheesecloth"""


messages = [
    {
        "role": "user",
        "content": f"""
Please extract the grocery items and their quantities from the following text. Present them in the format:

    Item: Quantity

For example:
    - Carrot: 2 lbs
    - Tofu: 3 pcs

**Guidelines for Extraction:**
1. Only include items that are clear products with a quantity. For example: "Carrot: 2 lbs" or "Tofu: 3 pcs."
2. Ignore conversational filler or irrelevant details such as "Hi, I need," "I am Deepan," or "I want." Focus only on the grocery items.
3. If the quantity includes a size or measurement range (e.g., '8/12 shrimp' or 'flat lid 16/24 oz'), ensure it is captured correctly.
    Example: '2 8/12 shrimp' means 2 shrimp of size 8/12, and 'flat lid 16/24 oz' means flat lids ranging from 16 oz to 24 oz.
4. Be cautious with special phrases that might indicate quantity ranges or variations. Ensure those are captured as part of the quantity.

**Input text:**
{user_input}

Please only return the extracted grocery items in the format provided (Item: Quantity). Remove any unnecessary text.
"""
    }
]


# Call the chat completion endpoint
try:
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=messages,
        max_tokens=500
    )
    
    # Extract the response from LLaMA
    response = completion.choices[0].message.content
    print("LLaMA response:", response)
    
    # Parse the structured response (assuming LLaMA provides the correct format)
    items = {}
    lines = response.split('\n')
    for line in lines:
        if ':' in line:
            item, quantity = line.split(':', 1)
            item = item.strip().strip('-')  # Strip hyphens and leading/trailing whitespace
            items[item] = quantity.strip()
    
    # Print the structured result
    print("Structured result:")
    for item, quantity in items.items():
        print(f"{item}: {quantity}")
    
    # Convert the result to JSON
    json_output = json.dumps(items, indent=4)
    print("\nJSON Output:")
    print(json_output)

except Exception as e:
    print(f"An error occurred: {e}")


LLaMA response: - A/B loins ( Quantity: not given, omitted)
- Canela: 2 lbs
- Queso blanco: 1 
- Brisket: 10 lbs
- Mirin: 1 
- Konbu: 5 lbs 
- Firm tofu: 3 pieces
- Nori: 1 piece
- Dashi mushroom: 1 bag 
- Onion beer: 1 case
- Yuzu green: 2
- Flat lid (cheese butter/ cream/ or cheese blan) :  3 pieces 16-24oz
Structured result:
 A/B loins ( Quantity: not given, omitted)
 Canela: 2 lbs
 Queso blanco: 1
 Brisket: 10 lbs
 Mirin: 1
 Konbu: 5 lbs
 Firm tofu: 3 pieces
 Nori: 1 piece
 Dashi mushroom: 1 bag
 Onion beer: 1 case
 Yuzu green: 2
 Flat lid (cheese butter/ cream/ or cheese blan): 3 pieces 16-24oz

JSON Output:
{
    " A/B loins ( Quantity": "not given, omitted)",
    " Canela": "2 lbs",
    " Queso blanco": "1",
    " Brisket": "10 lbs",
    " Mirin": "1",
    " Konbu": "5 lbs",
    " Firm tofu": "3 pieces",
    " Nori": "1 piece",
    " Dashi mushroom": "1 bag",
    " Onion beer": "1 case",
    " Yuzu green": "2",
    " Flat lid (cheese butter/ cream/ or cheese blan)": "3 pieces 16

### Here we have taked the code from the meta and checked it with the csv

In [21]:
from huggingface_hub import InferenceClient
import pandas as pd
import json
from pinecone import Pinecone
import torch
from transformers import AutoTokenizer, AutoModel




pc = Pinecone(api_key=pinecone_api_key)


index_name = "product-embedding"
dimension = 1024  # This is the size for the BGE model's output vectors

# Load the Pinecone index
index = pc.Index(index_name)

# Load the LLaMA model
client = InferenceClient(api_key=hugging_face_api)
model_name = "BAAI/bge-large-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to convert text to embedding (vector)
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

# Define the input message (user's request for groceries)
user_input = """ Hey,Here's what I need for tomorrow:
•	2lbs Saku tuna blocks
•	2 8/12 shrimp
•	2lbs canela
•	1 queso blanco
•	10 lbs brisket
•	1 Mirin
•	Konbu 5lb
•	3 house firm tofu
•	1 pcs nori
•	1 bag dashi mushroom
•	1 case onion beer
•	2 yuzu green
•	3 flat lid 16/24oz
•	1 cheesecloth

"""


# Create a conversation with LLaMA to extract grocery items and quantities
messages = [
    {
        "role": "user",
        "content": f"""
You are an AI assistant tasked with extracting grocery items and their quantities from the provided text. The response must strictly adhere to the following rules:

**Output Format:**
- [Item Name and Size/Unit]: [Quantity]

**Examples:**
- Saku Tuna Blocks 2lbs: 1
- Shrimp 8/12: 2
- Brisket 10lbs: 1
- Onion Beer Case: 1
- Dashi Mushroom Bag: 1

**Important Extraction Guidelines:**
1. **Quantity vs. Size/Unit**:
   - If a size/unit like "2lbs" appears, it is always part of the item name.
   - The quantity refers only to the **number of items** (e.g., 1 block of Saku Tuna 2lbs → `Saku Tuna Blocks 2lbs: 1`).
   - Example:
     - Input: "2lbs Saku Tuna Blocks" → Output: Saku Tuna Blocks 2lbs: 1
     - Input: "2 8/12 shrimp" → Output: Shrimp 8/12: 2
2. **Units or Packaging Types**:
   - Include units like `case`, `bag`, `pcs`, or specific sizes (e.g., `16/24oz`) as part of the item name.
   - Example:
     - Input: "1 case onion beer" → Output: Onion Beer Case: 1
     - Input: "1 bag dashi mushroom" → Output: Dashi Mushroom Bag: 1
3. **Only Numeric Quantities in the Quantity Field**:
   - The numeric quantity represents the count of the item, not its size or weight.
   - Example:
     - Input: "10 lbs brisket" → Output: Brisket 10lbs: 1
4. **Exclude Filler Text**:
   - Ignore phrases like "Hi, I need," or "Can you get me."
   - Focus only on the product names and quantities.
5. **Capture Order of Words**:
   - If the size/unit appears before or after the item name (e.g., "2lbs brisket" or "brisket 2lbs"), treat it consistently as part of the item name.

**Input Text:**
{user_input}

**Output Requirements:**
- Each item should be on a new line in the format `[Item Name and Size/Unit]: [Quantity]`.
- Do not add extra explanations or comments. Only provide the extracted data in the specified format.

Begin extracting the grocery items and quantities.
"""
    }
]



try:
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=messages,
        max_tokens=500
    )
    
    # Extract the response from LLaMA
    response = completion.choices[0].message.content
    print("LLaMA response:", response)
    

    items = {}
    lines = response.split('\n')
    for line in lines:
        if ':' in line:
            item, quantity = line.split(':', 1)
            # Remove leading bullet points or other unwanted characters
            item = item.strip().strip('-').lstrip('\u2022').strip()  
            items[item] = quantity.strip()

    # Remove the first unnecessary item from the dictionary
    if "Here are the extracted grocery items and their quantities" in items:
        del items["Here are the extracted grocery items and their quantities"]

    # Print the structured result
    print("Structured result:")
    for item, quantity in items.items():
        print(f"{item}: {quantity}")

    # Convert the result to JSON
    json_output = json.dumps(items, indent=4)
    print("\nJSON Output:")
    print(json_output)


    # Load the products CSV file
    df_products = pd.read_csv("products.csv")
    
    # Now, let's convert the items (keys) into embeddings and query Pinecone
    for item in items.keys():
        # Convert the key (item) into an embedding
        query_vector = get_embedding(item)
        query_vector = query_vector.tolist()  # Convert the numpy array to a list
        

        results = index.query(
            vector=query_vector, 
            top_k=1,              
            include_metadata=True 
        )
        
        # Print the results
        print(f"\nResults for '{item}':")
        print("Query results:", results)

        # Print the metadata of the top matches and fetch the product description
        for match in results['matches']:
            product_code = match['metadata']['product_code']  
            print(f"Product Code: {product_code}")
            
            # Look for the product_code in the products DataFrame
            product_row = df_products[df_products['product_code'] == product_code]
            
            if not product_row.empty:
                product_description = product_row.iloc[0]['product_description']
                print(f"Product Description: {product_description}")
            else:
                print("Product not found in the CSV.")
            
            print(f"Similarity score: {match['score']}")
            
            # Now, let's compare the original item (from the LLaMA response) and the matched product description
            print(f"Original Item from User Input: '{item}'")
            print(f"Matched Product Description from CSV: '{product_description}'")
            print("-" * 50)

except Exception as e:
    print(f"An error occurred: {e}")


LLaMA response: Saku Tuna Blocks 2lbs: 1
Shrimp 8/12: 2
Canela 2lbs: 1
Queso Blanco: 1
Brisket 10lbs: 1
Mirin: 1
Konbu 5lb: 1
House firm Tofu 3pcs: 1
Nori 1 pcs: 1
Dashi Mushroom Bag: 1
Onion Beer Case: 1
Yuzu green 2: 1
Flat Lid 16/24oz: 3
Cheesecloth: 1
Structured result:
Saku Tuna Blocks 2lbs: 1
Shrimp 8/12: 2
Canela 2lbs: 1
Queso Blanco: 1
Brisket 10lbs: 1
Mirin: 1
Konbu 5lb: 1
House firm Tofu 3pcs: 1
Nori 1 pcs: 1
Dashi Mushroom Bag: 1
Onion Beer Case: 1
Yuzu green 2: 1
Flat Lid 16/24oz: 3
Cheesecloth: 1

JSON Output:
{
    "Saku Tuna Blocks 2lbs": "1",
    "Shrimp 8/12": "2",
    "Canela 2lbs": "1",
    "Queso Blanco": "1",
    "Brisket 10lbs": "1",
    "Mirin": "1",
    "Konbu 5lb": "1",
    "House firm Tofu 3pcs": "1",
    "Nori 1 pcs": "1",
    "Dashi Mushroom Bag": "1",
    "Onion Beer Case": "1",
    "Yuzu green 2": "1",
    "Flat Lid 16/24oz": "3",
    "Cheesecloth": "1"
}

Results for 'Saku Tuna Blocks 2lbs':
Query results: {'matches': [{'id': '525',
              'metadat

In [15]:
from huggingface_hub import InferenceClient
import pandas as pd
import json
from pinecone import Pinecone
import torch
from transformers import AutoTokenizer, AutoModel


def backend(user_input):

    # Initialize Pinecone client
    pc = Pinecone(api_key=pinecone_api_key)


    index_name = "product-embedding"
    dimension = 1024  


    index = pc.Index(index_name)


    client = InferenceClient(api_key=hugging_face_api)
    model_name = "BAAI/bge-large-en"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Function to convert text to embedding (vector)
    def get_embedding(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        return embedding

  
    messages = [
        {
            "role": "user",
            "content": f"""
    Please extract the grocery items and their quantities from the following text in the format:

    Item: Quantity

    For example:
    Carrot: 2 lbs
    Tofu: 3 pcs

    Special cases to consider:
    1. If the quantity includes a size or measurement range, like '8/12 shrimp' or 'flat lid 16/24oz', ensure it is captured correctly.
    Example: '2 8/12 shrimp' means 2 shrimp of size 8/12, and 'flat lid 16/24 oz' means flat lids in a range of 16 oz to 24 oz.

    Here is the input text:

    {user_input}
    """
        }
    ]

    # Call the chat completion endpoint
    try:
        completion = client.chat.completions.create(
            model="meta-llama/Llama-3.2-3B-Instruct",
            messages=messages,
            max_tokens=500
        )
        
        # Extract the response from LLaMA
        response = completion.choices[0].message.content
        print("LLaMA response:", response)
        
     
        items = {}
        lines = response.split('\n')
        for line in lines:
            if ':' in line:
                item, quantity = line.split(':', 1)

                item = item.strip().strip('-').lstrip('\u2022').strip()  
                items[item] = quantity.strip()

        # Remove the first unnecessary item from the dictionary
        if "Here are the extracted grocery items and their quantities" in items:
            del items["Here are the extracted grocery items and their quantities"]

        # Print the structured result
        print("Structured result:")
        for item, quantity in items.items():
            print(f"{item}: {quantity}")

        # Convert the result to JSON
        json_output = json.dumps(items, indent=4)
        print("\nJSON Output:")
        print(json_output)


        # Load the products CSV file
        df_products = pd.read_csv("products.csv")
        
        # Now, let's convert the items (keys) into embeddings and query Pinecone
        for item in items.keys():
            # Convert the key (item) into an embedding
            query_vector = get_embedding(item)
            query_vector = query_vector.tolist()  # Convert the numpy array to a list
            
        
            results = index.query(
                vector=query_vector,  
                top_k=5,              
                include_metadata=True
            )
            
            # Print the results
            print(f"\nResults for '{item}':")
            print("Query results:", results)

            # Print the metadata of the top matches and fetch the product description
            for match in results['matches']:
                product_code = match['metadata']['product_code']  # Retrieve product_code from metadata
                print(f"Product Code: {product_code}")
                
                # Look for the product_code in the products DataFrame
                product_row = df_products[df_products['product_code'] == product_code]
                
                if not product_row.empty:
                    product_description = product_row.iloc[0]['product_description']
                    print(f"Product Description: {product_description}")
                else:
                    print("Product not found in the CSV.")
                
                print(f"Similarity score: {match['score']}")
                
                # Now, let's compare the original item (from the LLaMA response) and the matched product description
                print(f"Original Item from User Input: '{item}'")
                print(f"Matched Product Description from CSV: '{product_description}'")
                print("-" * 50)

    except Exception as e:
        print(f"An error occurred: {e}")
        
    


In [14]:
df = pd.read_csv("products.csv")

product = df['product_description']

product[:30]

0                               #Guajillo Mexicano - LB
1                                1” Cut Beef Shank - LB
2                 12 Oz Heavy Plastic Deli Combo - CASE
3                     12Oz Deli Combo Containers - CASE
4                16 Oz Heavy Plastics Deli Combo - EACH
5                     24 Ply Cotton Butcher Twine - PCS
6       24-32Oz To Go Microwavable Lids( Lh7200) - CASE
7                           2Oz Plastic Ramenkin - CASE
8                      2Oz Plastic Ramenkin Lids - CASE
9       32Oz Deli Plastic Container Bottom (480) - EACH
10         32Oz To Go Microwavable Bowls (M7232) - CASE
11                                 3M Tape Green - EACH
12              500G Ina Agar (Kona Kanten) 250G - EACH
13             64Oz Deli Plastic Container Combo - CASE
14            64Oz Heavy Plastic Deli Containers - CASE
15      64Oz Heavy Plastic Deli Containers Combo - CASE
16    720 Btl Ryujin Shuzo "Oze No Yukidore Hiyaoros...
17      720 Ml Btl Amabuki Gin No Kurenai Junmai