In [4]:
import pandas as pd
import numpy as np

## Load Data

In [5]:
product_df = pd.read_csv("../app/data/processed/product_metadata.csv")
product_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry


## OpenAI Access

In [9]:
import os
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

In [10]:
from openai import OpenAI
client = OpenAI()

## Create embeddings and add to ChromaDB

In [2]:
import chromadb

#Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path="../data/chroma_storage")

In [29]:
# Delete collection
# chroma_client.delete_collection("product_embeddings")

In [3]:
collection = chroma_client.create_collection(name="all_product_embeddings")

In [12]:
cc = chroma_client.get_collection(name="all_product_embeddings")

In [None]:
df = product_df.copy()

In [None]:
batch_size = 500
num_batches = int(np.ceil(len(df) / batch_size))

for i in range(num_batches):
    batch = df[i * batch_size: (i + 1) * batch_size]
    
    embeddings = []
    metadatas = []
    ids = []
    
    for index, row in batch.iterrows():
        response = client.embeddings.create(input=row['product_name'], model='text-embedding-3-small')
        embedding = response.data[0].embedding
        
        embeddings.append(embedding)
        metadatas.append({'product_name': row['product_name']})
        ids.append(str(row['product_id']))
    
    # Add batch to ChromaDB
    collection.add(
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )


## Similar Product Recommendation

In [58]:
def recommend_products(product_name, top_n=3):
    # Get embedding for the input description
    response = client.embeddings.create(input=product_name, model='text-embedding-3-small')
    input_embedding = response.data[0].embedding

    # Query Chromadb for similar embeddings
    results = collection.query(query_embeddings=input_embedding, n_results=top_n)

    # Extract and return recommended products
    recommendations = []
    metadatas = results['metadatas']

    product_names = [item['product_name'] for sublist in metadatas for item in sublist]

    for product_name in product_names:
        recommendations.append({'product_name': product_name})

    return recommendations


In [59]:
recommendations = recommend_products('Green Tea')
print(recommendations)


[{'product_name': 'Classic Green Tea'}, {'product_name': 'Green Tea with Mint'}, {'product_name': 'Organic Pure Green Tea'}]


In [34]:
response = client.embeddings.create(input='milk tea', model='text-embedding-3-small')
input_embedding = response.data[0].embedding
results = collection.query(query_embeddings=input_embedding, n_results=3)
print(results)
print(type(results))

{'ids': [['36207', '9844', '5459']], 'distances': [[0.9575251340866089, 1.0038809776306152, 1.0046604871749878]], 'metadatas': [[{'product_name': 'Mint & Honey Green Tea'}, {'product_name': 'Raspberry Iced Tea'}, {'product_name': 'Chamomile Tea'}]], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
<class 'dict'>


In [43]:
recommendations = []
metadatas = results['metadatas']

product_names = [item['product_name'] for sublist in metadatas for item in sublist]

for product_name in product_names:
    recommendations.append({'product_name': product_name})

print(recommendations)

['Mint & Honey Green Tea', 'Raspberry Iced Tea', 'Chamomile Tea']
[{'product_name': 'Mint & Honey Green Tea'}, {'product_name': 'Raspberry Iced Tea'}, {'product_name': 'Chamomile Tea'}]


## Basket Recommendation

In [64]:
system_prompt = "You are an AI assistant functioning \
as a recommendation system for ecommerce website. \
Be specific and limit your answers to the requested format. \
Keep your answers short and concise."

In [68]:
def get_user_prompt(ordered_list_of_items):

    prompt=None
    if len(ordered_list_of_items) > 0:
        items = ', '.join(ordered_list_of_items)
        prompt = f"A user bought the following items: {items}. What next 3 items would he/she be likely to purchase next?"
        prompt += "Express your response as a JSON object with a key of 'next_items and a value representing your array of recommended items."
    
    return prompt


In [72]:
# Retrieve user prompt
user_prompt = get_user_prompt(
    ['bread', 'coffee']
)
print(user_prompt)

A user bought the following items: bread, coffee. What next 3 items would he/she be likely to purchase next?Express your response as a JSON object with a key of 'next_items and a value representing your array of recommended items.


In [73]:
# Test Prompt
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
       {'role': 'system', 'content': system_prompt},
       {'role': 'user', 'content': user_prompt} 
    ],
)

message = response.choices[0].message.content
print(message)

```json
{
  "next_items": ["butter", "jam", "milk"]
}
```


In [74]:
next_items = ['match green tea', 'milk tea']

In [75]:
def recommend_products(next_items, top_n=5):
    # Get embedding for the input description
    response = client.embeddings.create(input=next_items, model='text-embedding-3-small')
    input_embedding = response.data[0].embedding

    # Query Chromadb for similar embeddings
    results = collection.query(query_embeddings=input_embedding, n_results=top_n)

    # Extract and return recommended products
    recommendations = []
    metadatas = results['metadatas']

    product_names = [item['product_name'] for sublist in metadatas for item in sublist]

    for product_name in product_names:
        recommendations.append({'product_name': product_name})

    return recommendations


In [76]:
next_items = ['match green tea', 'milk tea']
recommendations = recommend_products(next_items)
print(recommendations)

[{'product_name': 'Classic Green Tea'}, {'product_name': 'Organic Pure Green Tea'}, {'product_name': 'China Green Tips Green Tea'}, {'product_name': 'Green Tea with Mint'}, {'product_name': 'Mint & Honey Green Tea'}]
