In [15]:
import pandas as pd
import numpy as np

## Load Data

In [16]:
product_df = pd.read_csv("../data/processed/product_metadata.csv")
product_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry


## OpenAI Access

In [17]:
import os
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

In [18]:
from openai import OpenAI
client = OpenAI()

## Create embeddings and add to ChromaDB

In [19]:
import chromadb

#Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path="../data/chroma_storage")

In [22]:
chroma_client.list_collections()

[]

In [21]:
# Delete collection
# chroma_client.delete_collection("product_embeddings")

In [23]:
collection = chroma_client.create_collection(name="product_embeddings")

In [10]:
cc = chroma_client.get_collection(name="product_embeddings")

In [29]:
df = product_df.copy()

In [27]:
# df = df[(df['product_id'] >=1) & (df['product_id'] <=5000)]

In [30]:
df = df.iloc[:5000]

In [31]:
print(df.shape[0])
print(df.head(1))
print(df.tail(1))

5000
   product_id                product_name  aisle_id  department_id  \
0           1  Chocolate Sandwich Cookies        61             19   

           aisle department  
0  cookies cakes     snacks  
      product_id                            product_name  aisle_id  \
4999        6691  Hard Boiled Eggs Peeled & Ready To Eat        86   

      department_id aisle  department  
4999             16  eggs  dairy eggs  


In [32]:
batch_size = 100
num_batches = int(np.ceil(len(df) / batch_size))

for i in range(num_batches):
    batch = df[i * batch_size: (i + 1) * batch_size]
    
    embeddings = []
    metadatas = []
    ids = []
    
    for index, row in batch.iterrows():
        response = client.embeddings.create(input=row['product_name'], model='text-embedding-3-small')
        embedding = response.data[0].embedding
        
        embeddings.append(embedding)
        metadatas.append({'product_name': row['product_name']})
        ids.append(str(row['product_id']))
    
    print(f"Batch-{i+1} done!")
    
    # Add batch to ChromaDB
    collection.add(
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )

    print(f"Batch-{i+1} added to chroma!")



Batch-1 done!
Batch-1 added to chroma!
Batch-2 done!
Batch-2 added to chroma!
Batch-3 done!
Batch-3 added to chroma!
Batch-4 done!
Batch-4 added to chroma!
Batch-5 done!
Batch-5 added to chroma!
Batch-6 done!
Batch-6 added to chroma!
Batch-7 done!
Batch-7 added to chroma!
Batch-8 done!
Batch-8 added to chroma!
Batch-9 done!
Batch-9 added to chroma!
Batch-10 done!
Batch-10 added to chroma!
Batch-11 done!
Batch-11 added to chroma!
Batch-12 done!
Batch-12 added to chroma!
Batch-13 done!
Batch-13 added to chroma!
Batch-14 done!
Batch-14 added to chroma!
Batch-15 done!
Batch-15 added to chroma!
Batch-16 done!
Batch-16 added to chroma!
Batch-17 done!
Batch-17 added to chroma!
Batch-18 done!
Batch-18 added to chroma!
Batch-19 done!
Batch-19 added to chroma!
Batch-20 done!
Batch-20 added to chroma!
Batch-21 done!
Batch-21 added to chroma!
Batch-22 done!
Batch-22 added to chroma!
Batch-23 done!
Batch-23 added to chroma!
Batch-24 done!
Batch-24 added to chroma!
Batch-25 done!
Batch-25 added to c

In [33]:
collection.count()

5000

## Similar Product Recommendation

In [34]:
def recommend_products(product_name, top_n=3):
    # Get embedding for the input description
    response = client.embeddings.create(input=product_name, model='text-embedding-3-small')
    input_embedding = response.data[0].embedding

    # Query Chromadb for similar embeddings
    results = collection.query(query_embeddings=input_embedding, n_results=top_n)

    # Extract and return recommended products
    recommendations = []
    metadatas = results['metadatas']

    product_names = [item['product_name'] for sublist in metadatas for item in sublist]

    for product_name in product_names:
        recommendations.append({'product_name': product_name})

    return recommendations


In [35]:
recommendations = recommend_products('Green Tea')
print(recommendations)


[{'product_name': 'Diet Green Tea'}, {'product_name': 'Green Tea Pomegranate'}, {'product_name': 'Unsweetened Green Tea'}]


In [34]:
response = client.embeddings.create(input='milk tea', model='text-embedding-3-small')
input_embedding = response.data[0].embedding
results = collection.query(query_embeddings=input_embedding, n_results=3)
print(results)
print(type(results))

{'ids': [['36207', '9844', '5459']], 'distances': [[0.9575251340866089, 1.0038809776306152, 1.0046604871749878]], 'metadatas': [[{'product_name': 'Mint & Honey Green Tea'}, {'product_name': 'Raspberry Iced Tea'}, {'product_name': 'Chamomile Tea'}]], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
<class 'dict'>


In [43]:
recommendations = []
metadatas = results['metadatas']

product_names = [item['product_name'] for sublist in metadatas for item in sublist]

for product_name in product_names:
    recommendations.append({'product_name': product_name})

print(recommendations)

['Mint & Honey Green Tea', 'Raspberry Iced Tea', 'Chamomile Tea']
[{'product_name': 'Mint & Honey Green Tea'}, {'product_name': 'Raspberry Iced Tea'}, {'product_name': 'Chamomile Tea'}]


## Basket Recommendation

In [64]:
system_prompt = "You are an AI assistant functioning \
as a recommendation system for ecommerce website. \
Be specific and limit your answers to the requested format. \
Keep your answers short and concise."

In [68]:
def get_user_prompt(ordered_list_of_items):

    prompt=None
    if len(ordered_list_of_items) > 0:
        items = ', '.join(ordered_list_of_items)
        prompt = f"A user bought the following items: {items}. What next 3 items would he/she be likely to purchase next?"
        prompt += "Express your response as a JSON object with a key of 'next_items and a value representing your array of recommended items."
    
    return prompt


In [72]:
# Retrieve user prompt
user_prompt = get_user_prompt(
    ['bread', 'coffee']
)
print(user_prompt)

A user bought the following items: bread, coffee. What next 3 items would he/she be likely to purchase next?Express your response as a JSON object with a key of 'next_items and a value representing your array of recommended items.


In [73]:
# Test Prompt
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
       {'role': 'system', 'content': system_prompt},
       {'role': 'user', 'content': user_prompt} 
    ],
)

message = response.choices[0].message.content
print(message)

```json
{
  "next_items": ["butter", "jam", "milk"]
}
```


In [74]:
next_items = ['match green tea', 'milk tea']

In [75]:
def recommend_products(next_items, top_n=5):
    # Get embedding for the input description
    response = client.embeddings.create(input=next_items, model='text-embedding-3-small')
    input_embedding = response.data[0].embedding

    # Query Chromadb for similar embeddings
    results = collection.query(query_embeddings=input_embedding, n_results=top_n)

    # Extract and return recommended products
    recommendations = []
    metadatas = results['metadatas']

    product_names = [item['product_name'] for sublist in metadatas for item in sublist]

    for product_name in product_names:
        recommendations.append({'product_name': product_name})

    return recommendations


In [76]:
next_items = ['match green tea', 'milk tea']
recommendations = recommend_products(next_items)
print(recommendations)

[{'product_name': 'Classic Green Tea'}, {'product_name': 'Organic Pure Green Tea'}, {'product_name': 'China Green Tips Green Tea'}, {'product_name': 'Green Tea with Mint'}, {'product_name': 'Mint & Honey Green Tea'}]
