In [4]:
import pandas as pd

## Load Data

In [5]:
product_df = pd.read_csv("../data/raw/product_metadata.csv")
product_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry


#### Select Random Data

In [6]:
filtered_df = product_df[product_df['aisle_id'] == 94] # aisle is tea.
filtered_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
86,87,Classics Earl Grey Tea,94,7,tea,beverages
96,97,Organic Chamomile Lemon Tea,94,7,tea,beverages
176,177,Citrus Terere Yerba Mate,94,7,tea,beverages
232,233,Chinese Breakfast Black Tea,94,7,tea,beverages


In [5]:
filtered_rows = filtered_df.shape[0]
print(f'Number of rows: {filtered_rows}')

Number of rows: 894


In [7]:
sample_df = filtered_df.sample(n=50, random_state=42)
sample_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
39148,39149,Cinnamon Apple Spice Herb Tea,94,7,tea,beverages
23729,23730,Tahitian Vanilla Hazelnut Tea Bags,94,7,tea,beverages
28871,28872,Lemon Flavor Iced Tea,94,7,tea,beverages
39705,39706,Unsweetened Organic Ginger Oasis Tea,94,7,tea,beverages
1926,1927,Cool Brew Peach Black Iced Tea,94,7,tea,beverages


In [13]:
sample_df.to_csv("../data/raw/sample_product_metadata.csv", index=False)

In [8]:
df = sample_df.copy()

## OpenAI Access

In [9]:
import os
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

In [10]:
from openai import OpenAI
client = OpenAI()

## Create Embedding

In [24]:
def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [32]:
df['product_embedding'] = df.product_name.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

In [33]:
df.to_csv('../data/processed/product_embeddings.csv', index=False)

In [10]:
embedding_df = pd.read_csv("../data/processed/product_embeddings.csv")
embedding_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department,product_embedding
0,39149,Cinnamon Apple Spice Herb Tea,94,7,tea,beverages,"[-0.018135325983166695, 0.011038893833756447, ..."
1,23730,Tahitian Vanilla Hazelnut Tea Bags,94,7,tea,beverages,"[-0.038035646080970764, -0.024901220574975014,..."
2,28872,Lemon Flavor Iced Tea,94,7,tea,beverages,"[-0.0034146751277148724, 0.0018586806254461408..."
3,39706,Unsweetened Organic Ginger Oasis Tea,94,7,tea,beverages,"[-0.0067253680899739265, -0.025777382776141167..."
4,1927,Cool Brew Peach Black Iced Tea,94,7,tea,beverages,"[0.030188988894224167, -0.013948407024145126, ..."


In [20]:
result = get_embedding("Mix Popcorn	")
print(result)

[-0.028404494747519493, 0.03662276640534401, -0.022751789540052414, -0.044826943427324295, -0.04443224146962166, -0.08063211292028427, 0.013300069607794285, 0.049140479415655136, 0.017451496794819832, -0.009338947013020515, 0.036932893097400665, 0.031491633504629135, 0.03659457340836525, 0.05218533053994179, 0.0019699891563504934, -0.005272099748253822, -0.011530956253409386, -0.006946061737835407, 0.012595243752002716, -0.018424157053232193, 0.009014726616442204, 0.03408539295196533, -0.02007344923913479, 0.01755017228424549, 0.041359201073646545, 0.001356790424324572, -0.05164966359734535, 0.08835700899362564, 0.03211187943816185, 0.0028474978171288967, 0.010198834352195263, -0.03625625744462013, -0.026149051263928413, -0.03431093692779541, -0.026952553540468216, -0.05317208543419838, -0.011989093385636806, 0.01553436927497387, -0.025726156309247017, -1.885960591607727e-05, -0.020186221227049828, -0.015844492241740227, -0.012926512397825718, 0.024697110056877136, 0.062419407069683075

## Create embeddings and add to ChromaDB

In [2]:
import chromadb

#Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path="../data/chroma_storage")

In [29]:
# Delete collection
# chroma_client.delete_collection("product_embeddings")

In [3]:
collection = chroma_client.create_collection(name="product_embeddings")

In [12]:
cc = chroma_client.get_collection(name="product_embeddings")

In [11]:
for index, row in df.iterrows():
    response = client.embeddings.create(input=row['product_name'], model='text-embedding-3-small')
    embedding = response.data[0].embedding
    
    # Add document to ChromaDB
    collection.add(
        embeddings=embedding,
        metadatas={'product_name': row['product_name']},
        ids=str(row['product_id'])
    )

## Similar Product Recommendation

In [58]:
def recommend_products(product_name, top_n=3):
    # Get embedding for the input description
    response = client.embeddings.create(input=product_name, model='text-embedding-3-small')
    input_embedding = response.data[0].embedding

    # Query Chromadb for similar embeddings
    results = collection.query(query_embeddings=input_embedding, n_results=top_n)

    # Extract and return recommended products
    recommendations = []
    metadatas = results['metadatas']

    product_names = [item['product_name'] for sublist in metadatas for item in sublist]

    for product_name in product_names:
        recommendations.append({'product_name': product_name})

    return recommendations


In [59]:
recommendations = recommend_products('Green Tea')
print(recommendations)


[{'product_name': 'Classic Green Tea'}, {'product_name': 'Green Tea with Mint'}, {'product_name': 'Organic Pure Green Tea'}]


In [34]:
response = client.embeddings.create(input='milk tea', model='text-embedding-3-small')
input_embedding = response.data[0].embedding
results = collection.query(query_embeddings=input_embedding, n_results=3)
print(results)
print(type(results))

{'ids': [['36207', '9844', '5459']], 'distances': [[0.9575251340866089, 1.0038809776306152, 1.0046604871749878]], 'metadatas': [[{'product_name': 'Mint & Honey Green Tea'}, {'product_name': 'Raspberry Iced Tea'}, {'product_name': 'Chamomile Tea'}]], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
<class 'dict'>


In [43]:
recommendations = []
metadatas = results['metadatas']

product_names = [item['product_name'] for sublist in metadatas for item in sublist]

for product_name in product_names:
    recommendations.append({'product_name': product_name})

print(recommendations)

['Mint & Honey Green Tea', 'Raspberry Iced Tea', 'Chamomile Tea']
[{'product_name': 'Mint & Honey Green Tea'}, {'product_name': 'Raspberry Iced Tea'}, {'product_name': 'Chamomile Tea'}]


## Basket Recommendation

In [64]:
system_prompt = "You are an AI assistant functioning \
as a recommendation system for ecommerce website. \
Be specific and limit your answers to the requested format. \
Keep your answers short and concise."

In [68]:
def get_user_prompt(ordered_list_of_items):

    prompt=None
    if len(ordered_list_of_items) > 0:
        items = ', '.join(ordered_list_of_items)
        prompt = f"A user bought the following items: {items}. What next 3 items would he/she be likely to purchase next?"
        prompt += "Express your response as a JSON object with a key of 'next_items and a value representing your array of recommended items."
    
    return prompt


In [72]:
# Retrieve user prompt
user_prompt = get_user_prompt(
    ['bread', 'coffee']
)
print(user_prompt)

A user bought the following items: bread, coffee. What next 3 items would he/she be likely to purchase next?Express your response as a JSON object with a key of 'next_items and a value representing your array of recommended items.


In [73]:
# Test Prompt
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
       {'role': 'system', 'content': system_prompt},
       {'role': 'user', 'content': user_prompt} 
    ],
)

message = response.choices[0].message.content
print(message)

```json
{
  "next_items": ["butter", "jam", "milk"]
}
```


In [74]:
next_items = ['match green tea', 'milk tea']

In [75]:
def recommend_products(next_items, top_n=5):
    # Get embedding for the input description
    response = client.embeddings.create(input=next_items, model='text-embedding-3-small')
    input_embedding = response.data[0].embedding

    # Query Chromadb for similar embeddings
    results = collection.query(query_embeddings=input_embedding, n_results=top_n)

    # Extract and return recommended products
    recommendations = []
    metadatas = results['metadatas']

    product_names = [item['product_name'] for sublist in metadatas for item in sublist]

    for product_name in product_names:
        recommendations.append({'product_name': product_name})

    return recommendations


In [76]:
next_items = ['match green tea', 'milk tea']
recommendations = recommend_products(next_items)
print(recommendations)

[{'product_name': 'Classic Green Tea'}, {'product_name': 'Organic Pure Green Tea'}, {'product_name': 'China Green Tips Green Tea'}, {'product_name': 'Green Tea with Mint'}, {'product_name': 'Mint & Honey Green Tea'}]
