## 1. Loading Dataset

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from openai import OpenAI
import time
import os
from dotenv import load_dotenv

In [12]:
# Read the product text/metadata table whose `full_text` column gets embedded below
embedding_csv_path = '../data/data_for_embedding.csv'
data_for_embedding = pd.read_csv(embedding_csv_path)

# Show the loaded frame as the cell output
data_for_embedding

Unnamed: 0,product_id,product_name,full_text,product_brand,gender,primary_color,color_group
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,dkny unisex black & grey printed medium trolle...,DKNY,Unisex,Black,Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,ethnovogue women beige & grey made to measure ...,EthnoVogue,Women,Beige,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,spykar women pink alexa super skinny fit high-...,SPYKAR,Women,Pink,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,raymond men blue self-design single-breasted b...,Raymond,Men,Blue,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,parx men brown & off-white slim fit printed ca...,Parx,Men,White,White
...,...,...,...,...,...,...,...
11592,10261595,Laceandme Black Lace Non-Wired Lightly Padded ...,laceandme black lace non-wired lightly padded ...,Laceandme,Women,Red,Red
11593,10262843,Pepe Jeans Men Black Hammock Slim Fit Low-Rise...,pepe jeans men black hammock slim fit low-rise...,Pepe Jeans,Men,Black,Black
11594,10261721,Mochi Women Gold-Toned Solid Heels,mochi women gold-toned solid heels. a pair of ...,Mochi,Women,Gold,Yellow
11595,10261607,612 league Girls Navy Blue & White Printed Reg...,612 league girls navy blue & white printed reg...,612 league,Girls,Blue,Blue


## 2. Generate embeddings using OpenAI

In [5]:
# Pull variables from the local .env file into the process environment
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Build the OpenAI client shared by the embedding cells
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [6]:
# Function to get embedding with retry
def get_embeddings_batch(texts, model="text-embedding-ada-002", max_retry=3, client=None):
    """Embed a batch of texts in one request, retrying with exponential backoff.

    Args:
        texts: List of strings to embed in a single API request.
        model: Name of the embedding model to request.
        max_retry: Maximum number of attempts before giving up.
        client: Object exposing ``embeddings.create(model=..., input=...)``.
            Defaults to the notebook-level ``openai_client``.

    Returns:
        One embedding per input text, in input order. If every attempt fails,
        a list of ``None`` with the same length as ``texts`` is returned so the
        caller can keep rows aligned and detect the failures afterwards.
    """
    if len(texts) == 0:
        # Nothing to embed: skip the request (and the retry/backoff cycle) entirely
        return []

    if client is None:
        client = openai_client

    for attempt in range(max_retry):
        try:
            response = client.embeddings.create(
                model=model,
                input=texts
            )
            # Each returned item carries the index of the input it belongs to;
            # sort on it so results line up with `texts` whatever the response order.
            ordered_items = sorted(response.data, key=lambda item: item.index)
            return [item.embedding for item in ordered_items]
        except Exception as e:
            print(f"Batch error: {e}, attempt {attempt+1}")
            # Back off 1s, 2s, 4s, ... between attempts, but do not sleep once
            # the last attempt has failed -- there is nothing left to wait for.
            if attempt < max_retry - 1:
                time.sleep(2 ** attempt)
    return [None] * len(texts)

In [7]:
# Embed every product description, sending the texts in fixed-size batches
batch_size = 50
texts = data_for_embedding['full_text'].tolist()
all_embeddings = []

# Ceiling division so a trailing partial batch is counted too
total_batches = -(-len(texts) // batch_size)

for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
    chunk = texts[start:start + batch_size]
    # Results are appended in batch order, keeping them aligned with `texts`
    all_embeddings.extend(get_embeddings_batch(chunk))
    print(f"Processed batch {batch_number} / {total_batches}")

Processed batch 1 / 232
Processed batch 2 / 232
Processed batch 3 / 232
Processed batch 4 / 232
Processed batch 5 / 232
Processed batch 6 / 232
Processed batch 7 / 232
Processed batch 8 / 232
Processed batch 9 / 232
Processed batch 10 / 232
Processed batch 11 / 232
Processed batch 12 / 232
Processed batch 13 / 232
Processed batch 14 / 232
Processed batch 15 / 232
Processed batch 16 / 232
Processed batch 17 / 232
Processed batch 18 / 232
Processed batch 19 / 232
Processed batch 20 / 232
Processed batch 21 / 232
Processed batch 22 / 232
Processed batch 23 / 232
Processed batch 24 / 232
Processed batch 25 / 232
Processed batch 26 / 232
Processed batch 27 / 232
Processed batch 28 / 232
Processed batch 29 / 232
Processed batch 30 / 232
Processed batch 31 / 232
Processed batch 32 / 232
Processed batch 33 / 232
Processed batch 34 / 232
Processed batch 35 / 232
Processed batch 36 / 232
Processed batch 37 / 232
Processed batch 38 / 232
Processed batch 39 / 232
Processed batch 40 / 232
Processed

In [8]:
# Attach the embeddings to the DataFrame as a new column (one entry per row)
data_for_embedding['embedding'] = all_embeddings

# Show the length of the first row's embedding. Only the last expression of a
# cell is displayed, so the bare `.iloc[0]` lookup that used to precede this
# line produced no output and has been dropped.
len(data_for_embedding['embedding'].iloc[0])


1536

## 3. Sanity check for embeddings

In [9]:
# Locate missing embeddings first: a batch that exhausted its retries leaves None
# entries, and len() / np.allclose() would raise TypeError on them before the
# null count below could ever be reported.
null_mask = data_for_embedding['embedding'].isna()
valid_embeddings = data_for_embedding.loc[~null_mask, 'embedding']

# Check embedding dimension (computed on the non-null rows only)
embedding_lengths = valid_embeddings.apply(len)
print(embedding_lengths.value_counts())

# Check for any None values
num_null_embeddings = null_mask.sum()
print(f"\nNumber of null embeddings: {num_null_embeddings}")

# Show the first 10 components of the embedding in row 0 (None if that row failed)
print("\nSample embedding from row 0:")
print(None if null_mask.iloc[0] else data_for_embedding['embedding'].iloc[0][:10])

# Check for all-zero embeddings (again on the non-null rows only)
num_all_zeros = valid_embeddings.apply(lambda x: np.allclose(x, 0)).sum()
print(f"\nNumber of embeddings that are all zeros: {num_all_zeros}")

embedding
1536    11597
Name: count, dtype: int64

Number of null embeddings: 0

Sample embedding from row 0:
[-0.003466130932793021, -0.0027112476527690887, 0.0174106415361166, -0.04953900724649429, -0.01087831798940897, 0.0036494359374046326, -0.028155647218227386, -0.024289578199386597, -0.00939188152551651, -0.026969164609909058]

Number of embeddings that are all zeros: 0


In [10]:
# Display the DataFrame again as the cell output to inspect it after the checks above
data_for_embedding

Unnamed: 0,product_id,product_name,full_text,product_brand,gender,primary_color,price_range,color_group,embedding
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,dkny unisex black & grey printed medium trolle...,DKNY,Unisex,Black,10000-20000,Black,"[-0.003466130932793021, -0.0027112476527690887..."
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,ethnovogue women beige & grey made to measure ...,EthnoVogue,Women,Beige,5000-10000,Beige,"[-0.014691106043756008, -0.023677324876189232,..."
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,spykar women pink alexa super skinny fit high-...,SPYKAR,Women,Pink,500-1000,Pink,"[-0.022698599845170975, -0.0005512937204912305..."
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,raymond men blue self-design single-breasted b...,Raymond,Men,Blue,5000-10000,Blue,"[-0.023501304909586906, -0.007294438313692808,..."
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,parx men brown & off-white slim fit printed ca...,Parx,Men,White,500-1000,White,"[-0.019166795536875725, -0.011621855199337006,..."
...,...,...,...,...,...,...,...,...,...
11592,10261595,Laceandme Black Lace Non-Wired Lightly Padded ...,laceandme black lace non-wired lightly padded ...,Laceandme,Women,Red,500-1000,Red,"[-0.029609959572553635, 0.005196043755859137, ..."
11593,10262843,Pepe Jeans Men Black Hammock Slim Fit Low-Rise...,pepe jeans men black hammock slim fit low-rise...,Pepe Jeans,Men,Black,1000-1500,Black,"[-0.026133066043257713, -0.019850818440318108,..."
11594,10261721,Mochi Women Gold-Toned Solid Heels,mochi women gold-toned solid heels. a pair of ...,Mochi,Women,Gold,1500-2000,Yellow,"[-0.007454282604157925, -0.021012337878346443,..."
11595,10261607,612 league Girls Navy Blue & White Printed Reg...,612 league girls navy blue & white printed reg...,612 league,Girls,Blue,500-1000,Blue,"[-0.011221136897802353, -0.007862606085836887,..."


## 4. Save embeddings to disk

In [11]:
# Persist the whole frame (metadata plus the embedding column) as a pickle file
output_path = '../data/embedding_with_metadata.pkl'
data_for_embedding.to_pickle(output_path)

print("\nEmbedding generation complete.")


Embedding generation complete.
