# Retail Recommendations via AI-Based Embeddings Vector DB
- Note: the dataset file "womens_clothing_e-commerce_reviews.csv" used below is not included, because this project was completed through an online course.

In [None]:
# Read the review dataset into a DataFrame
import pandas as pd

csv_path = "womens_clothing_e-commerce_reviews.csv"
reviews = pd.read_csv(csv_path)

# Preview the leading rows (in a notebook the bare expression is shown as cell output)
reviews.head()

In [None]:
# Cleaning the data: keep only rows whose 'Review Text' has real content

review_column = reviews['Review Text']
is_present = review_column.notna()                             # drops NaN
is_non_blank = review_column.astype(str).str.strip() != ''     # drops empty / whitespace-only strings
reviews = reviews[is_present & is_non_blank]

In [None]:
# Basic Embeddings Setup & OpenAI Call

import os
from openai import OpenAI

# Embedding model used for the requests in this notebook
model = "text-embedding-3-small"

# Read the key from the environment and hand it to the client explicitly,
# so the value fetched here is actually used.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Embed every cleaned review in a single request.
# NOTE(review): the embeddings endpoint limits how many inputs/tokens one
# request may carry - confirm the dataset fits, otherwise send it in batches.
review_text = reviews['Review Text'].tolist()

response = client.embeddings.create(
    model=model,
    input=review_text,
)

In [None]:
# Dimensionality Reduction (t-SNE) - for the sole purpose of Visualization
from sklearn.manifold import TSNE
import numpy as np

# One embedding vector per item in the API response
embeddings = [item.embedding for item in response.data]

# Convert to an array once; fit_transform takes it directly
embeddings_array = np.array(embeddings)

# Project to 2 components so the points can be drawn on a scatter plot.
# NOTE(review): scikit-learn requires perplexity to be smaller than the
# number of samples - confirm the dataset has more than 5 reviews.
tsne = TSNE(n_components=2, perplexity=5)
embeddings_2d = tsne.fit_transform(embeddings_array)

In [None]:
# Visualizing the embeddings

import matplotlib.pyplot as plt

# Split the 2-D projection into its two coordinate columns
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]

plt.scatter(x_coords, y_coords)
plt.show()

In [None]:
# Creating a "Create Embeddings" Function I can reuse:

def create_embeddings(texts):
    """Request embeddings for `texts` and return only the vectors.

    Uses the module-level `client` and `model`.

    texts: passed unchanged as the `input` of the embeddings request
    returns: list holding the 'embedding' value of every entry in the
             response's 'data' list, in the order the response lists them
    """
    raw_response = client.embeddings.create(input=texts, model=model)
    entries = raw_response.model_dump()['data']
    return [entry['embedding'] for entry in entries]

In [None]:
# Creating Topics for Classification
# Build a list of topic dicts (one 'label' each) - product quality, comfort, style, price, sustainability, customer service
topic_labels = ('quality', 'comfort', 'style', 'price', 'sustainability', 'customer service')
topics = [{'label': name} for name in topic_labels]

# Embed the label text of every topic
class_descriptions = [entry['label'] for entry in topics]
class_embeddings = create_embeddings(class_descriptions)

In [None]:
# Computing Cosine Distances for all embeddings

from scipy.spatial import distance

# Single Comparison Function
def find_closest(query_vector, embeddings):
    """Find the entry of `embeddings` with the smallest cosine distance to `query_vector`.

    query_vector: the vector to compare against
    embeddings: sequence of candidate vectors
    returns: dict with "distance" (the smallest cosine distance) and
             "index" (position of that candidate in `embeddings`);
             on ties the earliest candidate wins
    raises: ValueError when `embeddings` is empty (from min())
    """
    gaps = [distance.cosine(query_vector, candidate) for candidate in embeddings]
    # min() over positions keeps the first position among equal distances
    best = min(range(len(gaps)), key=lambda position: gaps[position])
    return {"distance": gaps[best], "index": best}

# Label every review embedding with the topic whose embedding is nearest
results = [
    topics[find_closest(review_vec, class_embeddings)["index"]]["label"]
    for review_vec in embeddings
]

In [None]:
# Shelling out Three Reviews for Each Category

def find_top_n_reviews(topic_vector, review_vectors, n=3):
    """Rank reviews by cosine distance to a topic and keep the nearest few.

    topic_vector: embedding to compare every review against
    review_vectors: sequence of review embeddings (one per review)
    n: maximum number of entries to return
    returns: list of {"index", "distance"} dicts, nearest first, where
             "index" is the review's position in `review_vectors`
    """
    scored = [
        {"index": position, "distance": distance.cosine(topic_vector, vec)}
        for position, vec in enumerate(review_vectors)
    ]
    # list.sort is stable, so equal distances keep their input order
    scored.sort(key=lambda entry: entry["distance"])
    return scored[:n]

# Print the three closest reviews for every topic.
# The review vectors live in `embeddings` (built from the API response);
# they are compared against each topic's embedding.
for topic, topic_vec in zip(topics, class_embeddings):
    topic_label = topic["label"]
    top_reviews = find_top_n_reviews(topic_vec, embeddings, n=3)

    print(f"\n Top 3 reviews for topic: {topic_label.upper()}")
    for rank, item in enumerate(top_reviews, start=1):
        # Distinct name so the `review_text` list built earlier is not overwritten
        matched_text = reviews.iloc[item["index"]]["Review Text"]
        print(f"{rank}. {matched_text.strip()}")

In [None]:
# Reusing the Above Function to Shell out Most Similar Reviews for a Given REVIEW

query_index = 0   # position of the review to find neighbours for
top_n = 3         # how many similar reviews to keep

# The query review is itself part of `embeddings` (distance ~0 to itself),
# so ask for one extra candidate and drop the query's own entry.
candidates = find_top_n_reviews(embeddings[query_index], embeddings, n=top_n + 1)
most_similar = [item for item in candidates if item["index"] != query_index][:top_n]

most_similar_reviews = [
    reviews.iloc[item["index"]]["Review Text"].strip()
    for item in most_similar
]