Using this notebook to explore the dataset I created using an LLM which contains features of the items such as: colour, fabric, price_category etc.. 

I'll be looking at create a vector database from this which can then be searched on for new items. Need to figure out what this looks like exactly.

I will also be looking at creating an additional "style" vector from the description maybe that captures the vibe of the item. This will be taken from the description and additional features column.

Looking at the data now I should I have created more clear options for the categories. Now I have to go through and do some cleaning.

Points that I should take into account with final draft of data cleaning using LLMS:
- Clear options for categories that it makes sense for: Category, Colour
- Use a separate column for colour shade: light, medium, dark
- Create a Category (e.g., clothing, footwear, accessories, homeware) and a Sub Catgeory: (Coats & Jackets, Hoodies, T-shirts, boots, sneakers, rings, hats)

In [3]:
import pandas as pd 
import numpy as np 

In [4]:
df = pd.read_csv('ProductStructuredInfo.csv', index_col=0)
df['currency'] = 'GBP'

## 1. Generating embeddings for unstructured text data

I'm gonna generate embeddings using CLIP for the following data:
- Some structured data -> specific category, primary_colour, secondary_colour, pattern, fabric, price_category, fit
- The descriptions -> additional features, description
- Key words

I'll start using the pre-trained CLIP embeddings, to see how well this works. I'll create the embeddings, aggregate them in a way that makes sense then pick a couple items of clothing and see which ones are close to it.

In [7]:
structured_data_for_embeddings = ['fit', 'specific_category', 'primary_colour', 'secondary_colour', 'pattern', 'fabric', 'price_category']

def get_structured_encoding_texts(row: pd.Series):

    structured_encoding_texts = []

    for column in structured_data_for_embeddings:
        if type(row[column]) == str:
            structured_encoding_texts.append(row[column])

    return structured_encoding_texts

def get_descriptive_encoding_texts(row: pd.Series):

    descriptive_encoding_texts = []

    descriptive_encoding_texts += [x for x in row['additional_features'].split(',') if type(x) == str]
    descriptive_encoding_texts += row['description']

    return descriptive_encoding_texts

def get_key_word_encoding_texts(row: pd.Series):

    return [x for x in row['key_words'].split(',') if type(x) == str]

In [37]:
import torch
import clip

device = "cpu"
i = 0

# Load the pre-trained CLIP model and its tokenizer.
model, preprocess = clip.load("ViT-B/32", device=device)

def get_item_embeddings(row: pd.Series):

    # Tokenize the descriptors using CLIP's tokenizer.
    structured_text = get_structured_encoding_texts(row)
    structured_text_tokens = clip.tokenize(structured_text).to(device)

    descriptive_text = get_structured_encoding_texts(row)
    descriptive_text_tokens = clip.tokenize(descriptive_text).to(device)

    key_word_text = get_structured_encoding_texts(row)
    key_word_text_tokens = clip.tokenize(key_word_text).to(device)

    # Generate embeddings for each descriptor.
    with torch.no_grad():
        structured_text_embeddings = model.encode_text(structured_text_tokens)
        descriptive_text_embeddings = model.encode_text(descriptive_text_tokens)
        key_word_text_embeddings = model.encode_text(key_word_text_tokens)

    # Aggregate embeddings by taking the mean (simple average).
    item_structured_info_embedding = structured_text_embeddings.mean(dim=0, keepdim=True)
    item_descriptive_info_embedding = descriptive_text_embeddings.mean(dim=0, keepdim=True)
    item_key_word_info_embedding = key_word_text_embeddings.mean(dim=0, keepdim=True)

    # Optionally, normalize the aggregated embedding.
    item_structured_info_embedding = item_structured_info_embedding / item_structured_info_embedding.norm(dim=-1, keepdim=True)
    item_descriptive_info_embedding = item_descriptive_info_embedding / item_descriptive_info_embedding.norm(dim=-1, keepdim=True)
    item_key_word_info_embedding = item_key_word_info_embedding / item_key_word_info_embedding.norm(dim=-1, keepdim=True)

    return item_structured_info_embedding, item_descriptive_info_embedding, item_key_word_info_embedding

def get_combined_embeddings(row: pd.Series, weights=None):

    item_structured_info_embedding, item_descriptive_info_embedding, item_key_word_info_embedding = get_item_embeddings(row)

    if weights is None:
        weights = [1/3, 1/3, 1/3]

    # Ensure weights sum to 1
    total_weight = sum(weights)

    if total_weight != 0:
        weights = [w / total_weight for w in weights]

    # Combine the embeddings using the specified weights.
    combined_embedding = (weights[0] * item_structured_info_embedding +
                          weights[1] * item_descriptive_info_embedding +
                          weights[2] * item_key_word_info_embedding)
    
    # Normalize the combined embedding.
    combined_embedding = combined_embedding / combined_embedding.norm(dim=-1, keepdim=True)

    return combined_embedding

In [38]:
# Compute embeddings for all items.
embeddings_list = [get_combined_embeddings(row) for _, row in df.iterrows()]

embeddings = torch.cat(embeddings_list, dim=0)  # Shape: (num_items, embedding_dim)

# Since embeddings are normalized, cosine similarity is just the dot product.
cosine_similarity_matrix = embeddings @ embeddings.T

# Replace the nan cosine similarity values with 0
cosine_similarity_matrix = torch.nan_to_num(cosine_similarity_matrix, nan=0.0)

In [24]:
top_k = 3  # Adjust this to see more similar items.
for i in [50, 100, 1000, 2000, 3000]:
    # Get similarity scores for the i-th item.
    sim_scores = cosine_similarity_matrix[i]
    # Sort indices in descending order of similarity.
    # Exclude the item itself (which will have a similarity of 1.0).
    similar_indices = torch.topk(sim_scores, top_k + 1).indices.cpu().numpy()
    similar_indices = [idx for idx in similar_indices if idx != i][:top_k]
    print(f"Item {i} (descriptors: {df.iloc[i]['image_urls'].split("'")[1]}) is similar to items: {[(idx, df.iloc[idx]['image_urls'].split("'")[1]) for idx in similar_indices]}")

Item 50 (descriptors: https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/2/0/20-09-2024-JW_V66186-P117_1_1.jpg) is similar to items: [(5187, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/2/6/26-04-2024-JC_1104268648_1_1.jpg'), (12344, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/0/2/02-08-2024-BLR_782943770001_1_1.jpg'), (6241, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/1/1/11-11-2022_LLx_541580_1_1.jpg')]
Item 100 (descriptors: https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/2/8/28-08-2024-JC_001CSK801007M-GRN_1_1.jpg) is similar to items: [(5793, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/0/9/09-08-23-JF_GMP01220-P001203-35863_1_1.jpg'), (4548, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prod

In [25]:
# Compute embeddings for all items. Only looking at the descriptive embeddings and the key word embeddings.
embeddings_list = [get_combined_embeddings(row, weights=[0, 0.5, 0.5]) for _, row in df.iterrows()]

embeddings = torch.cat(embeddings_list, dim=0)  # Shape: (num_items, embedding_dim)

# Since embeddings are normalized, cosine similarity is just the dot product.
cosine_similarity_matrix = embeddings @ embeddings.T

# Replace the nan cosine similarity values with 0
cosine_similarity_matrix = torch.nan_to_num(cosine_similarity_matrix, nan=0.0)

In [26]:
top_k = 3  # Adjust this to see more similar items.
for i in [50, 100, 1000, 2000, 3000]:
    # Get similarity scores for the i-th item.
    sim_scores = cosine_similarity_matrix[i]
    # Sort indices in descending order of similarity.
    # Exclude the item itself (which will have a similarity of 1.0).
    similar_indices = torch.topk(sim_scores, top_k + 1).indices.cpu().numpy()
    similar_indices = [idx for idx in similar_indices if idx != i][:top_k]
    print(f"Item {i} (descriptors: {df.iloc[i]['image_urls'].split("'")[1]}) is similar to items: {[(idx, df.iloc[idx]['image_urls'].split("'")[1]) for idx in similar_indices]}")

Item 50 (descriptors: https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/2/0/20-09-2024-JW_V66186-P117_1_1.jpg) is similar to items: [(5187, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/2/6/26-04-2024-JC_1104268648_1_1.jpg'), (12344, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/0/2/02-08-2024-BLR_782943770001_1_1.jpg'), (6241, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/1/1/11-11-2022_LLx_541580_1_1.jpg')]
Item 100 (descriptors: https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/2/8/28-08-2024-JC_001CSK801007M-GRN_1_1.jpg) is similar to items: [(5793, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/0/9/09-08-23-JF_GMP01220-P001203-35863_1_1.jpg'), (4548, 'https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prod

In [40]:
embeddings_list_structured = [get_combined_embeddings(row, weights=[1, 0, 0]) for _, row in df.iterrows()]
embeddings_list_descriptive = [get_combined_embeddings(row, weights=[0, 1, 0]) for _, row in df.iterrows()]
embeddings_list_keyword = [get_combined_embeddings(row, weights=[0, 0, 1]) for _, row in df.iterrows()]

df['structured_embeddings'] = [embeddings_list_structured[i].cpu().numpy() for i in range(len(embeddings_list_structured))]
df['descriptive_embeddings'] = [embeddings_list_descriptive[i].cpu().numpy() for i in range(len(embeddings_list_descriptive))]
df['keyword_embeddings'] = [embeddings_list_keyword[i].cpu().numpy() for i in range(len(embeddings_list_keyword))]

df.to_csv('ProductStructuredInfoWithEmbeddings.csv', index=False)

In [28]:
embeddings_list_all = [get_combined_embeddings(row) for _, row in df.iterrows()]
embeddings_list_split_descriptive_keyword = [get_combined_embeddings(row, weights=[0, 0.5, 0.5]) for _, row in df.iterrows()]
embeddings_list_descriptive = [get_combined_embeddings(row, weights=[0, 1, 0]) for _, row in df.iterrows()]
embeddings_list_keyword = [get_combined_embeddings(row, weights=[0, 0, 1]) for _, row in df.iterrows()]

In [30]:
embeddings_list_all[0]

tensor([[ 6.8361e-03, -7.3256e-03, -6.2675e-04,  2.3367e-02,  8.4577e-03,
         -1.2771e-02, -1.5162e-02, -1.2694e-01,  2.1795e-02,  8.4503e-03,
         -5.5618e-03, -1.8069e-02, -2.3803e-02, -8.9944e-03, -1.6894e-03,
          2.1849e-02,  2.7212e-02,  7.1085e-03,  1.2457e-03,  7.0266e-03,
          1.4086e-02, -1.0944e-02,  1.2900e-02,  4.0064e-03, -2.2470e-02,
         -7.8123e-04, -2.0724e-02, -2.2986e-03,  4.3670e-03,  1.1829e-02,
         -4.5569e-03, -6.4352e-03,  2.2341e-03,  3.0442e-02, -5.1545e-03,
          1.0208e-02,  7.1234e-03,  1.6434e-02, -4.0992e-04,  5.3224e-03,
         -1.9843e-02, -1.4645e-02,  4.0405e-03, -1.6330e-02,  2.0935e-02,
          1.7120e-02, -8.6162e-03, -3.6416e-03,  1.7573e-02,  1.9455e-03,
          2.1454e-03, -2.2090e-02,  8.9318e-03, -6.2192e-03,  7.1635e-03,
         -1.6911e-02, -2.7450e-03,  2.5696e-03, -2.9815e-02,  1.3900e-02,
          6.0852e-03, -3.1406e-02,  1.3249e-02, -1.2197e-02, -3.0038e-03,
         -1.4968e-02, -1.5483e-02, -3.

In [36]:
embeddings_list_descriptive[0] @ embeddings_list_keyword[0].T

tensor([[1.]])

In [39]:
def get_combined_embeddings(row: pd.Series, weights=None):

    item_structured_info_embedding, item_descriptive_info_embedding, item_key_word_info_embedding = get_item_embeddings(row)

    if weights is None:
        weights = [1/3, 1/3, 1/3]

    # Ensure weights sum to 1
    total_weight = sum(weights)

    if total_weight != 0:
        weights = [w / total_weight for w in weights]

    

    # Combine the embeddings using the specified weights.
    combined_embedding = (weights[0] * item_structured_info_embedding +
                          weights[1] * item_descriptive_info_embedding +
                          weights[2] * item_key_word_info_embedding)
    
    # Normalize the combined embedding.
    combined_embedding = combined_embedding / combined_embedding.norm(dim=-1, keepdim=True)

    return combined_embedding

In [47]:
df = pd.read_csv('ImageEmbeddings.csv', header=None)

In [48]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.021941,0.007341,0.01245,-0.006881,0.002544,0.027262,-0.041029,0.035903,0.056407,0.022258,...,-0.009007,-0.023353,0.010537,-0.012616,-0.035318,0.004427,-0.008665,0.040466,-0.000854,0.04145
1,0.009438,0.016171,0.000421,0.034141,-0.036617,0.013452,-0.042607,0.002757,0.060865,-0.00428,...,-0.031366,-0.021499,0.003531,-0.013308,-0.027678,-0.005887,-0.002814,0.073143,-0.007298,0.06392
2,0.021674,0.027032,0.018431,-0.002317,-0.002001,0.007704,-0.054976,0.037239,0.068838,0.010605,...,-0.026102,0.001598,0.044905,-0.0073,-0.027224,-0.012658,-0.018067,0.056675,0.020806,0.050511
3,-0.000614,0.041346,0.016351,-0.003159,-0.028911,-0.019515,-0.049184,-0.006748,0.053963,-0.02285,...,-0.028399,-0.009819,0.003378,-0.029098,-0.011516,-0.006054,-0.003234,0.05802,-0.000327,0.038571
4,0.007424,0.035297,0.032492,0.00785,-0.008046,0.009427,-0.073587,0.020023,0.058955,0.005529,...,-0.014333,-0.001263,0.041731,-0.009086,-0.024412,-0.01087,-0.007801,0.073651,0.003466,0.064285
5,0.019975,0.035504,0.017329,-0.009869,-0.003345,0.022369,-0.069728,0.043092,0.055619,0.015073,...,-0.046293,0.006483,0.043155,-0.002985,-0.028547,0.010435,-0.018724,0.062319,-0.00494,0.048564
6,0.007601,0.019003,0.021572,0.014649,-0.010884,0.010585,-0.054022,0.038994,0.031059,0.004289,...,-0.020539,0.005201,0.036903,-0.008517,-0.025109,0.008413,-0.005523,0.047735,0.021587,0.060495
7,-0.023843,-0.005564,0.035614,-0.005475,-0.037241,0.001322,-0.019514,0.006147,0.032842,0.010656,...,-0.052601,-0.009565,-0.006058,-0.018826,-0.031022,0.004995,0.003231,0.068595,0.023699,0.029575
8,-0.008593,-0.006366,0.020832,0.014902,-0.021305,-0.006184,-0.016481,0.031014,0.04314,0.004957,...,-0.056304,-0.009199,0.009224,-0.026027,-0.042306,0.004781,0.003418,0.074225,0.019379,0.021653
9,-0.005505,0.028986,-0.000938,0.013149,-0.044339,0.013578,-0.036148,0.04771,0.043861,0.030804,...,-0.036777,0.016647,0.034929,-0.014275,-0.041193,-0.007958,-0.009339,0.051277,0.020266,0.045975
