## Product Classification with LLM + RAG + Few Shot Prompting

## Overview

This notebook builds on the classifiers trained in the notebooks "categorization_embedding_approaches.ipynb" and "categorization_kesler_onevsrest.ipynb" and adds LLM capabilities on top of them. The goal is a RAG-style implementation: the candidate categories generated by the traditional ML models are inserted into an LLM prompt, and the LLM is asked in a one-shot prompt to pick the final category, with the aim of obtaining better results.

## Data Preparation

In [6]:
%%bigquery category_df
-- One row per distinct (c0_name, c1_name, c2_name) combination found in the table.
SELECT DISTINCT c0_name, c1_name, c2_name
FROM `solutions-2023-mar-107.mercari.13K_synthetic_attributes_embeddings`

Query is running:   0%|          |

Downloading:   0%|          |

In [43]:
%%bigquery df
-- Loads every column of the items whose id appears in the golden-test table with
-- manual_validation = 1, plus a derived text column attr built from the name,
-- the description and the JSON-serialised vision_api_labels.
-- NOTE(review): CONCAT returns NULL when any argument is NULL, so a row with a
-- missing name or description would get a NULL attr -- confirm this is acceptable.
SELECT *, CONCAT('Name: \n ', name, ' \n ', 
                 "Description: \n ", description, ' \n ',
                 "Labels: \n ", TO_JSON_STRING(vision_api_labels)
                ) as attr 
FROM solutions-2023-mar-107.mercari.13K_synthetic_attributes_embeddings 
WHERE id IN (
    SELECT id FROM solutions-2023-mar-107.mercari.13K_synthetic_attributes_embeddings_golden_test
    WHERE manual_validation = 1
)
-- With a threshold of 1.0 this filter keeps every row; presumably a sampling knob
-- (a lower threshold would keep a random subset).
AND rand() < 1.0

Query is running:   0%|          |

Downloading:   0%|          |

In [44]:
# Row count of the frame loaded by the query above.
len(df)

116

In [45]:
# Preview the first three rows, including the derived attr column.
df.head(3)

Unnamed: 0,id,name,description,brand_name,item_condition_name,c0_name,c1_name,c2_name,url,created,image_uri,vision_api_labels,attributes,scores,text_embedding,image_embedding,attr
0,m14193490298,Wooden Magnet/ Dot Art Acrylic Paint,Handfree painted mandalas in a thin wooden dis...,Handmade,New,Home,Artwork,Paintings,https://www.mercari.com/us/item/m14193490298,2023-02-06 04:48:31+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Book"",""m...","[Book, Art, Publication, Creative arts, Materi...","[0.87862146, 0.83368653, 0.81959659, 0.8156618...","[0.00826694351, 0.00405315682, 0.0570168681, -...","[0.0284959301, 0.0240893103, 0.017579874, -0.0...",Name: \n Wooden Magnet/ Dot Art Acrylic Paint ...
1,m74667116621,POSTER PRINT: FUNHOUSE,ALL POSTER PRINTS ARE 11 X 17 INCHES (( GREAT ...,,New,Home,Artwork,Posters,https://www.mercari.com/us/item/m74667116621,2023-01-25 07:01:14+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Poster"",...","[Poster, Publication, Font, Art, Book cover, B...","[0.86772853, 0.83406168, 0.81918359, 0.6839227...","[-0.0161228031, -0.0427463576, -0.0123792794, ...","[-0.0586416498, 0.0408442616, -0.012638161, -0...",Name: \n POSTER PRINT: FUNHOUSE \n Description...
2,m21554068673,Athleta Elation Purple Velvet High Rise Tight ...,Athleta Elation Purple Blue Velvet Tight Leggi...,Athleta,Like new,Women,Athletic apparel,Athletic Leggings,https://www.mercari.com/us/item/m21554068673,2023-03-27 17:33:50+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Arm"",""mi...","[Arm, Shoulder, yoga pant, Leg, Active pants, ...","[0.94758928, 0.93993074, 0.93505448, 0.9231663...","[0.0108732795, -0.0630845651, 0.036399167, -0....","[0.0493304357, -0.0114150364, 0.0233411063, 0....",Name: \n Athleta Elation Purple Velvet High Ri...


In [46]:
def get_category_name(row):
    """Build the "c0>c1>c2" category path for one product row.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            keys "c0_name", "c1_name" and "c2_name".

    Returns:
        str: The three levels joined with ">". A missing third level
        (None, NaN or pd.NA) is replaced by "Other".
    """
    # Local import: pandas is not imported at module level in the visible cells.
    import pandas as pd

    leaf = row["c2_name"]
    # A missing value can surface as None, NaN or pd.NA depending on how the
    # frame was loaded; `is None` only catches the first, pd.isna covers all.
    if pd.isna(leaf):
        leaf = "Other"
    return row["c0_name"] + ">" + row["c1_name"] + ">" + leaf

# Row-wise: get_category_name receives each row and returns its full category path.
df["category"] = df.apply(get_category_name, axis=1)

### Import ML Models

In [47]:
import joblib # (if needed)
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
# The paths are relative, so the .pkl files must sit in the working directory.
model_one = joblib.load('model_textemb.pkl')
model_two = joblib.load('model_imageemb.pkl')
model_three = joblib.load('model_combemb.pkl')
# Name of the DataFrame column fed to each model (embed_one -> model_one, etc.)
# when candidates are generated further down.
embed_one = "text_embedding"
embed_two = "image_embedding"
embed_three = "comb_embedding"

In [48]:
def _concat_embeddings(row):
    """Return the row's text embedding followed by its image embedding as one list."""
    text_part = row["text_embedding"].tolist()
    image_part = row["image_embedding"].tolist()
    return text_part + image_part


df["comb_embedding"] = df.apply(_concat_embeddings, axis=1)

## LLM RAG Prompting

In [49]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel
import time

# Intended number of top categories to take from each classifier.
# NOTE(review): confirm the candidate-generation code actually reads this value.
ncandidates = 3

vertexai.init(project="solutions-2023-mar-107", location="us-central1")
# Keyword arguments passed to the text model's predict() call for every request.
parameters = {
    # NOTE(review): the prompt asks for a category line plus an explanation;
    # presumably a 30-token cap can cut the explanation short -- confirm.
    "max_output_tokens": 30,
    "temperature": 0.1,
    "top_p": 0.8,
    "top_k": 40
}

def generate_candidates(ln, n=None):
    """Collect candidate categories for one product from the three classifiers.

    Each classifier scores the row's matching embedding column with
    ``predict_proba``; the ``n`` highest-probability classes of every model
    are gathered into a single list.

    Args:
        ln: Row-like object exposing the embedding columns named by
            ``embed_one``, ``embed_two`` and ``embed_three``.
        n: Number of top classes to keep per model. Defaults to the
            notebook-level ``ncandidates`` setting.

    Returns:
        list: Category labels with duplicates removed, in first-seen order.
        Within each model the labels run from the lowest to the highest
        probability among its top ``n``.
    """
    if n is None:
        n = ncandidates

    candidates = []
    for model, column in (
        (model_one, embed_one),
        (model_two, embed_two),
        (model_three, embed_three),
    ):
        probabilities = model.predict_proba([ln[column]])[0]
        ranked = sorted(range(len(probabilities)), key=lambda i: probabilities[i])
        # ranked[-0:] would return the whole list, so guard non-positive n.
        top = ranked[-n:] if n > 0 else []
        candidates.extend(model.classes_[i] for i in top)

    # The three models can nominate the same category; dict.fromkeys drops
    # the repeats while keeping the original order.
    return list(dict.fromkeys(candidates))
    
    
def _text_or_empty(value):
    """Return ``value`` as a string, mapping None and NaN to an empty string."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _build_prompt(line, cand):
    """Assemble the classification prompt from the product text and the candidates."""
    pieces = [
        "Based on the following description of a product can you identify which product category it belongs to from the candidates of categories: \n\n",
        "\n\n\n",
        "DESCRIPTION\n",
        _text_or_empty(line["name"]),
        # Separator so the product name does not run straight into the description.
        "\n",
        _text_or_empty(line["description"]),
        "\n\n",
        "PRODUCT CATEGORY CANDIDATES: \n\n",
        "\n".join(str(c) for c in cand),
        "\n\n",
        "INSTRUCTIONS:",
        "On the first line only respond with one of the category names from the PRODUCT CATEGORY CANDIDATES list, do not make up your own category. ",
        "On the second line please explain why you think that is the right category based on the description ",
        "\n\n",
        "ANSWER:",
        "The category name is: ",
    ]
    return "".join(pieces)


def _get_text_model():
    """Instantiate the text-bison model on first use and reuse it afterwards."""
    model = getattr(_get_text_model, "_model", None)
    if model is None:
        model = TextGenerationModel.from_pretrained("text-bison@001")
        _get_text_model._model = model
    return model


def _parse_response(text):
    """Split the model reply into [category, explanation], tolerating short replies."""
    lines = [part.strip() for part in (text or "").splitlines() if part.strip()]
    category = lines[0] if lines else ""
    explanation = lines[1] if len(lines) > 1 else ""
    return [category, explanation]


def get_one_candidate(line):
    """Ask the LLM to pick one category for a product row.

    Candidate categories come from ``generate_candidates``; they are placed
    in a prompt together with the product's name and description, and the
    reply is split into a category line and an explanation line.

    Args:
        line: Row-like object exposing "name", "description" and the
            embedding columns that ``generate_candidates`` reads.

    Returns:
        list: ``[category, explanation]``, both stripped of surrounding
        whitespace. Either element is an empty string when the reply does
        not contain that line.
    """
    # Pause before each request (presumably to stay under API rate limits).
    time.sleep(1)

    cand = generate_candidates(line)
    query = _build_prompt(line, cand)
    response = _get_text_model().predict(
        query,
        **parameters
    )
    return _parse_response(response.text)



In [50]:
# Smoke-test the prompt on a single row before running it on the whole dataset.

ind = 5
row = df.iloc[ind]
candidate = get_one_candidate(row)
print(f"\n\n predicted category: {candidate[0]}")
print(f"\n\n why? {candidate[1]}")

print(f"\n\n actual category: {row['category']}")



 predicted category: Women>Athletic apparel>Athletic Leggings


 why?  The product is a pair of Nike Sculpt hyper leggings. It is a type of athletic leggings.


 actual category: Women>Athletic apparel>Athletic Leggings


### Run prompt on dataset

In [51]:
# Runs get_one_candidate on every row and keeps element [0] of its result (the category line).
# Each call sleeps for one second and sends one request to the LLM, so this cell is slow.
df["predicted_category"] = df.apply(lambda x : get_one_candidate(x)[0], axis=1)

In [52]:
# Preview again to compare the new predicted_category column with category.
df.head(3)

Unnamed: 0,id,name,description,brand_name,item_condition_name,c0_name,c1_name,c2_name,url,created,image_uri,vision_api_labels,attributes,scores,text_embedding,image_embedding,attr,category,comb_embedding,predicted_category
0,m14193490298,Wooden Magnet/ Dot Art Acrylic Paint,Handfree painted mandalas in a thin wooden dis...,Handmade,New,Home,Artwork,Paintings,https://www.mercari.com/us/item/m14193490298,2023-02-06 04:48:31+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Book"",""m...","[Book, Art, Publication, Creative arts, Materi...","[0.87862146, 0.83368653, 0.81959659, 0.8156618...","[0.00826694351, 0.00405315682, 0.0570168681, -...","[0.0284959301, 0.0240893103, 0.017579874, -0.0...",Name: \n Wooden Magnet/ Dot Art Acrylic Paint ...,Home>Artwork>Paintings,"[0.00826694351, 0.00405315682, 0.0570168681, -...",Home>Home decor>Home decor accents
1,m74667116621,POSTER PRINT: FUNHOUSE,ALL POSTER PRINTS ARE 11 X 17 INCHES (( GREAT ...,,New,Home,Artwork,Posters,https://www.mercari.com/us/item/m74667116621,2023-01-25 07:01:14+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Poster"",...","[Poster, Publication, Font, Art, Book cover, B...","[0.86772853, 0.83406168, 0.81918359, 0.6839227...","[-0.0161228031, -0.0427463576, -0.0123792794, ...","[-0.0586416498, 0.0408442616, -0.012638161, -0...",Name: \n POSTER PRINT: FUNHOUSE \n Description...,Home>Artwork>Posters,"[-0.0161228031, -0.0427463576, -0.0123792794, ...",Men>Tops>T-shirts
2,m21554068673,Athleta Elation Purple Velvet High Rise Tight ...,Athleta Elation Purple Blue Velvet Tight Leggi...,Athleta,Like new,Women,Athletic apparel,Athletic Leggings,https://www.mercari.com/us/item/m21554068673,2023-03-27 17:33:50+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Arm"",""mi...","[Arm, Shoulder, yoga pant, Leg, Active pants, ...","[0.94758928, 0.93993074, 0.93505448, 0.9231663...","[0.0108732795, -0.0630845651, 0.036399167, -0....","[0.0493304357, -0.0114150364, 0.0233411063, 0....",Name: \n Athleta Elation Purple Velvet High Ri...,Women>Athletic apparel>Athletic Leggings,"[0.0108732795, -0.0630845651, 0.036399167, -0....",Women>Athletic apparel>Athletic Leggings


## Evaluation

In [60]:
# hierarchical classification

import numpy as np

def get_hierarchical_score(y_pred, y_true):
    """Mean per-item agreement between two sequences of "c0>c1>c2" paths.

    Every pair is split on ">" and scored 1/3 for each of the three levels
    that is equal. Levels are compared independently, so a matching second
    level still counts when the first level differs.

    Args:
        y_pred: Sized iterable of predicted category strings (list, numpy
            array or pandas Series).
        y_true: Sized iterable of reference category strings of the same
            length.

    Returns:
        The mean score in [0, 1]; NaN when both inputs are empty; None
        (after printing an error) when the lengths differ.
    """
    if len(y_pred) != len(y_true):
        print("error: y_pred and y_true should have the same length")
        return None

    def get_score(first_cat, second_cat):
        first_ls = first_cat.split(">")
        second_ls = second_cat.split(">")
        if len(first_ls) != 3 or len(second_ls) != 3:
            print("error: category does not have 3 levels")
            return 0
        matching_levels = sum(a == b for a, b in zip(first_ls, second_ls))
        return matching_levels / 3

    # zip pairs items by position. Indexing with y_pred[i] is label-based on a
    # pandas Series and breaks once the index is not 0..n-1 (e.g. after filtering).
    scores = [get_score(pred, true) for pred, true in zip(y_pred, y_true)]
    if not scores:
        # np.mean([]) also yields NaN but emits a RuntimeWarning.
        return np.nan
    return np.mean(scores)

# test one item
# Sanity check of the metric on the first five rows. It scores model_three's own
# predictions on the combined embedding (not the LLM's) against the category column.
# The category column is passed in the y_pred position; each level is compared by
# equality, so swapping the two arguments does not change the score.
x = df.iloc[0:5]
predicted = model_three.predict(x["comb_embedding"].tolist())
get_hierarchical_score(x["category"].tolist(),predicted)

0.3333333333333333

In [62]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Compare the LLM predictions with the category column over every row of df.
# scikit-learn metrics take (y_true, y_pred); get_hierarchical_score takes (y_pred, y_true).

print("Hierarchical Score: " + str(get_hierarchical_score(df["predicted_category"], df["category"])))
print("Accuracy: " + str(accuracy_score(df["category"], df["predicted_category"])))
print("f1: " + str(f1_score(df["category"], df["predicted_category"], average='weighted')))

Hierarchical Score: 0.6149425287356322
Accuracy: 0.3793103448275862
f1: 0.27590596266184947
