# Product Classification with Embedding Approaches

## Overview

This research is a continuation of the categorization-kesler-onevsrest notebook. The focus of this one is to use the one-vs-rest algorithm which performed quite well to test out the performance with a few embedding approaches:
* text embedding
* image embedding
* text + image embedding

## Getting Started

Authenticate your notebook environment (Colab only)
If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using Vertex AI Workbench.

In [6]:
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

### Define Google Cloud project information (Colab only)

In [8]:
if "google.colab" in sys.modules:
    # Define project information
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
    LOCATION = "us-central1"  # @param {type:"string"}

    # Initialize Vertex AI
    import vertexai

    vertexai.init(project=PROJECT_ID, location=LOCATION)

### Install and Import Packages

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

## Data Preparation

In [20]:
%%bigquery df
SELECT *, CONCAT('Name: \n ', name, ' \n ', 
                 "Description: \n ", description, ' \n ',
                 "Labels: \n ", TO_JSON_STRING(vision_api_labels)
                ) as attr 
FROM solutions-2023-mar-107.mercari.13K_synthetic_attributes_embeddings 
WHERE name NOT IN (
    SELECT name FROM solutions-2023-mar-107.mercari.13K_synthetic_attributes_embeddings_golden_test
)
AND rand() < 1.0

Query is running:   0%|          |

Downloading:   0%|          |

In [21]:
df.head(3)

Unnamed: 0,id,name,description,brand_name,item_condition_name,c0_name,c1_name,c2_name,url,created,image_uri,vision_api_labels,attributes,scores,text_embedding,image_embedding,attr
0,m36156458954,Lululemon hooded zip up,"Size 6, light gray/light purple hooded. Great ...",lululemon athletica,Like new,Women,Sweaters,Hooded,https://www.mercari.com/us/item/m36156458954,2023-02-11 18:48:48+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Outerwea...","[Outerwear, Dress shirt, Product, Coat, Neck, ...","[0.9548111, 0.90878934, 0.90797353, 0.89374167...","[-0.0240250099, -0.00749946106, 0.0358330719, ...","[0.0248925369, -0.00216861023, 0.0498555452, 0...",Name: \n Lululemon hooded zip up \n Descriptio...
1,m49677820034,Tory Burch Flip Flops Crazy Logo Allover NWT,Tory Burch flip flops NWT size 9. Navy with re...,Tory Burch,New,Women,Shoes,Sandals,https://www.mercari.com/us/item/m49677820034,2023-03-25 21:07:40+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""Sleeve"",...","[Sleeve, Font, Jersey, Rectangle, Pattern, Ele...","[0.87241977, 0.77027929, 0.7640627, 0.72455895...","[-0.0101064555, -0.0298294984, 0.0125869382, -...","[-0.000829317723, 0.0507648326, 0.0393093489, ...",Name: \n Tory Burch Flip Flops Crazy Logo Allo...
2,m76798724941,O. Vianca Black Beige Cheetah Print Crop Top S...,O. Vianca Black Beige Cheetah Print Crop Top S...,,Good,Women,Tops & blouses,Blouse,https://www.mercari.com/us/item/m76798724941,2023-01-15 17:13:21+00:00,gs://genai-product-catalog/mercari_images_13K/...,"{""label_annotations"":[{""description"":""White"",""...","[White, One-piece garment, Clothes hanger, Nec...","[0.92186475, 0.89643246, 0.8930552, 0.87433636...","[-0.0152879842, -0.0348733254, -0.0182892475, ...","[0.00666147517, 0.0276971422, -0.00610722462, ...",Name: \n O. Vianca Black Beige Cheetah Print C...


In [22]:
%%bigquery category_df
select c0_name, c1_name, c2_name from `solutions-2023-mar-107.mercari.13K_synthetic_attributes_embeddings`
group by c0_name, c1_name, c2_name

Query is running:   0%|          |

Downloading:   0%|          |

In [23]:
def get_category_name(row):
    if row["c2_name"] is None:
        cat_name = row["c0_name"]+">"+row["c1_name"]+">"+"Other"
    else:
        cat_name = row["c0_name"]+">"+row["c1_name"]+">"+row["c2_name"]
    return cat_name

df["category"] = df.apply(lambda x: get_category_name(x), axis=1)

In [24]:
df["category"].head()

0                 Women>Sweaters>Hooded
1                   Women>Shoes>Sandals
2           Women>Tops & blouses>Blouse
3    Home>Home decor>Home decor accents
4                  Women>Shoes>Slip-Ons
Name: category, dtype: object

In [25]:
# Split the data into train and test sets

df["category"] = df["category"].fillna("None")
# embeddings have already been pre-populated
df["comb_embedding"] = df.apply(lambda x: x["text_embedding"].tolist()+x["image_embedding"].tolist(), axis=1)
df_train, df_test = train_test_split(df, test_size=0.25)

## Data Modeling

In [26]:
# algorithm

base_lr = LogisticRegression()
model_combemb = OneVsRestClassifier(base_lr)
model_textemb = OneVsRestClassifier(base_lr)
model_imageemb = OneVsRestClassifier(base_lr)

model_combemb.fit(df_train["comb_embedding"].tolist(), df_train["category"].tolist())
joblib.dump(model_combemb, 'model_combemb.pkl')

model_textemb.fit(df_train["text_embedding"].tolist(), df_train["category"].tolist())
joblib.dump(model_textemb, 'model_textemb.pkl')

model_imageemb.fit(df_train["image_embedding"].tolist(), df_train["category"].tolist())
joblib.dump(model_textemb, 'model_imageemb.pkl')



KeyboardInterrupt: 

In [None]:
df_test["predicted_category"] = model.predict(df_test["comb_embeddings"].tolist())

y_true = df_test[target_col].tolist()
y_pred = df_test["predicted_category"].tolist()

print("Accuracy: " + str(accuracy_score(df_test["predicted_category"], df_test[target_col])))
print("f1: " + str(f1_score(df_test["predicted_category"], df_test["category"],average='weighted')))

In [None]:
# hierarchical classification

def get_similarity_score(y_true, y_pred):
    
    score = 0
    
    y_true_ls = y_true.split(">")
    y_pred_ls = y_pred.split(">")
    if len(y_true_ls) != 3 or len(y_pred_ls) != 3:
        print("error: category does not have 3 levels")
        return None
    
    if y_true_ls[0] == y_pred_ls[0]:
        score += 1/3
    if y_true_ls[1] == y_pred_ls[1]:
        score += 1/3
    if y_true_ls[2] == y_pred_ls[2]:
        score += 1/3
    
    return score

df_test.apply(lambda x: get_similarity_score(x["category"],x["predicted_category"]))

### Appendix

In [11]:
df_manual_eval = df_test[df_test['predicted_category'] != df_test[target_col]]

In [14]:
df_manual_eval.to_csv("product_catalog_manual_eval.csv")

In [173]:
row=df_manual_eval.iloc[6]
description = row["description"]

In [76]:
prediction = clf2.predict_proba([df_manual_eval.iloc[6]["comb_embeddings"]])[0]
topn = sorted(range(len(prediction)), key=lambda i: prediction[i])[-5:]
classes = clf2.classes_
for i in topn:
    if i not in classes:
        try: 
            print(parents[parents[classes[i]]] + "->" +parents[classes[i]] + "->" + classes[i] )
        except:
            pass
    else:
        print(None)

Maternity->Dresses->Midi
Women->Skirts->Maxi
Maternity->Dresses->Above knee, mini
Maternity->Dresses->Knee-length


  if i not in classes:


In [77]:
parents["Tumbler"]

'Serving'

In [345]:
row=df_manual_eval.iloc[4]
description = (row["name"] + "\n\n" + row["description"]).replace("\"", " inch").lower()

description

'3ce multi eye color palette #overtake\n\nused only a few times'

In [346]:
%%bash -s "$description"

echo '{
   "instances": [ 
       { "text": "'"$1"'"  }
   ]
}' > instances.json

In [347]:
%%bash -s "$description"
echo $1

set -o xtrace

curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json; charset=utf-8" \
"https://us-central1-aiplatform.googleapis.com/v1/projects/solutions-2023-mar-107/locations/us-central1/publishers/google/models/multimodalembedding@001:predict" \
 -d @instances.json > embeddings.json

3ce multi eye color palette #overtake used only a few times


++ gcloud auth print-access-token
+ curl -X POST -H 'Authorization: Bearer ya29.c.b0Aaekm1LdgAiJPflsrryyJAXLkNygyU4fAcsm2Ds-_ow5ug_TXYUp3Z393S2dAw97Bs-qKVwKZEjWs92vmSD2mLpQA_s6cIhnki4Z03sC6O9Gqy4Mg51VXnS1vzRT8O90yr51Gvs2EKC56Gs5qJnCrUK7DGcHm_gXBMv4xEkhFLoXfRD3XOinXqkpWu-ttqy2KRIV8h4At88GkcL4Bh7dk-Oj3nktC0Vvw0SZ-GfNQjqVJgm1NOmLuGJSoXApRq5GvI5n3Q6OmAhdQfQtHHkP0RMkf2Pde-VvyflRqAA-uWXyPanHcS0OpUVjOia-TAleLgq9bzPyrApImX4EvHc5_o0eLMAOSr_3Ls46eXpYZgN371AR7sOcMqaF8X1sqBxXMOV2ybd3JM0S_SuwZlXbI-99O_S9u-toZpugaOX657uJ_5sbpOyXck5veYegQp7hpUu4Ft9t1j0bpFRR47f7MaV6OIFmfvsz45Vo9VFVyibI6M4zWFzxfy-I3y2R5if19pkFUhZylzxrMuhwBj7dubgFOlyiMRVjw4cUb5-s3dyyiz0Wph0Flnw9haQgs4FtMhcZX0fsVFs-egW7x1M8Mvkyqo48vtZFhVMF1W2B_356q4291_hIMxhoiO7dfdxjRv4fwFi10zlzslm9ts-b3rwsnBa3uJOuocF0RkniZ-nbys_ghf5Ma3O2JFrImjg1qc9feojt1F7Q0hZ0Z_mUfxRZ4Oc_Fd4rzMmzryZIOaj8JursVsOiOVYVnvI6fc72OOnlVFpi4Zl325BF4-b1lUf0Qm8OVfzeMq0IyR9dZ31bwcfzv_qqQ8jWs0pbnkcXg5nop09x7q3SJ00rZ7_VeyYjgZRwS4ulkUzfjbROjYirQzMcF8W8cQfhj17v_o4lmh0O0cj4g_98otcoQZi_

In [348]:
import json

with open('embeddings.json', 'r') as f:
  embedding = json.load(f)

emb=embedding['predictions'][0]['textEmbedding']

In [349]:
new_emb = emb+df_manual_eval.iloc[4]["image_embedding"].tolist()

prediction = clf2.predict_proba([new_emb])[0]
topn = sorted(range(len(prediction)), key=lambda i: prediction[i])[-5:]
classes = clf2.classes_
for i in topn:
    if i not in classes:
        try: 
            print(parents[parents[classes[i]]] + "->" +parents[classes[i]] + "->" + classes[i] )
        except:
            pass
    else:
        print(None)

Beauty->Skin care->Lips
Beauty->Skin care->Eyes
Beauty->Makeup->Face
Beauty->Makeup->Makeup palettes


  if i not in classes:
