## Testing

Small scale testing to make sure the models work

In [62]:
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]

theme = "weather"

In [12]:
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """
    Calculate the cosine similarity between two vectors.

    :param vec_a: First vector.
    :param vec_b: Second vector.
    :return: Cosine similarity.
    """
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)

In [13]:
from sentence_transformers import SentenceTransformer

model_e5_b = SentenceTransformer("intfloat/multilingual-e5-base")

embeddings_e5_b = model_e5_b.encode(sentences)
embeddings_theme_e5_b = model_e5_b.encode(theme)
similarities_e5_b = model_e5_b.similarity(embeddings_e5_b, embeddings_theme_e5_b)


# [3, 3]

In [14]:
from sentence_transformers import SentenceTransformer

model_m3 = SentenceTransformer("BAAI/bge-m3")

embeddings_m3 = model_m3.encode(sentences)
embeddings_theme_m3 = model_m3.encode(theme)
similarities_m3 = model_m3.similarity(embeddings_m3, embeddings_theme_m3)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
from sentence_transformers import SentenceTransformer

model_e5_l = SentenceTransformer("intfloat/multilingual-e5-large")

embeddings_e5_l = model_e5_l.encode(sentences)
embeddings_theme_e5_l = model_e5_l.encode(theme)
similarities_e5_l = model_e5_l.similarity(embeddings_e5_l, embeddings_theme_e5_l)

In [16]:
from openai import OpenAI

client = OpenAI(api_key= "x")

embedding_openai = []

for sentence in sentences:
    themed_sentence = sentence
    response = client.embeddings.create(
        input=themed_sentence,
        model="text-embedding-3-large"
    )
    embedding_openai.append(response.data[0].embedding)

response = client.embeddings.create(
        input=theme,
        model="text-embedding-3-large"
    )
embedding_theme_openai = response.data[0].embedding
similarities_openai = []

for embedding in embedding_openai:
    similarities_openai.append(cosine_similarity(embedding, embedding_theme_openai))


In [17]:
for i in range(3):
    for j in range(1):
        print(f"Similarity m3 between '{sentences[i]}' and '{theme}': {similarities_m3[i, j].item()}")
for i in range(3):
    for j in range(1):
        print(f"Similarity e5_l between '{sentences[i]}' and '{theme}': {similarities_e5_l[i, j].item()}")
for i in range(3):
    for j in range(1):
        print(f"Similarity e5_b between '{sentences[i]}' and '{theme}': {similarities_e5_b[i, j].item()}")
for i in range(3):
    for j in range(1):
        print(f"Similarity openai between '{sentences[i]}' and '{theme}': {similarities_openai[i].item()}")

Similarity m3 between 'The weather is lovely today.' and 'weather': 0.7082294225692749
Similarity m3 between 'It's so sunny outside!' and 'weather': 0.6439048647880554
Similarity m3 between 'He drove to the stadium.' and 'weather': 0.5115762948989868
Similarity e5_l between 'The weather is lovely today.' and 'weather': 0.8403042554855347
Similarity e5_l between 'It's so sunny outside!' and 'weather': 0.8034236431121826
Similarity e5_l between 'He drove to the stadium.' and 'weather': 0.7406305074691772
Similarity e5_b between 'The weather is lovely today.' and 'weather': 0.8751693964004517
Similarity e5_b between 'It's so sunny outside!' and 'weather': 0.8215470314025879
Similarity e5_b between 'He drove to the stadium.' and 'weather': 0.7676297426223755
Similarity openai between 'The weather is lovely today.' and 'weather': 0.4374969161875635
Similarity openai between 'It's so sunny outside!' and 'weather': 0.3117060202206963
Similarity openai between 'He drove to the stadium.' and 'w

# Comparison

## Preprocessing


In [18]:
import pandas as pd

input_path = '../../data/train.csv'
data = pd.read_csv(input_path)
df_reviews = data[["review_body", "language", "product_category"]]
df_categories = data[["product_category"]]

sample_categories = df_categories.drop_duplicates().reset_index(drop=True)


languages = ['de', 'en', 'es', 'fr', 'ja', 'zh']
sampled_data = []

for lang in languages:
    lang_data = df_reviews[df_reviews['language'] == lang].sample(n=1000, random_state=42)
    sampled_data.append(lang_data)

sampled_data = pd.concat(sampled_data)

In [19]:
lan_counts = sampled_data["language"].value_counts()
print(lan_counts)
cat_counts = sample_categories["product_category"].value_counts()
print(cat_counts)

language
de    1000
en    1000
es    1000
fr    1000
ja    1000
zh    1000
Name: count, dtype: int64
product_category
sports                      1
kitchen                     1
industrial_supplies         1
camera                      1
jewelry                     1
luggage                     1
watch                       1
other                       1
musical_instruments         1
digital_video_download      1
pet_products                1
video_games                 1
apparel                     1
office_product              1
personal_care_appliances    1
digital_ebook_purchase      1
home_improvement            1
beauty                      1
furniture                   1
automotive                  1
shoes                       1
pc                          1
wireless                    1
home                        1
book                        1
electronics                 1
lawn_and_garden             1
baby_product                1
toy                         1
drugstore   

## Embedding

In [64]:
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import time

models = {
    "e5_base": SentenceTransformer("intfloat/multilingual-e5-base"),
    "bge-m3": SentenceTransformer("BAAI/bge-m3"),
    "e5-large": SentenceTransformer("intfloat/multilingual-e5-large")
}

def apply_model_to_df(model, df, column_name):
    embeddings = model.encode(df[column_name].tolist())
    return embeddings.tolist()

for model_name, model in tqdm(models.items(), desc="Processing models"):
    start_time = time.time()
    sampled_data[f'{model_name}_reviews'] = apply_model_to_df(model, sampled_data, 'review_body')
    sample_categories[f'{model_name}_categories'] = apply_model_to_df(model, sample_categories, 'product_category')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"{model_name} took {elapsed_time:.2f} seconds to process.\n")



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing models:  33%|███▎      | 1/3 [00:31<01:02, 31.26s/it]

e5_base took 31.26 seconds to process.



Processing models:  67%|██████▋   | 2/3 [02:09<01:10, 70.46s/it]

bge-m3 took 97.90 seconds to process.



Processing models: 100%|██████████| 3/3 [03:43<00:00, 74.35s/it]

e5-large took 93.88 seconds to process.






In [30]:
sampled_data['OpenAI_reviews'] = None
sampled_data['OpenAI_categories'] = None

openai_review_embeddings = []
for sentence in sampled_data['review_body']:
    response = client.embeddings.create(
        input=sentence,
        model="text-embedding-3-large"
    )
    openai_review_embeddings.append(response.data[0].embedding)

sampled_data['OpenAI_reviews'] = openai_review_embeddings


In [51]:
openai_category_embeddings = []
for theme in sample_categories['product_category']:
    response = client.embeddings.create(
        input=theme,
        model="text-embedding-3-large"
    )
    openai_category_embeddings.append(response.data[0].embedding)

sample_categories['OpenAI_categories'] = openai_category_embeddings

In [52]:
print(sampled_data.columns)

Index(['review_body', 'language', 'product_category', 'e5_base_reviews',
       'bge-m3_reviews', 'e5-large_reviews', 'OpenAI_reviews'],
      dtype='object')


In [53]:
print(sample_categories.columns)

Index(['product_category', 'e5_base_categories', 'bge-m3_categories',
       'e5-large_categories', 'OpenAI_categories'],
      dtype='object')


In [57]:
sample_categories.head(40)

Unnamed: 0,product_category,e5_base_categories,bge-m3_categories,e5-large_categories,OpenAI_categories
0,sports,"[0.0404379777610302, 0.016666794195771217, -0....","[0.007773629389703274, 0.017449799925088882, -...","[0.04635049030184746, 0.015139766968786716, -0...","[0.05249757319688797, 0.03604929521679878, -0...."
1,home_improvement,"[0.00920741818845272, 0.0361715704202652, -0.0...","[0.0018572566332295537, 0.029306387528777122, ...","[0.011592144146561623, 0.04191767796874046, -0...","[-0.01942252367734909, 0.02513502910733223, -0..."
2,drugstore,"[0.03261622413992882, 0.04194597899913788, -0....","[-0.0013396964641287923, -0.0179598331451416, ...","[0.02461204305291176, -0.0031880501192063093, ...","[-0.02030802145600319, 0.03622787818312645, 0...."
3,toy,"[-0.0036610043607652187, 0.009792091324925423,...","[-0.011956618167459965, 0.01631239801645279, -...","[0.05033637583255768, 0.011232344433665276, -0...","[-0.05364340543746948, -0.029369451105594635, ..."
4,baby_product,"[0.03296276926994324, 0.028297562152147293, -0...","[-0.009079738520085812, 0.012797278352081776, ...","[0.007595886941999197, 0.03445391356945038, -0...","[-0.05625683069229126, -0.0005081882118247449,..."
5,lawn_and_garden,"[0.025170622393488884, 0.02868179976940155, -0...","[-0.028989581391215324, 0.07103437185287476, -...","[0.012603607028722763, 0.029779676347970963, -...","[-0.018068082630634308, 0.0085217310115695, -0..."
6,electronics,"[-0.012476944364607334, 0.026424763724207878, ...","[-0.014170749112963676, 0.014405746944248676, ...","[0.03743162751197815, 0.008007933385670185, -0...","[-0.005578492768108845, 0.022827954962849617, ..."
7,book,"[0.026823027059435844, 0.017802968621253967, -...","[-0.013646664097905159, 0.037420690059661865, ...","[0.019163290038704872, 0.00443122535943985, -0...","[0.005490604322403669, 0.0007047722465358675, ..."
8,home,"[0.009325407445430756, 0.03267315775156021, -0...","[0.00912525411695242, 0.0261226873844862, -0.0...","[0.032022714614868164, 0.026004798710346222, -...","[-0.008504516445100307, 0.003743005683645606, ..."
9,wireless,"[0.01549568958580494, 0.04060305655002594, -0....","[-0.03862132877111435, 0.025423217564821243, -...","[0.012674892321228981, 0.026906708255410194, -...","[-0.005462470930069685, 0.024429624900221825, ..."


## Finding the accuracy

In [55]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm.auto import tqdm

predictions = []

for idx, row in tqdm(sampled_data.iterrows(), total=len(sampled_data), desc="Processing comments"):

    predicted_categories = {}
    for model_name in ['e5_base', 'bge-m3', 'e5-large', 'OpenAI']:
        review_embedding = row[f'{model_name}_reviews']

        categories_embeddings = sample_categories[f'{model_name}_categories'].tolist()

        similarities = cosine_similarity([review_embedding], categories_embeddings)[0]

        predicted_category_idx = np.argmax(similarities)

        predicted_category = sample_categories.iloc[predicted_category_idx]['product_category']

        predicted_categories[model_name] = predicted_category

    true_category = row['product_category']

    predictions.append([
        row['review_body'],
        predicted_categories['e5_base'],
        predicted_categories['bge-m3'],
        predicted_categories['e5-large'],
        predicted_categories['OpenAI'],
        true_category
    ])

predicted_df = pd.DataFrame(predictions, columns=[
    'comment',
    'predicted_category_e5_base',
    'predicted_category_bge-m3',
    'predicted_category_e5-large',
    'predicted_category_OpenAI',
    'true_category'
])


Processing comments: 100%|██████████| 6000/6000 [00:35<00:00, 169.96it/s]


In [58]:
predicted_df.head(6000)

Unnamed: 0,comment,predicted_category_e5_base,predicted_category_bge-m3,predicted_category_e5-large,predicted_category_OpenAI,true_category
0,Ist ok ...blondierung quillt schnell auf,beauty,beauty,personal_care_appliances,watch,beauty
1,Kein typischer Geruch oder Geschmack von einem...,baby_product,baby_product,pet_products,grocery,grocery
2,Dieses Buch hat mir sehr geholfen mit dem erst...,book,book,digital_ebook_purchase,book,book
3,"super Schale, wunderschön, gutes Produkt ABER ...",jewelry,jewelry,personal_care_appliances,baby_product,baby_product
4,"Artikel ist niemals angekommen, habe ihn aber ...",digital_ebook_purchase,luggage,digital_ebook_purchase,digital_ebook_purchase,book
...,...,...,...,...,...,...
5995,在亚马逊买是为了安全健康，收到的包裹包装又脏又差，就只有一层快递的黑色塑料袋，简直不能相信。...,digital_ebook_purchase,luggage,digital_ebook_purchase,baby_product,baby_product
5996,产品很好.快递不送到楼上产品很好.快递不送到楼上产品很好.快递不送到楼上,office_product,grocery,personal_care_appliances,baby_product,baby_product
5997,可以想象，原著应该是很生动、简洁、有针对性的。但是翻译的水平实在太差。有些话看不懂。通篇都是...,other,digital_ebook_purchase,watch,book,digital_ebook_purchase
5998,基本是举一反三的配方，也能凑成一本书卖，不容易。,digital_ebook_purchase,book,pet_products,book,book


In [59]:
from sklearn.metrics import accuracy_score

e5_base_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_e5_base'])
bge_m3_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_bge-m3'])
e5_large_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_e5-large'])
OpenAI_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_OpenAI'])

print(f"Accuracy of e5_base model: {e5_base_accuracy:.4f}")
print(f"Accuracy of bge-m3 model: {bge_m3_accuracy:.4f}")
print(f"Accuracy of e5-large model: {e5_large_accuracy:.4f}")
print(f"Accuracy of e5-large model: {OpenAI_accuracy:.4f}")


Accuracy of e5_base model: 0.1558
Accuracy of bge-m3 model: 0.1667
Accuracy of e5-large model: 0.1160
Accuracy of e5-large model: 0.2253


In [61]:
from sklearn.metrics import accuracy_score
import numpy as np

valid_rows = predicted_df[
    (predicted_df['predicted_category_e5_base'] == predicted_df['true_category']) |
    (predicted_df['predicted_category_bge-m3'] == predicted_df['true_category']) |
    (predicted_df['predicted_category_e5-large'] == predicted_df['true_category']) |
    (predicted_df['predicted_category_OpenAI'] == predicted_df['true_category'])

]

e5_base_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_e5_base'])
bge_m3_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_bge-m3'])
e5_large_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_e5-large'])
OpenAI_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_OpenAI'])

print(f"Accuracy of e5_base model (valid rows): {e5_base_accuracy:.4f}")
print(f"Accuracy of bge-m3 model (valid rows): {bge_m3_accuracy:.4f}")
print(f"Accuracy of e5-large model (valid rows): {e5_large_accuracy:.4f}")
print(f"Accuracy of OpenAI model (valid rows): {OpenAI_accuracy:.4f}")


Accuracy of e5_base model (valid rows): 0.4421
Accuracy of bge-m3 model (valid rows): 0.4728
Accuracy of e5-large model (valid rows): 0.3291
Accuracy of OpenAI model (valid rows): 0.6392
