In [None]:
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]

theme = "weather"

In [None]:
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """
    Calculate the cosine similarity between two vectors.

    :param vec_a: First vector.
    :param vec_b: Second vector.
    :return: Cosine similarity.
    """
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)

In [None]:
from sentence_transformers import SentenceTransformer

model_e5_b = SentenceTransformer("intfloat/multilingual-e5-base")

embeddings_e5_b = model_e5_b.encode(sentences)
embeddings_theme_e5_b = model_e5_b.encode(theme)
similarities_e5_b = model_e5_b.similarity(embeddings_e5_b, embeddings_theme_e5_b)


# [3, 3]

In [None]:
from sentence_transformers import SentenceTransformer

model_m3 = SentenceTransformer("BAAI/bge-m3")

embeddings_m3 = model_m3.encode(sentences)
embeddings_theme_m3 = model_m3.encode(theme)
similarities_m3 = model_m3.similarity(embeddings_m3, embeddings_theme_m3)

In [None]:
from sentence_transformers import SentenceTransformer

model_e5_l = SentenceTransformer("intfloat/e5-large-v2")

embeddings_e5_l = model_e5_l.encode(sentences)
embeddings_theme_e5_l = model_e5_l.encode(theme)
similarities_e5_l = model_e5_l.similarity(embeddings_e5_l, embeddings_theme_e5_l)

In [None]:
from openai import OpenAI

client = OpenAI(api_key= "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")

embedding_openai = []

for sentence in sentences:
    themed_sentence = sentence
    response = client.embeddings.create(
        input=themed_sentence,
        model="text-embedding-3-large"
    )
    embedding_openai.append(response.data[0].embedding)

response = client.embeddings.create(
        input=theme,
        model="text-embedding-3-large"
    )
embedding_theme_openai = response.data[0].embedding
similarities_openai = []

for embedding in embedding_openai:
    similarities_openai.append(cosine_similarity(embedding, embedding_theme_openai))


In [None]:
for i in range(3):
    for j in range(1):
        print(f"Similarity m3 between '{sentences[i]}' and '{theme}': {similarities_m3[i, j].item()}")
for i in range(3):
    for j in range(1):
        print(f"Similarity e5_l between '{sentences[i]}' and '{theme}': {similarities_e5_l[i, j].item()}")
for i in range(3):
    for j in range(1):
        print(f"Similarity e5_b between '{sentences[i]}' and '{theme}': {similarities_e5_b[i, j].item()}")
for i in range(3):
    for j in range(1):
        print(f"Similarity openai between '{sentences[i]}' and '{theme}': {similarities_openai[i].item()}")

In [76]:
import pandas as pd

input_path = '../../data/train.csv'
data = pd.read_csv(input_path)
df_reviews = data[["review_body", "language", "product_category"]]
df_categories = data[["product_category"]]

sample_categories = df_categories.drop_duplicates().reset_index(drop=True)


languages = ['de', 'en', 'es', 'fr', 'ja', 'zh']
sampled_data = []

for lang in languages:
    lang_data = df_reviews[df_reviews['language'] == lang].sample(n=1000, random_state=42)
    sampled_data.append(lang_data)

sampled_data = pd.concat(sampled_data)

In [74]:
lan_counts = sampled_data["language"].value_counts()
print(lan_counts)
cat_counts = sample_categories["product_category"].value_counts()
print(cat_counts)

language
de    1000
en    1000
es    1000
fr    1000
ja    1000
zh    1000
Name: count, dtype: int64
product_category
sports                      1
kitchen                     1
industrial_supplies         1
camera                      1
jewelry                     1
luggage                     1
watch                       1
other                       1
musical_instruments         1
digital_video_download      1
pet_products                1
video_games                 1
apparel                     1
office_product              1
personal_care_appliances    1
digital_ebook_purchase      1
home_improvement            1
beauty                      1
furniture                   1
automotive                  1
shoes                       1
pc                          1
wireless                    1
home                        1
book                        1
electronics                 1
lawn_and_garden             1
baby_product                1
toy                         1
drugstore   

In [80]:
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

models = {
    "e5_base": SentenceTransformer("intfloat/multilingual-e5-base"),
    "bge-m3": SentenceTransformer("BAAI/bge-m3"),
    "e5-large": SentenceTransformer("intfloat/e5-large-v2")
}

def apply_model_to_df(model, df, column_name):
    embeddings = model.encode(df[column_name].tolist())
    return embeddings.tolist()

for model_name, model in tqdm(models.items()):

    sampled_data[f'{model_name}_reviews'] = apply_model_to_df(model, sampled_data, 'review_body')
    sample_categories[f'{model_name}_categories'] = apply_model_to_df(model, sample_categories, 'product_category')


  0%|          | 0/3 [00:00<?, ?it/s]

In [81]:
print(sampled_data.columns)

Index(['review_body', 'language', 'product_category', 'e5_base_reviews',
       'bge-m3_reviews', 'e5-large_reviews'],
      dtype='object')


In [82]:
print(sample_categories.columns)

Index(['product_category', 'e5_base_categories', 'bge-m3_categories',
       'e5-large_categories'],
      dtype='object')


In [90]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm.auto import tqdm

predictions = []

for idx, row in tqdm(sampled_data.iterrows(), total=len(sampled_data), desc="Processing comments"):

    predicted_categories = {}
    for model_name in ['e5_base', 'bge-m3', 'e5-large']:
        review_embedding = row[f'{model_name}_reviews']

        categories_embeddings = sample_categories[f'{model_name}_categories'].tolist()

        similarities = cosine_similarity([review_embedding], categories_embeddings)[0]

        predicted_category_idx = np.argmax(similarities)

        predicted_category = sample_categories.iloc[predicted_category_idx]['product_category']

        predicted_categories[model_name] = predicted_category

    true_category = row['product_category']

    predictions.append([
        row['review_body'],
        predicted_categories['e5_base'],
        predicted_categories['bge-m3'],
        predicted_categories['e5-large'],
        true_category
    ])

predicted_df = pd.DataFrame(predictions, columns=[
    'comment',
    'predicted_category_e5_base',
    'predicted_category_bge-m3',
    'predicted_category_e5-large',
    'true_category'
])


Processing comments:   0%|          | 0/6000 [00:00<?, ?it/s]

In [93]:
predicted_df.head(6000)

Unnamed: 0,comment,predicted_category_e5_base,predicted_category_bge-m3,predicted_category_e5-large,true_category
0,Ist ok ...blondierung quillt schnell auf,beauty,beauty,beauty,beauty
1,Kein typischer Geruch oder Geschmack von einem...,baby_product,baby_product,baby_product,grocery
2,Dieses Buch hat mir sehr geholfen mit dem erst...,book,book,digital_ebook_purchase,book
3,"super Schale, wunderschön, gutes Produkt ABER ...",jewelry,jewelry,baby_product,baby_product
4,"Artikel ist niemals angekommen, habe ihn aber ...",digital_ebook_purchase,luggage,digital_ebook_purchase,book
...,...,...,...,...,...
5995,在亚马逊买是为了安全健康，收到的包裹包装又脏又差，就只有一层快递的黑色塑料袋，简直不能相信。...,digital_ebook_purchase,luggage,digital_ebook_purchase,baby_product
5996,产品很好.快递不送到楼上产品很好.快递不送到楼上产品很好.快递不送到楼上,office_product,grocery,electronics,baby_product
5997,可以想象，原著应该是很生动、简洁、有针对性的。但是翻译的水平实在太差。有些话看不懂。通篇都是...,other,digital_ebook_purchase,digital_ebook_purchase,digital_ebook_purchase
5998,基本是举一反三的配方，也能凑成一本书卖，不容易。,digital_ebook_purchase,book,electronics,book


In [95]:
from sklearn.metrics import accuracy_score

e5_base_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_e5_base'])
bge_m3_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_bge-m3'])
e5_large_accuracy = accuracy_score(predicted_df['true_category'], predicted_df['predicted_category_e5-large'])

print(f"Accuracy of e5_base model: {e5_base_accuracy:.4f}")
print(f"Accuracy of bge-m3 model: {bge_m3_accuracy:.4f}")
print(f"Accuracy of e5-large model: {e5_large_accuracy:.4f}")

Accuracy of e5_base model: 0.1558
Accuracy of bge-m3 model: 0.1667
Accuracy of e5-large model: 0.0853


In [96]:
from sklearn.metrics import accuracy_score
import numpy as np

valid_rows = predicted_df[
    (predicted_df['predicted_category_e5_base'] == predicted_df['true_category']) |
    (predicted_df['predicted_category_bge-m3'] == predicted_df['true_category']) |
    (predicted_df['predicted_category_e5-large'] == predicted_df['true_category'])
]

e5_base_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_e5_base'])
bge_m3_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_bge-m3'])
e5_large_accuracy = accuracy_score(valid_rows['true_category'], valid_rows['predicted_category_e5-large'])

print(f"Accuracy of e5_base model (valid rows): {e5_base_accuracy:.4f}")
print(f"Accuracy of bge-m3 model (valid rows): {bge_m3_accuracy:.4f}")
print(f"Accuracy of e5-large model (valid rows): {e5_large_accuracy:.4f}")

Accuracy of e5_base model (valid rows): 0.5392
Accuracy of bge-m3 model (valid rows): 0.5767
Accuracy of e5-large model (valid rows): 0.2953
