## Capstone V2

The second phase of the NLP Capstone focuses on Hybrid Retrieval Search. This phase focuses on comparing the performances in search retrieval models between sparse retrieval techniques using term weighting schemes such as BM25 and TF-IDF against hybrid search retrieval methods which incorporates dense retrieval from embeddings generated using BAAI BGE-M3 sentence transformer combined with the sparse retrieval methods. The models explored are BM25 Alone, TF-IDF Alone, BM25 + BGE, and TFIDF + BGE.

In [None]:
#All Imports
import numpy as np
import pandas as pd
from pinecone import Pinecone, ServerlessSpec

from dotenv import load_dotenv
import os
load_dotenv(dotenv_path='../.env')
CHARRAN_API = os.getenv('CHARRAN_API')
CHERYL_API = os.getenv('CHERYL_API')

from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import jieba

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("BAAI/bge-m3")

## 1. Hybrid Retrieval Phase for Italian

In [None]:
#Read the Italy Embeddings
italian_embeddings = pd.read_pickle("en_to_it_embeddings.pkl")
italian_embeddings

Unnamed: 0,title,title_italian,english_embedding,italian_embedding
0,zwilling pro 2pc prep knife set,Zwilling pro 2pc set coltello prep,"[-0.07585622, -0.006632321, -0.039237764, 0.04...","[-0.058768444, 0.012960452, -0.029929288, 0.05..."
1,womens slim fit drape wrap tshirt a new day,donne slim fit drappeggio avvolgere tshirt un ...,"[-0.023722176, -0.02756558, -0.07540757, 0.011...","[-0.056372743, -0.038858823, -0.07786548, 0.00..."
2,mens teenage mutant ninja turtles group shot l...,mens adolescente mutante ninja tartarughe grup...,"[-0.02781372, 0.004972987, -0.055929173, 0.013...","[-0.004044311, 0.008419336, -0.05591273, 0.015..."
3,mens wwe triple h the game logo tshirt,mens wwe triplo h il gioco logo tshirt,"[-0.037347108, -0.009183998, -0.082188, 0.0122...","[-0.03912319, -0.015832098, -0.07382396, 0.008..."
4,purina fancy feast grilled gravy delights feas...,purina fantasia festa grigliato sugo delizie f...,"[-0.0551254, 0.024768988, -0.02036258, -0.0108...","[-0.0431343, 0.017949222, -0.023515861, -0.024..."
...,...,...,...,...
868,multi collagen protein powder types i ii ii b...,proteine multi collageno in polvere ii ii ii o...,"[-0.007071648, 0.013024846, -0.026673753, -0.0...","[0.036605842, 0.03628495, -0.027457794, -0.008..."
869,hope henry mens waffle knit pullover sweater,speranza henry uomo waffle maglia pullover mag...,"[-0.026264952, -0.008562797, -0.05641582, -0.0...","[-0.0020557789, -0.0039152885, -0.04709097, -0..."
870,noritake colortrio 16piece coupe dinnerware set,noritake colortio 16 pezzi coupé set per la cena,"[-0.0055267178, -0.03823672, -0.024558328, 0.0...","[0.0011833841, -0.014286156, -0.01681679, 0.03..."
871,hope henry mens fine gauge vneck pullover swe...,speranza henry mens maglione pullover fine gau...,"[-0.03244666, -0.026627203, -0.07740165, 0.004...","[-0.017570777, -0.00990813, -0.071718924, 0.01..."


In [None]:
#Extracting English and Italian product titles from the dataset
entoit_english_titles = italian_embeddings['title']
entoit_italian_titles = italian_embeddings['title_italian']

#Tokenize the english and italian titles by splitting on whitespaces
entoit_tokenized_en = [title.split() for title in entoit_english_titles]
entoit_tokenized_it = [title.split() for title in entoit_italian_titles]

#Create a BM25 index for each language titles
bm25_en = BM25Okapi(entoit_tokenized_en)
bm25_it = BM25Okapi(entoit_tokenized_it)


In [None]:
#Even though the BM25 could be modularised by declaring the BM25 indexes in a dictionary, 
#it has not be done so that each language can be shown visually

"""" 
This function performs a BM25-based search for a given query and language. 
   Parameters:
        query (str): The input search string.
        lang (str): The language of the search corpus ('en' or 'it').
        top_k (int): The number of top matching results to return. Default is 5.

    Returns:
        top_k_ids (List[int]): Indices of the top_k most relevant documents.
        top_k_scores (List[float]): Corresponding BM25 relevance scores.


"""


def search(query, lang='en', top_k=5):
    tokens = query.lower().split() #Tokenize and lowercase the query

    #Compute BM25 relevance scores using the appropriate model
    if lang == 'en':
        scores = bm25_en.get_scores(tokens)
    else:
        scores = bm25_it.get_scores(tokens)
    
    #Rank results by score and get indices of top_k matches
    top_k_ids = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

    # Return both the document indices and their scores
    return top_k_ids, [scores[i] for i in top_k_ids]

In [None]:
#This function is similar to the previous code, but it has the autodetection of english or italian queries using the langdetect library
def search_bm25(query, top_k=5):
    lang = detect(query)  # auto-detect 'en', 'it', etc.
    tokens = query.lower().split()  # simple tokenization

    if lang == 'it':
        scores = bm25_it.get_scores(tokens)
    else:
        scores = bm25_en.get_scores(tokens)

    # Get top-k ranked indices
    top_k_ids = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return top_k_ids, [scores[i] for i in top_k_ids]

In [7]:
#Performs a BM25 search for the italian query 
results, scores = search_bm25("cuffie senza fili")

# Loop through the top results and print each score with its corresponding English product title
for i, score in zip(results, scores):
    print(f"{score:.4f} | {italian_embeddings['title'][i]}")


7.1469 | lands end womens anyweather fleece adjustable earmuffs
4.9215 | girls39 fleece footless tights  cat 38 jack8482
4.6791 | womens short sleeve vneck seamless tshirt  wild fable
4.6791 | womens seamless short sleeve shirt  all in motion
4.4594 | lands end womens no iron supima cotton long sleeve shirt


In [None]:
#Loading PineCone vectordatabase library
pc = Pinecone(api_key=CHARRAN_API)

#Initialising Pinecone index
#This is the Index for the English to Italian vector database
index = pc.Index('product-title-embeddings')

In [10]:
# Helper function to batch upsert
def batch_upsert(index, vectors, batch_size=50):
    """
    Helper function to upsert (insert or update) vectors into a vector index in batches.

    Parameters:
        index: The vector index (e.g., Pinecone index or FAISS wrapper) supporting the `upsert` method.
        vectors (list): A list of vector records, typically in the format expected by the index (e.g., [{"id": ..., "values": [...]}]).
        batch_size (int): Number of vectors to upsert in each batch. Default is 50.

    Returns:
        None
    """
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i+batch_size] #Slicing the current  batch
        index.upsert(vectors=batch) #Upsert the batch into index


In [11]:
# Generate unique IDs for English embeddings in the format: en-0, en-1, ..., en-N
en_ids = [f"en-{i}" for i in range(len(italian_embeddings['english_embedding']))]

# Generate unique IDs for Italian embeddings in the format: it-0, it-1, ..., it-N
it_ids = [f"it-{i}" for i in range(len(italian_embeddings['italian_embedding']))]


In [12]:
# Combine English + Italian vectors with unique IDs
# Each tuple is in the form (id, embedding)
combined_vectors = (
    list(zip(en_ids, italian_embeddings['english_embedding'])) +
    list(zip(it_ids, italian_embeddings['italian_embedding']))
)

#Convert each tuple to the expected format for upsert
# Required format: {"id": str, "values": List[float]}
to_upsert = [{"id": id, "values": vector} for id, vector in combined_vectors]

# Step 3: Upsert to index in batches
batch_upsert(index, to_upsert, batch_size=50)

In [13]:
# Combine English + Italian vectors with unique IDs
# Each tuple is in the form (id, embedding)
to_upsert = list(zip(en_ids, italian_embeddings['english_embedding'])) + \
            list(zip(it_ids, italian_embeddings['italian_embedding']))

# Run batch upload
batch_upsert(index, to_upsert, batch_size=50)


#With this code, the vectorDB has been established for en to it.   

At this point, the vectorDB has been established for english and italian vectors.

Since BM25 is working, the next step is to proceed building a hybrid retrieval of BM25 + BGE-M3 search engine
for the product titles

In [None]:
def scores_to_ranking(scores: list[float]) -> list[int]:
    """
    Converts a list of float scores into integer rankings.

    Higher scores get better (lower) ranks — e.g., the highest score gets rank 1.

    Args:
        scores (list[float]): List of similarity or relevance scores.

    Returns:
        list[int]: List of ranks where 1 is the highest rank.
    """
    # argsort returns indices that would sort the array — we reverse to get descending order
    return list(np.argsort(scores)[::-1] + 1)  # Ranks start at 1


In [15]:
def rrf(keyword_rank: int, semantic_rank: int, k: int = 60) -> float:
    """
    Computes a Reciprocal Rank Fusion (RRF) score from keyword and semantic ranks.

    RRF helps combine multiple ranking signals (e.g., BM25 + embeddings) into a single score
    that rewards high rank in either list.

    Args:
        keyword_rank (int): Rank from keyword-based retrieval (e.g., BM25).
        semantic_rank (int): Rank from semantic search (e.g., BGE/LaBSE embeddings).
        k (int, optional): Smoothing constant to dampen large ranks. Default is 60.

    Returns:
        float: The combined RRF score (higher is better).
    """
    return 1 / (k + keyword_rank) + 1 / (k + semantic_rank)


1.1 BM25 + BGE Italian

In [16]:
def hybrid_search_rrf(query, top_k=5):

    #To detect the language from the query
    from langdetect import detect

    lang = detect(query)
    #tokenising the query to be suitable for BM25
    tokens = query.lower().split()

    # --- BM25 Retrieval ---(Routes the tokenized query to the appropiate BM25 engine)
    if lang == 'it':
        bm25_scores = bm25_it.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens) #returns a list of scores, one for each document in the dataset

    bm25_ranks = scores_to_ranking(bm25_scores)  #Converts the float BM25 scores to rankings for Reciprocal rank fusion(rrf)

    # --- Semantic Retrieval (Pinecone) ---
    query_vec = model.encode(query).tolist() #generate the query's embedding and convert it a list
    pinecone_results = index.query(vector=query_vec, top_k=top_k, include_metadata=False) #Submits the query vector to Pinecone to retrieve top-k similar vectors, based on cosine similarity

    pinecone_ids = [int(match['id'].split('-')[1]) for match in pinecone_results['matches']] #extracts og row index from pinecone's ID
    pinecone_scores = [match['score'] for match in pinecone_results['matches']] #Obtain cosine similarity scores from pinecone
    semantic_ranks = scores_to_ranking(pinecone_scores) #Converting pinecone scores to ranks (lower rank = better match)

    # --- Combine using RRF ---
    combined_scores = {} #placeholder to store RRF combined scores for each shortlisted document
    for idx in pinecone_ids:
        #For each doc idx returned by pinecone
        # Retrieve the BM25 rank and semantic rank, and using rrf function defined on prev cell to combine them into one score
        rrf_score = rrf(
            keyword_rank=bm25_ranks[idx],
            semantic_rank=semantic_ranks[pinecone_ids.index(idx)]
        )
        combined_scores[idx] = rrf_score

    # Sort by RRF score
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)  #Sorting the docs by their RRF score, with the highest first.

    return ranked[:top_k]  # list of (doc_id, final_score)


In [None]:
#Testing phase for hybrid (Italian to English)

hybrid_results = hybrid_search_rrf("giacca da donna")

for idx, score in hybrid_results:
    print(f"{score:.4f} | {italian_embeddings['title'][idx]}")

NameError: name 'scores_to_ranking' is not defined

In [None]:
#Testing phase for hybrid (English to Italian)

hybrid_results = hybrid_search_rrf("women jacket")

for idx, score in hybrid_results:
    print(f"{score:.4f} | {italian_embeddings['title_italian'][idx]}")

0.0203 | ragazze39 solido giacca trapuntato gatto 38 jack8482
0.0200 | ragazze puffer giacca tutto in movimento
0.0184 | jockey generazione donne cotone biologico stretch cropped tshirt
0.0178 | wink pro donne snap giacca di riscaldamento anteriore
0.0173 | ragazze solido puffer giacca classe d'arte


1.2 BM25 Alone Italian

In [None]:
def BM25(query, top_k=5):
    """
    Perform BM25 keyword search based on query language.

    Args:
        query (str): The user's search query.
        top_k (int): Number of top documents to return.

    Returns:
        list of (doc_id, bm25_score)
    """
    #To detect the language from the query
    from langdetect import detect
    lang = detect(query)
    tokens = query.lower().split()

    # Score retrieval
    if lang == 'it':
        bm25_scores = bm25_it.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens)

    # Get top-k doc IDs based on raw BM25 scores
    top_k_ids = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k]
    
    return [(i, bm25_scores[i]) for i in top_k_ids] #Returns a list of (document ID, score) pairs.

In [29]:
#Testing phase for BM25 alone

results = BM25("giacca da donna")

for idx, score in results:
    print(f"{score:.4f} | {italian_embeddings['title'][idx]}")

7.1452 | womens highrise straight jeans  universal thread
6.4611 | womens linen short sleeve buttondown camp shirt  a new day
6.4048 | hanes comfort fit scrubs womens scrub pants
6.4048 | timberland womens dunstan short sleeve tshirt
6.0893 | womens fitted short sleeve tshirt  universal thread


In [None]:
#Testing phase for BM25 alone
results = BM25("women jacket")

for idx, score in results:
    print(f"{score:.4f} | {italian_embeddings['title_italian'][idx]}")

5.4298 | ragazze puffer giacca tutto in movimento
5.4298 | ragazze solido puffer giacca classe d'arte
5.1272 | Bambini39 giacca gonfiabile solida tutto in movimento8482
5.1272 | ragazze39 solido giacca trapuntato gatto 38 jack8482
5.1272 | wink pro donne snap giacca di riscaldamento anteriore


1.3 TF-IDF + BGE Italian

In [None]:
#For both Sparse and Sparse + Dense
tfidf_vectorizer_en = TfidfVectorizer()
tfidf_matrix_en = tfidf_vectorizer_en.fit_transform(italian_embeddings['title'])

tfidf_vectorizer_it = TfidfVectorizer()
tfidf_matrix_it = tfidf_vectorizer_it.fit_transform(italian_embeddings['title_italian'])

In [21]:
def hybrid_search_tfidf(query, top_k=5):

    # To detect the language from the query
    from langdetect import detect
    lang = detect(query)
    
    # Ensure correct tf-idf matrix based on language
    if lang == 'it':
        query_vec = tfidf_vectorizer_it.transform([query])  # Spanish vectorizer
        similarities = cosine_similarity(query_vec, tfidf_matrix_it)[0]  # Cosine similarity for Spanish
    else:
        query_vec = tfidf_vectorizer_en.transform([query])
        similarities = cosine_similarity(query_vec, tfidf_matrix_en)[0]  # Cosine similarity for English

    # Rank documents by similarity
    ranked_indices = np.argsort(similarities)[::-1]
    
    # Return top_k document indices with their similarity scores
    tfidf_rank_map = {int(idx): rank for rank, idx in enumerate(ranked_indices[:top_k], start=1)}


    # --- Semantic Retrieval (Pinecone) ---
    query_vec = model.encode(query).tolist()  # Generate the query's embedding and convert it to a list

    # Submits the query vector to Pinecone to retrieve top-k similar vectors
    pinecone_results = index.query(vector=query_vec, top_k=top_k * 2, include_metadata=False)

    # Extracts original row index from Pinecone's vector ID (e.g., "en-123" → 123)
    pinecone_ids = [int(match['id'].split('-')[1]) for match in pinecone_results['matches']]

    # Obtain cosine similarity scores from Pinecone
    pinecone_scores = [match['score'] for match in pinecone_results['matches']]

    # Converting Pinecone scores to ranks (lower rank = better match)
    semantic_ranks = scores_to_ranking(pinecone_scores)

    # Create a mapping from document index to semantic rank for fast lookup (converted to dictionary)
    semantic_rank_map = {idx: rank for idx, rank in zip(pinecone_ids, semantic_ranks)}

    # --- Combine using RRF ---
    combined_scores = {}

    # For each doc idx returned by Pinecone
    # Retrieve the BM25 rank and semantic rank, and use rrf() to combine them into a hybrid score
    for idx in pinecone_ids:
        rrf_score = rrf(
            keyword_rank=tfidf_rank_map.get(idx, top_k + 1),  # Fallback rank if not found in BM25
            semantic_rank=semantic_rank_map.get(idx, top_k + 1)  # Fallback rank if not found in semantic
        )
        combined_scores[idx] = rrf_score  # Store the fused score

    # Sort the docs by their RRF score, with the highest first
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    return ranked[:top_k]  # Return top-k results as a list of (doc_id, final_score)


In [None]:
#Testing phase for TFIDF 
results = hybrid_search_tfidf("giacca da donna")

for idx, score in results:
    print(f"{score:.4f} | {italian_embeddings['title_italian'][idx]}")

0.0313 | toddler girls printed short sleeve tshirt  cat  jack
0.0310 | girls disney princess friends make everything better short sleeve graphic tshirt  black
0.0308 | wink pro womens snap front warmup jacket
0.0305 | toddler boys39 dino graphic short sleeve graphic tshirt  cat 38 jack8482
0.0303 | womens teddy bear pumpkin short sleeve graphic boyfriend tshirt  beige


1.4 TF-IDF Alone Italian

In [None]:
def TFIDF(query, top_k=5):
    lang = detect(query)
    
    if lang == 'it':
        query_vec = tfidf_vectorizer_it.transform([query])  # query vector
        similarities = cosine_similarity(query_vec, tfidf_matrix_it)[0]  # cosine similarity with each doc
    else:
        query_vec = tfidf_vectorizer_en.transform([query])
        similarities = cosine_similarity(query_vec, tfidf_matrix_en)[0]

    # Rank documents by similarity
    ranked_indices = np.argsort(similarities)[::-1]
    
    # Return top_k document indices with their similarity scores
    top_results = [(int(idx), float(similarities[idx])) for idx in ranked_indices[:top_k]]
    
    return top_results

In [None]:
#Testing phase for TFIDF alone
results = TFIDF("giacca da donna")

for idx, score in results:
    print(f"{score:.4f} | {italian_embeddings['title'][idx]}")

0.3772 | timberland womens dunstan short sleeve tshirt
0.3743 | womens linen short sleeve buttondown camp shirt  a new day
0.3412 | womens fitted short sleeve tshirt  universal thread
0.3339 | mens rocky mobilite waterproof work boot
0.3271 | girls puffer jacket  all in motion


1.5 Comparison of scores obtained using BM25 vs. Hybrid Retrieval Using NDCG

In [24]:
english_queries = [
    "men's white dress shirt",
    "women's floral summer dress",
    "black women's ankle boots",
    "men's crewneck t-shirt",
    "women's cropped denim jacket"
]

italian_queries = [
    "camicia bianca da uomo",                  # men's white dress shirt
    "vestito estivo floreale da donna",        # women's floral summer dress
    "stivaletti neri da donna",                # black women's ankle boots
    "maglietta girocollo uomo",                # men's crewneck t-shirt
    "giacca di jeans corta da donna"           # women's cropped denim jacket
]


In [25]:
import json

with open("ground_truth_fashion.json", "r") as f:
    ground_truth = json.load(f)

In [None]:
import numpy as np  # Import NumPy for efficient numerical operations

# --- Function to calculate Discounted Cumulative Gain (DCG) ---
# DCG measures how good a ranked list is by rewarding relevant documents appearing earlier in the list
def dcg(relevances):
    # For each position i, divide the relevance score by log2(i+2) to apply discounting
    # i + 2 ensures that the first position is divided by log2(2) = 1
    return sum(rel / np.log2(i + 2) for i, rel in enumerate(relevances))

# --- Function to calculate Normalized Discounted Cumulative Gain (NDCG) ---
# NDCG compares the DCG of a predicted ranking to the ideal ranking
def ndcg(ranked_ids, relevance_dict, k=5):
    # Retrieve the relevance score for each of the top-k ranked document IDs
    # If a doc ID is not in the relevance_dict, assume relevance = 0
    relevances = [relevance_dict.get(str(doc_id), 0) for doc_id in ranked_ids[:k]]
    
    # Sort all known relevance scores in descending order to get the ideal ranking
    ideal_relevances = sorted(relevance_dict.values(), reverse=True)[:k]
    
    # Compute NDCG as the ratio of actual DCG to ideal DCG
    # If there are no ideal relevances (e.g., empty dict), return 0.0 to avoid division by zero
    return dcg(relevances) / dcg(ideal_relevances) if ideal_relevances else 0.0


In [None]:
# List of test queries from the ground truth dictionary
test_queries = list(ground_truth.keys())

# Evaluation depth: compute NDCG@10
k = 10

print(" NDCG Comparison (BM25 vs Hybrid RRF)\n")

# Loop through each test query
for query in test_queries:
    # gt: the ground truth relevance scores for documents related to this query
    gt = ground_truth[query]
    
    # Get top-k ranked document IDs from BM25 (ignore scores)
    bm25_ids = [doc_id for doc_id, _ in BM25(query, top_k=k)]

    # Get top-k ranked document IDs from the hybrid RRF system
    hybrid_ids = [doc_id for doc_id, _ in hybrid_search_rrf(query, top_k=k)]

    # Compute NDCG score for BM25 rankings using the ground truth
    score_bm25 = ndcg(bm25_ids, gt, k)

    # Compute NDCG score for Hybrid (BM25 + semantic) rankings
    score_hybrid = ndcg(hybrid_ids, gt, k)

    # Print the results for this query
    print(f"Query: {query}")
    print(f"  NDCG@{k} - BM25   : {score_bm25:.4f}")
    print(f"  NDCG@{k} - Hybrid : {score_hybrid:.4f}")
    print("-" * 40)


🔍 NDCG Comparison (BM25 vs Hybrid RRF)

Query: men's white dress shirt
  NDCG@10 - BM25   : 0.0000
  NDCG@10 - Hybrid : 0.1210
----------------------------------------
Query: women's floral summer dress
  NDCG@10 - BM25   : 0.0692
  NDCG@10 - Hybrid : 0.3230
----------------------------------------
Query: black women's ankle boots
  NDCG@10 - BM25   : 0.0316
  NDCG@10 - Hybrid : 0.0000
----------------------------------------
Query: men's crewneck t-shirt
  NDCG@10 - BM25   : 0.0000
  NDCG@10 - Hybrid : 0.0000
----------------------------------------
Query: women's cropped denim jacket
  NDCG@10 - BM25   : 0.0000
  NDCG@10 - Hybrid : 0.0000
----------------------------------------


## 2. Hybrid Retrieval Phase for Spanish

In [None]:
#Read the Spanish Embeddings
spanish_embeddings = pd.read_pickle("en_to_sp_embeddings.pkl")
spanish_embeddings

Unnamed: 0,title,title_spanish,english_embedding,spanish_embedding
0,brother genuine high yield toner cartridge tn4...,hermano genuino cartucho tóner de alto rendimi...,"[-0.03431117, 0.025899883, -0.00967014, -0.019...","[0.012239528, 0.02652684, 0.002397126, -0.0288..."
1,fitbit inspire 3 health and fitness tracker wi...,fitbit inspirar 3 seguimiento de salud y fitne...,"[-0.0016011602, -0.002595037, -0.07348455, 0.0...","[-0.011861571, -0.009732766, -0.06545575, -0.0..."
2,mikes hot honey americas 1 brand of hot honey ...,mikes miel caliente américas 1 marca de miel c...,"[-0.0004525112, -0.009976895, -0.015700651, 0....","[-0.031901788, 0.017521167, -0.04371976, 0.039..."
3,krema kréma red fruits 100 recyclable 240g,krema kréma frutos rojos 100 reciclables 240g,"[-0.011189645, 0.033041686, -0.005376764, -0.0...","[-0.013215443, 0.0015486346, -0.020853952, -0...."
4,drsalts calming therapy epsom salts soothing ...,drsalts calmante terapia epsom sales calmantes...,"[0.018024862, -0.015684763, -0.062142983, -0.0...","[0.008137982, 0.009916707, -0.07349886, -0.013..."
...,...,...,...,...
991,ruimen smart watches for men women answermake ...,ruimen relojes inteligentes para hombres mujer...,"[-0.022698322, 0.004262252, -0.06492456, -0.01...","[-0.015060791, 0.010321501, -0.057668064, -0.0..."
992,musicozy sleep headphones bluetooth 54 headban...,auriculares musicozy sueño bluetooth 54 diadem...,"[-0.0110038, 0.028441783, -0.065515295, 0.0328...","[0.006726083, 0.042338137, -0.0548927, 0.00642..."
993,sun ninja pop up beach tent sun shelter upf50 ...,sun ninja pop up playa refugio de sol upf50 co...,"[-0.018024122, -0.008911157, -0.09137453, 0.00...","[-0.0045234896, 0.003032705, -0.079418756, 0.0..."
994,rhino usa trailer hitch pin 2 inch patented 58...,enganche de remolque de rinoceronte usa pin de...,"[-0.011390688, -0.004701349, -0.009233302, 0.0...","[0.023057196, 0.013233271, 0.0004464224, 0.017..."


In [39]:
#Building BM25 for Spanish
entoes_english_titles = spanish_embeddings['title']
entoes_spanish_titles = spanish_embeddings['title_spanish']

entoes_tokenized_en = [title.split() for title in entoes_english_titles]
entoes_tokenized_es = [title.split() for title in entoes_spanish_titles]

from rank_bm25 import BM25Okapi

bm25_en = BM25Okapi(entoes_tokenized_en)
bm25_es = BM25Okapi(entoes_tokenized_es)

In [40]:
#This function is especially made for en and es, need to be redeclared for en and es
def search(query, lang='en', top_k=5):
    tokens = query.lower().split()
    if lang == 'en':
        scores = bm25_en.get_scores(tokens)
    else:
        scores = bm25_es.get_scores(tokens)
    
    top_k_ids = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return top_k_ids, [scores[i] for i in top_k_ids]

In [None]:
#This function is made for en and es
def search_bm25(query, top_k=5):
    lang = detect(query)  # auto-detect 'en', 'es', etc.
    tokens = query.lower().split()  # simple tokenization

    if lang == 'es':
        scores = bm25_es.get_scores(tokens)
    else:
        scores = bm25_en.get_scores(tokens)

    # Get top-k ranked indices
    top_k_ids = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return top_k_ids, [scores[i] for i in top_k_ids]

In [42]:
results, scores = search_bm25("hermano genuino cartucho")

for i, score in zip(results, scores):
    print(f"{score:.4f} | {spanish_embeddings['title'][i]}")

19.4874 | brother genuine high yield toner cartridge tn450 replacement black toner page yield up to 2600 pages
17.1048 | brother genuine tn436 super high yield toner black
13.1930 | compatible toner cartridge replacement for brother tn770 tn770 toner for brother printer hll2370dw l2370dwxl mfcl2750dw l2750dwxl 4500 page black 2 pack
6.3196 | lxtek compatible toner cartridge replacement for canon 137 black toner cartridge 137 crg137 to use with imageclass d570 mf232w mf242dw mf240 mf230 mf216n mf236n2 pack 137 black
5.8940 | tokyoink 232xl ink cartridges combo pack replacement for epson 232 xl t232 ink cartridge for expression home xp4200 xp4205 workforce wf2930 wf2950 printer ink cartridge cyan magenta yellow black


In [None]:
pc = Pinecone(api_key=CHERYL_API)
#Initialising Pinecone index
#This is the Index for entoes
index = pc.Index('entoes')

In [45]:
en_ids = [f"en-{i}" for i in range(len(spanish_embeddings['english_embedding']))]
es_ids = [f"es-{i}" for i in range(len(spanish_embeddings['spanish_embedding']))]

In [None]:
# Combine English + Spanish as before
to_upsert = list(zip(en_ids, spanish_embeddings['english_embedding'])) + \
            list(zip(es_ids, spanish_embeddings['spanish_embedding']))

# Run batch upload
batch_upsert(index, to_upsert, batch_size=50)

In [None]:
def scores_to_ranking(scores: list[float]) -> list[int]:
    """Convert float scores into int rankings (1 = best)."""
    return np.argsort(scores)[::-1] + 1  # ranks start at 1

def rrf(keyword_rank: int, semantic_rank: int, k: int = 60) -> float:
    """Combine keyword rank and semantic rank into a hybrid score using RRF."""
    return 1 / (k + keyword_rank) + 1 / (k + semantic_rank)


2.1 BM25 + BGE Spanish

In [47]:
def hybrid_search_rrf(query, top_k=5):

    #To detect the language from the query
    from langdetect import detect

    lang = detect(query)
    #tokenising the query to be suitable for BM25
    tokens = query.lower().split()

    # --- BM25 Retrieval ---(Routes the tokenized query to the appropiate BM25 engine)
    if lang == 'es':
        bm25_scores = bm25_es.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens) #returns a list of scores, one for each document in the dataset

    bm25_ranks = scores_to_ranking(bm25_scores)  #Converts the float BM25 scores to rankings for Reciprocal rank fusion(rrf)

    # --- Semantic Retrieval (Pinecone) ---
    query_vec = model.encode(query).tolist() #generate the query's embedding and convert it a list
    pinecone_results = index.query(vector=query_vec, top_k=top_k, include_metadata=False) #Submits the query vector to Pinecone to retrieve top-k similar vectors, based on cosine similarity

    pinecone_ids = [int(match['id'].split('-')[1]) for match in pinecone_results['matches']] #extracts og row index from pinecone's ID
    pinecone_scores = [match['score'] for match in pinecone_results['matches']] #Obtain cosine similarity scores from pinecone
    semantic_ranks = scores_to_ranking(pinecone_scores) #Converting pinecone scores to ranks (lower rank = better match)

    # --- Combine using RRF ---
    combined_scores = {} #placeholder to store RRF combined scores for each shortlisted document
    for idx in pinecone_ids:
        #For each doc idx returned by pinecone
        # Retrieve the BM25 rank and semantic rank, and using rrf function defined on prev cell to combine them into one score
        rrf_score = rrf(
            keyword_rank=bm25_ranks[idx],
            semantic_rank=semantic_ranks[pinecone_ids.index(idx)]
        )
        combined_scores[idx] = rrf_score

    # Sort by RRF score
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)  #Sorting the docs by their RRF score, with the highest first.

    return ranked[:top_k]  # list of (doc_id, final_score)


In [None]:
#Testing phase
results = hybrid_search_rrf("chaqueta de mujer")

for idx, score in results:
    print(f"{score:.4f} | {spanish_embeddings['title'][idx]}")

0.0194 | simplicity creative patterns sleeves for tops vest jackets coats a 10121416182022
0.0189 | fit  fresh lunch bag for women insulated womens lunch bag for work leakproof  stainresistant large lunch box for women with containers tumbler  ice pack zipper closure wichita bag palm leaves
0.0166 | artelaris lunch backpack for women stylish insulated backpack for women waterproof travel backpack lunch bag womens cooler backpack lunchbox backpack for teacher nurse work picnic book bag


2.2 BM25 Alone Spanish

In [None]:
def BM25(query, top_k=5):
    #To detect the language from the query
    lang = detect(query)
    tokens = query.lower().split()

    # Score retrieval
    if lang == 'es':
        bm25_scores = bm25_es.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens)

    bm25_ranks = scores_to_ranking(bm25_scores)  #Converts the float BM25 scores to rankings for Reciprocal rank fusion(rrf)

    # Sort by RRF score
    top_k_ids = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k]  #Sorting the docs by their RRF score, with the highest first.

    return [(i, bm25_scores[i]) for i in top_k_ids]  #Returns a list of (document ID, score) pairs.


In [None]:

def BM25(query, top_k=5):
    """
    Perform BM25 keyword search based on query language.

    Args:
        query (str): The user's search query.
        top_k (int): Number of top documents to return.

    Returns:
        list of (doc_id, bm25_score)
    """
    #To detect the language from the query
    from langdetect import detect
    lang = detect(query)
    tokens = query.lower().split()

    # Score retrieval
    if lang == 'es':
        bm25_scores = bm25_es.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens)

    # Get top-k doc IDs based on raw BM25 scores
    top_k_ids = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k]
    
    return [(i, bm25_scores[i]) for i in top_k_ids] #Returns a list of (document ID, score) pairs.

In [None]:
#Testing phase for BM25
results = BM25("chaqueta de mujer")

for idx, score in results:
    print(f"{score:.4f} | {spanish_embeddings['title'][idx]}")

6.6092 | christian art gifts wide mouth bpafree reusable plastic sports water bottle for men  women inspirational scripture wlocking fliptop lid  carry strap 28 oz
6.4951 | one a day womens active metabolism multivitamin supplement with vitamin a c d e and zinc for immune health support iron calcium folic acid  more 50 count
6.3409 | huefull gua sha facial tools  jade roller set for skin care reduce puffiness and improve wrinkles guasha tool for face gua sha stone self care gift for woman man christmas gifts
6.1578 | smartypants womens multivitamin gummies sugar free biotin methylfolate omega 3 ala vitamin d3 c vitamin b12 b6 vitamin a k  zinc gluten free 60 count 20 day supply
4.8513 | skg smart watch for men women android iphone with alexa builtin  bluetooth callanswermake call 169 fitness tracker with ip68 waterproof 60 sports heart rate spo2 monitor v7 pro


2.3 TF-IDF + BGE Spanish

In [None]:
#For both Sparse and Sparse + Dense
tfidf_vectorizer_en = TfidfVectorizer()
tfidf_matrix_en = tfidf_vectorizer_en.fit_transform(spanish_embeddings['title'])

tfidf_vectorizer_es = TfidfVectorizer()
tfidf_matrix_es = tfidf_vectorizer_es.fit_transform(spanish_embeddings['title_spanish'])

In [58]:
def hybrid_search_tfidf(query, top_k=5):

    from langdetect import detect
    lang = detect(query)
    
    if lang == 'es':
        query_vec = tfidf_vectorizer_es.transform([query])  # query vector
        similarities = cosine_similarity(query_vec, tfidf_matrix_es)[0]  # cosine similarity with each doc
    else:
        query_vec = tfidf_vectorizer_en.transform([query])
        similarities = cosine_similarity(query_vec, tfidf_matrix_en)[0]  # Cosine similarity for English

    ranked_indices = np.argsort(similarities)[::-1]
    
    # Return top_k document indices with their similarity scores
    tfidf_rank_map = {int(idx): rank for rank, idx in enumerate(ranked_indices[:top_k], start=1)}

    # --- Semantic Retrieval (Pinecone) ---
    query_vec = model.encode(query).tolist() 

    pinecone_results = index.query(vector=query_vec, top_k=top_k * 2, include_metadata=False)

    pinecone_ids = [int(match['id'].split('-')[1]) for match in pinecone_results['matches']]

    pinecone_scores = [match['score'] for match in pinecone_results['matches']]

    semantic_ranks = scores_to_ranking(pinecone_scores)

    semantic_rank_map = {idx: rank for idx, rank in zip(pinecone_ids, semantic_ranks)}

    # --- Combine using RRF ---
    combined_scores = {}

    for idx in pinecone_ids:
        rrf_score = rrf(
            keyword_rank=tfidf_rank_map.get(idx, top_k + 1),  # Fallback rank if not found in BM25
            semantic_rank=semantic_rank_map.get(idx, top_k + 1)  # Fallback rank if not found in semantic
        )
        combined_scores[idx] = rrf_score  # Store the fused score

    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    return ranked[:top_k]

In [60]:
#Testing phase for TFIDF

results = hybrid_search_tfidf("chaqueta de mujer")

for idx, score in results:
    print(f"{score:.4f} | {spanish_embeddings['title'][idx]}")

0.0310 | simplicity creative patterns sleeves for tops vest jackets coats a 10121416182022
0.0305 | fit  fresh lunch bag for women insulated womens lunch bag for work leakproof  stainresistant large lunch box for women with containers tumbler  ice pack zipper closure wichita bag palm leaves
0.0301 | artelaris lunch backpack for women stylish insulated backpack for women waterproof travel backpack lunch bag womens cooler backpack lunchbox backpack for teacher nurse work picnic book bag
0.0299 | vlando viaggio small jewelry case box travel essential accessories for women gifts for travelers couples mom friends bridesmaid
0.0296 | indressme cotton basket 17¾ x 15¾ x 13¾ inches woven hamper pink girl basket for gift toy blanket corner basket in living room


2.4 TFIDF Alone Spanish

In [None]:
def TFIDF(query, top_k=5):
    lang = detect(query)
    
    if lang == 'es':
        query_vec = tfidf_vectorizer_es.transform([query])  # query vector
        similarities = cosine_similarity(query_vec, tfidf_matrix_es)[0]  # cosine similarity with each doc
    else:
        query_vec = tfidf_vectorizer_en.transform([query])
        similarities = cosine_similarity(query_vec, tfidf_matrix_en)[0]

    # Rank documents by similarity
    ranked_indices = np.argsort(similarities)[::-1]
    
    # Return top_k document indices with their similarity scores
    top_results = [(int(idx), float(similarities[idx])) for idx in ranked_indices[:top_k]]
    
    return top_results

In [62]:
#Testing phase for TFIDF

results = TFIDF("chaqueta de mujer")

for idx, score in results:
    print(f"{score:.4f} | {spanish_embeddings['title_spanish'][idx]}")

0.2474 | suplemento multivitamínico del metabolismo activo de una mujer al día con vitamina a c d e y zinc para el apoyo de la salud inmune hierro calcio ácido fólico más 50 recuento
0.2248 | skg reloj inteligente para hombres mujer iphone android con alexa incorporado bluetooth callanswermake llamada 169 fitness tracker con ip68 impermeable 60 deportes frecuencia cardíaca spo2 monitor v7 pro
0.2248 | arte cristiano regalos boca ancha bpafree reutilizable plástico deportes botella de agua para los hombres mujer inspiración escritura wlocking fliptop tapa llevar correa 28 oz
0.2034 | smartypants mujer gomas multivitamínicas azúcar libre de biotina metilfolato omega 3 ala vitamina d3 c vitamina b12 b6 vitamina a k zinc sin gluten 60 cuenta 20 días suministro
0.1855 | huefull gua sha herramientas faciales jade juego de rodillos para el cuidado de la piel reducir la hinchazón y mejorar las arrugas guasha herramienta para cara gua sha piedra auto cuidado regalo para mujer hombre regalos de 

## 3. Hybrid Retrieval Phase for CN

In [None]:
#Read the Chinese Embeddings
chinese_embeddings = pd.read_pickle("en_to_cn_embeddings.pkl")
chinese_embeddings

Unnamed: 0,title,chinese translation,english_embedding,chinese_embedding
0,Oppo A75 A75S A73 Phone Case Soft Rabbit Silic...,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,"[-0.030606616, 0.010501585, -0.04400219, -0.00...","[-0.020788355, 0.032136466, -0.03952156, -0.04..."
1,SOFT 99 Coating Car Wax Strong Water Watt,SOFT 99 鍍膜車蠟(強力撥水型),"[-0.02521394, -0.0062141055, -0.02523462, -0.0...","[-0.013580757, -0.013445883, 0.013568486, -0.0..."
2,Low Sugar Mango Dry 250g Be The Royal,低糖芒果乾 250g 臻御行,"[-0.06998538, 0.025515176, -0.006934945, -0.02...","[-0.056555215, 0.015317621, 0.0015813652, -0.0..."
3,* the culture Japan Imported Round Top Space C...,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,"[-0.003708915, 0.024768945, -0.062792934, 0.02...","[-0.018781146, 0.033165023, -0.05913993, 0.019..."
4,Hello Kitty Sandals Shoes White/Red Children n...,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,"[-0.019042147, 0.031313036, -0.06666778, 0.049...","[-0.043943617, 0.021419879, -0.059569906, 0.03..."
...,...,...,...,...
995,Hippored Torn Fun Unique Style Straight Jeans ...,【HippoRed】撕破乐趣★独特风格★中直筒牛仔裤 O591_445,"[-0.015312562, 0.002696402, -0.046150953, 0.00...","[-0.04397009, -0.013235806, -0.034632586, 0.03..."
996,Kids Set Table Bay - Thin Long Sleeve Home Sui...,兒童套裝 台灣製薄長袖居家套裝 魔法Baby~k60092,"[-0.00460147, 0.029976973, -0.080628425, 0.003...","[0.00086109334, 0.012746421, -0.04744607, 0.00..."
997,LONGCHAMP Le Pliage Neo High Density Nylon Bac...,LONGCHAMP Le Pliage Neo高密尼龍後背包(中型),"[-0.025269749, -0.050276544, -0.059641942, -0....","[-0.0399163, -0.031578567, -0.04178574, 0.0254..."
998,IFairies Opening Adjustable Ring ifairies [564...,iFairies 開口可調節戒指★ifairies【56472】【56472】,"[0.018464142, 0.016518341, -0.034174442, 0.007...","[0.029656759, 0.03718795, -0.042785533, -0.027..."


In [69]:
#Building BM25 for Chinese
entocn_english_titles = chinese_embeddings['title']
entocn_chinese_titles = chinese_embeddings['chinese translation']

entocn_tokenized_en = [title.split() for title in entocn_english_titles]
#entocn_tokenized_cn = [title.split() for title in entocn_chinese_titles]

#Using jieba for Chinese tokenization
import jieba
#entocn_tokenized_en = [list(jieba.cut(title)) for title in entocn_english_titles]
entocn_tokenized_cn = [list(jieba.cut(title)) for title in entocn_chinese_titles]

from rank_bm25 import BM25Okapi

bm25_en = BM25Okapi(entocn_tokenized_en)
bm25_cn = BM25Okapi(entocn_tokenized_cn)

In [None]:
# Text cleaner
def clean_text(text):
    text = text.strip()
    text = re.sub(r"[^\u4e00-\u9fa5a-zA-Z0-9]", "", text)
    return text

# Clean + tokenize
entocn_chinese_titles = chinese_embeddings['chinese translation'].apply(clean_text)
entocn_tokenized_cn = [list(jieba.cut(title)) for title in entocn_chinese_titles]

# Build BM25 index
bm25_cn = BM25Okapi(entocn_tokenized_cn)


In [None]:
#This function is especially made for en and it, need to be redeclared for cn
def search(query, lang='en', top_k=5):
    tokens = query.lower().split()
    if lang == 'en':
        scores = bm25_en.get_scores(tokens)
    else:
        scores = bm25_cn.get_scores(tokens)
    
    top_k_ids = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return top_k_ids, [scores[i] for i in top_k_ids]

In [None]:
def search_bm25(query, top_k=5):
    lang = detect(query)

    # Tokenize appropriately based on detected language
    if lang in ['zh', 'zh-cn', 'cn']:
        tokens = list(jieba.cut(query))
        scores = bm25_cn.get_scores(tokens)
    else:
        tokens = query.lower().split()
        scores = bm25_en.get_scores(tokens)

    # Get top-k ranked document indices
    top_k_ids = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

    return top_k_ids, [scores[i] for i in top_k_ids]

In [80]:
results, scores = search_bm25("女 排汗")

for i, score in zip(results, scores):
    print(f"{score:.4f} | {chinese_embeddings['title'][i]}")

0.0000 | Oppo A75 A75S A73 Phone Case Soft Rabbit Silicone Case
0.0000 | SOFT 99 Coating Car Wax Strong Water Watt
0.0000 | Low Sugar Mango Dry 250g Be The Royal
0.0000 | * the culture Japan Imported Round Top Space Craft - Diamond SC - MK - 010
0.0000 | Hello Kitty Sandals Shoes White/Red Children no739


In [81]:
from pinecone import Pinecone, ServerlessSpec

#This is Cheryl's API
pc = Pinecone(api_key=CHERYL_API)

In [82]:
#Initialising Pinecone index
#This is the Index for the ENTOIT
index = pc.Index('entocn')

In [83]:
en_ids = [f"en-{i}" for i in range(len(chinese_embeddings['english_embedding']))]
cn_ids = [f"cn-{i}" for i in range(len(chinese_embeddings['chinese_embedding']))]

In [57]:
# Combine English + Chinese as before
to_upsert = list(zip(en_ids, chinese_embeddings['english_embedding'])) + \
            list(zip(cn_ids, chinese_embeddings['chinese_embedding']))

# Run batch upload
batch_upsert(index, to_upsert, batch_size=50)


#With this code, the vectorDB has been established for en to cn.  

In [84]:
import numpy as np

def scores_to_ranking(scores: list[float]) -> list[int]:
    """Convert float scores into int rankings (1 = best)."""
    return np.argsort(scores)[::-1] + 1  # ranks start at 1

def rrf(keyword_rank: int, semantic_rank: int, k: int = 60) -> float:
    """Combine keyword rank and semantic rank into a hybrid score using RRF."""
    return 1 / (k + keyword_rank) + 1 / (k + semantic_rank)

3.1 BM25 + BGE Chinese

In [80]:
def hybrid_search_rrf(query, top_k=5):

    #To detect the language from the query
    from langdetect import detect

    lang = detect(query)
    #tokenising the query to be suitable for BM25
    tokens = list(jieba.cut(query))

    # --- BM25 Retrieval ---(Routes the tokenized query to the appropiate BM25 engine)
    if lang == 'cn':
        bm25_scores = bm25_cn.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens) #returns a list of scores, one for each document in the dataset

    bm25_ranks = scores_to_ranking(bm25_scores)  #Converts the float BM25 scores to rankings for Reciprocal rank fusion(rrf)

    # --- Semantic Retrieval (Pinecone) ---
    query_vec = model.encode(query).tolist() #generate the query's embedding and convert it a list
    pinecone_results = index.query(vector=query_vec, top_k=top_k, include_metadata=False) #Submits the query vector to Pinecone to retrieve top-k similar vectors, based on cosine similarity

    pinecone_ids = [int(match['id'].split('-')[1]) for match in pinecone_results['matches']] #extracts og row index from pinecone's ID
    pinecone_scores = [match['score'] for match in pinecone_results['matches']] #Obtain cosine similarity scores from pinecone
    semantic_ranks = scores_to_ranking(pinecone_scores) #Converting pinecone scores to ranks (lower rank = better match)

    # --- Combine using RRF ---
    combined_scores = {} #placeholder to store RRF combined scores for each shortlisted document
    for idx in pinecone_ids:
        #For each doc idx returned by pinecone
        # Retrieve the BM25 rank and semantic rank, and using rrf function defined on prev cell to combine them into one score
        rrf_score = rrf(
            keyword_rank=bm25_ranks[idx],
            semantic_rank=semantic_ranks[pinecone_ids.index(idx)]
        )
        combined_scores[idx] = rrf_score

    # Sort by RRF score
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)  #Sorting the docs by their RRF score, with the highest first.

    return ranked[:top_k]  # list of (doc_id, final_score)


In [81]:
#Testing phase

results = hybrid_search_rrf("女式夹克")

for idx, score in results:
    print(f"{score:.4f} | {chinese_embeddings['title'][idx]}")

0.0246 | College Sexy Pleated Culottes
0.0237 | Long version Pocket Cardigan Knit Coat
0.0192 | Korean Made. Thick Straps Cross Vest
0.0170 | Two-piece Set Thick Thread Coat + Skirt M - XXL


3.2 BM25 Alone Chinese

In [82]:
def BM25(query, top_k=5):

    #To detect the language from the query
    from langdetect import detect

    lang = detect(query)
    #tokenising the query to be suitable for BM25
    tokens = list(jieba.cut(query))

    # --- BM25 Retrieval ---(Routes the tokenized query to the appropiate BM25 engine)
    if lang == 'cn':
        bm25_scores = bm25_cn.get_scores(tokens)
    else:
        bm25_scores = bm25_en.get_scores(tokens) #returns a list of scores, one for each document in the dataset

    bm25_ranks = {i: rank for rank, i in enumerate(np.argsort(bm25_scores)[::-1], start=1)} #Converts the float BM25 scores to rankings for Reciprocal rank fusion(rrf)

    # Sort by RRF score
    ranked = sorted(bm25_ranks.items(), key=lambda x: x[1], reverse=True)  #Sorting the docs by their RRF score, with the highest first.

    return ranked[:top_k]  # list of (doc_id, final_score)


In [83]:
#Testing phase for BM 25

results = BM25("女式夹克")

for idx, score in results:
    print(f"{score:.4f} | {chinese_embeddings['title'][idx]}")

1000.0000 | Oppo A75 A75S A73 Phone Case Soft Rabbit Silicone Case
999.0000 | Tree De Sc Multifunctional Locker - Scm3 - 3M6S
998.0000 | COGHLANS Canada 0044 Finger Compass Thermometer Whistle
997.0000 | [With Incense] Jujube Pack 5 Pc / Pack (2 Pack) 【
996.0000 | Japan gex schisandra Kittens Water Dispenser 900Ml GE2316


3.3 TFIDF + BGE Chinese

In [91]:
#For both Sparse and Sparse + Dense

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer_en = TfidfVectorizer()
tfidf_matrix_en = tfidf_vectorizer_en.fit_transform(chinese_embeddings['title'])

tfidf_vectorizer_cn = TfidfVectorizer()
tfidf_matrix_cn = tfidf_vectorizer_cn.fit_transform(chinese_embeddings['chinese translation'])

In [None]:
def hybrid_search_tfidf(query, top_k=5):
    lang = detect(query)

    # --- TF-IDF Query Vectorization ---
    if lang in ['zh', 'zh-cn', 'cn']:
        tokens = list(jieba.cut(query))
        query_joined = " ".join(tokens)  # convert token list to string input
        query_vec = tfidf_vectorizer_cn.transform([query_joined])
        similarities = cosine_similarity(query_vec, tfidf_matrix_cn)[0]
    else:
        query_vec = tfidf_vectorizer_en.transform([query])
        similarities = cosine_similarity(query_vec, tfidf_matrix_en)[0]

    # --- TF-IDF Ranking Map (doc_id → rank) ---
    ranked_indices = np.argsort(similarities)[::-1]
    tfidf_rank_map = {int(idx): rank for rank, idx in enumerate(ranked_indices[:top_k], start=1)}

    # --- Semantic Retrieval (Pinecone) ---
    query_vec_dense = model.encode(query).tolist()
    pinecone_results = index.query(vector=query_vec_dense, top_k=top_k * 2, include_metadata=False)

    pinecone_ids = [int(match['id'].split('-')[1]) for match in pinecone_results['matches']]
    pinecone_scores = [match['score'] for match in pinecone_results['matches']]

    # Convert scores to semantic ranks (lower = better)
    semantic_ranks = scores_to_ranking(pinecone_scores)
    semantic_rank_map = {idx: rank for idx, rank in zip(pinecone_ids, semantic_ranks)}

    # --- Reciprocal Rank Fusion (RRF) ---
    combined_scores = {}
    for idx in pinecone_ids:
        rrf_score = rrf(
            keyword_rank=tfidf_rank_map.get(idx, top_k + 1),
            semantic_rank=semantic_rank_map.get(idx, top_k + 1)
        )
        combined_scores[idx] = rrf_score

    # Sort by final RRF score
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    return ranked[:top_k]

In [87]:
#Testing phase for TFIDF + BGE

results = hybrid_search_tfidf("女式夹克")

for idx, score in results:
    print(f"{score:.4f} | {chinese_embeddings['chinese translation'][idx]}")

0.0313 | 韩制。粗肩带交叉背心
0.0308 | 学院感百褶裤裙
0.0305 | 現貨-兩件式套裝加厚螺紋外套+長裙 M-XXL
0.0303 | 🎀真皮女用小手提/斜背二用包🎀
0.0301 | 長版口袋開襟針織外套


3.4 TF-IDF Alone Chinese

In [None]:
def TFIDF(query, top_k=5):
    lang = detect(query)

    # --- TF-IDF Query Vectorization ---
    if lang in ['zh', 'zh-cn', 'cn']:
        tokens = list(jieba.cut(query))
        query_joined = " ".join(tokens)  # convert token list to string input
        query_vec = tfidf_vectorizer_cn.transform([query_joined])
        similarities = cosine_similarity(query_vec, tfidf_matrix_cn)[0]
    else:
        query_vec = tfidf_vectorizer_en.transform([query])
        similarities = cosine_similarity(query_vec, tfidf_matrix_en)[0]

    # --- TF-IDF Ranking Map (doc_id → rank) ---
    ranked_indices = np.argsort(similarities)[::-1]
    tfidf_rank_map = {int(idx): rank for rank, idx in enumerate(ranked_indices[:top_k], start=1)}
    
    return tfidf_rank_map

In [95]:
#Testing phase for TFIDF + BGE

results = TFIDF("女式夹克")

for idx, score in results.items():
    print(f"{score:.4f} | {chinese_embeddings['chinese translation'][idx]}")

1.0000 | PolarStar 女 排汗快干T恤『黑』P18102
2.0000 | ALPINE PARTY PLUG PRO 頂級 音樂耳塞 聲音濾波器 荷蘭進口 20816
3.0000 | 時尚簡約實用抱枕109 靠墊 沙發裝飾靠枕
4.0000 | 寶寶外套 毛圈拉鍊休閒外套夾克 UG13220 好娃娃
5.0000 | 自強牌 A480 資料夾 / 箱
