
# 📊 Offline Search Evaluation on Amazon ESCI

This notebook implements an **offline evaluation prototype** for e-commerce search using the **Amazon ESCI** dataset.
- Dataset: [amazon-science/esci-data](https://github.com/amazon-science/esci-data)
- Algorithms: BM25 (keyword-based) and SBERT+FAISS (semantic)
- Metrics: nDCG@K, MAP@K, MRR, Precision/Recall@K


In [None]:
!apt install libomp-dev

In [None]:
!pip install faiss-gpu-cu12

In [None]:
# Step 1: Import Libraries
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import faiss
import random
from tqdm import tqdm
import time
from typing import List, Dict, Tuple
import gc

# Seed every random number generator this notebook imports so repeated runs are comparable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)  # torch is imported above but was previously left unseeded
np.random.seed(SEED)

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
# Report the name of GPU 0 only when we actually ended up on CUDA
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
import re

# Define text cleaning function
def clean_text(text):
    """
    Clean text by removing HTML tags, emojis, and special characters.

    Processing order: strip HTML markup, replace runs of non-ASCII characters
    with a space, delete every character that is not a letter, digit,
    whitespace or colon, then collapse whitespace.

    Args:
        text: Input raw text (string). ``None``, NaN and the empty string
            are treated as missing.
    Returns:
        Cleaned text; "" for missing input.
    """
    # Missing values: None, float NaN (the only value where x != x) or empty string
    if text is None or (isinstance(text, float) and text != text) or text == "":
        return ""

    # Remove HTML tags. The separator keeps the text of adjacent elements apart;
    # without it "<li>red</li><li>shoe</li>" collapses into the single token "redshoe".
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove emojis and non-alphanumeric characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Non-ASCII runs (emojis etc.) become one space
    text = re.sub(r'[^a-zA-Z0-9\s:]', '', text)  # Retain alphabetical, numerical, whitespace and colon characters

    # Remove extra spaces (also absorbs the separators inserted above)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
import pandas as pd
# Load the ESCI query/product judgments and the product catalogue
df_examples = pd.read_parquet('gs://chanderiyer/datasets/esci_shopping/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('gs://chanderiyer/datasets/esci_shopping/shopping_queries_dataset_products.parquet')

# Clean the two free-text fields; missing values become empty strings
df_products["product_description"] = df_products["product_description"].apply(lambda x: clean_text(x) if not pd.isnull(x) else "")
df_products["product_bullet_point"] = df_products["product_bullet_point"].apply(lambda x: clean_text(x) if not pd.isnull(x) else "")

# Make every text column a plain string so the concatenations below never see NaN
for c in ["product_title","product_description","product_bullet_point","product_brand"]:
    if c in df_products.columns:
        df_products[c] = df_products[c].fillna("").astype(str)

# Combine title, bullet_point, description
df_products["product_full_description"] = df_products["product_title"] + " " + df_products["product_bullet_point"] + " " + df_products["product_description"]

# Final document text = full description followed by the brand
df_products["doc_text"] = df_products[["product_full_description","product_brand"]].agg(" ".join, axis=1)
df_products = df_products[["product_id","doc_text", "product_locale"]].drop_duplicates()

# Attach the document text to each judgment; products are keyed by (locale, id)
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='inner',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

# Keep the small-version, US-locale training split
is_train_subset = (
    (df_examples_products["small_version"] == 1)
    & (df_examples_products["product_locale"] == "us")
    & (df_examples_products["split"] == "train")
)
# .copy() detaches the subset from df_examples_products so that columns added to
# train_df later are written to an independent frame (no SettingWithCopyWarning).
train_df = df_examples_products[is_train_subset].copy()
train_df.head(n=30)

In [None]:
# Step 3: Map ESCI Labels to Graded Relevance
# ESCI label -> graded relevance (higher is more relevant)
label_mapping = {
    'E': 3,  # Exact
    'S': 2,  # Substitute
    'C': 1,  # Complement
    'I': 0   # Irrelevant
}

relevance_score = train_df['esci_label'].map(label_mapping)

# .map() yields NaN for any label missing from the mapping; fail loudly instead of
# letting NaN relevance values leak into the metrics.
if relevance_score.isna().any():
    unexpected_labels = sorted(train_df.loc[relevance_score.isna(), 'esci_label'].astype(str).unique())
    raise ValueError(f"Unexpected esci_label values: {unexpected_labels}")

# assign() returns a new frame, so this is safe even when train_df is a filtered
# view of another DataFrame (a plain column assignment would warn in that case).
train_df = train_df.assign(relevance_score=relevance_score)

In [None]:
# Step 4: Prepare Data for Embedding
# Get unique queries and products
unique_queries = train_df['query'].unique()

# Index only products from the locale(s) present in the evaluation data. Products are
# keyed by (product_locale, product_id), so the same product_id can appear in several
# locales with different text; mixing locales would both pollute the candidate pool
# and make the product_id -> row mapping built later ambiguous.
train_locales = train_df['product_locale'].unique()
unique_products = (
    df_products.loc[df_products['product_locale'].isin(train_locales), ['product_id', 'doc_text']]
    .drop_duplicates(subset='product_id')  # exactly one row per product_id
    .reset_index(drop=True)                # row position == index position
)

print(f"Number of unique queries: {len(unique_queries)}")
print(f"Number of unique products: {len(unique_products)}")

In [None]:
# Step 5: Load Model on GPU
model_name = "sentence-transformers/all-mpnet-base-v2"
# Instantiate the encoder and move its weights to the selected device in one expression
model = SentenceTransformer(model_name).to(device)
# Switch to evaluation mode; as the last expression of the cell its return value is echoed
model.eval()

In [None]:
# Step 6: Generate Query Embeddings on GPU
def encode_texts_gpu(texts: List[str], model: SentenceTransformer, batch_size: int = 256) -> np.ndarray:
    """Encode texts in batches, using CUDA mixed precision when a GPU is available.

    Args:
        texts: Strings to embed.
        model: Sentence-transformers encoder used to produce the embeddings.
        batch_size: Number of texts handed to the model per forward pass.

    Returns:
        float32 array with one row per input text. Empty input yields an
        array of shape (0, embedding_dim).
    """
    # np.vstack([]) raises, so return a correctly shaped empty matrix instead
    if len(texts) == 0:
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype=np.float32)

    use_cuda = torch.cuda.is_available()
    embeddings = []

    # Only enable autocast when CUDA exists; requesting it on a CPU-only machine
    # just produces a warning and is disabled anyway.
    with torch.amp.autocast('cuda', enabled=use_cuda):
        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
            batch = texts[i:i + batch_size]
            batch_embeddings = model.encode(
                batch,
                # encode() has its own batch_size default; without forwarding ours
                # each chunk would be re-split into smaller internal batches.
                batch_size=batch_size,
                convert_to_tensor=True,
                show_progress_bar=False
            )
            # .float() pins the result to float32, which FAISS expects downstream
            embeddings.append(batch_embeddings.float().cpu().numpy())

            # Clear GPU cache periodically (every 10 batches)
            if use_cuda and i % (batch_size * 10) == 0:
                torch.cuda.empty_cache()

    return np.vstack(embeddings)

# Generate embeddings
# Embed every distinct query string
print("Generating query embeddings...")
query_embeddings = encode_texts_gpu(texts=unique_queries.tolist(), model=model)

# Embed the document text of every catalogue product
print("Generating product embeddings...")
product_titles = list(unique_products['doc_text'])
product_embeddings = encode_texts_gpu(texts=product_titles, model=model)

# Lookup tables: query string / product id -> row position in the embedding matrices
query_to_idx = dict(zip(unique_queries, range(len(unique_queries))))
product_to_idx = dict(zip(unique_products['product_id'].values, range(len(unique_products))))

In [None]:
# Step 7: Create GPU FAISS Index
def create_gpu_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    """Create a GPU inner-product FAISS index over L2-normalised embeddings.

    With unit-length vectors the inner product equals cosine similarity.

    Args:
        embeddings: 2-D array with one row per item. It is not modified.

    Returns:
        A FAISS index on GPU 0 holding the normalised vectors.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    if embeddings.ndim != 2:
        raise ValueError(f"embeddings must be 2-D, got shape {embeddings.shape}")

    # Work on a private float32, C-contiguous copy: normalize_L2 operates in place,
    # so normalising the argument directly would silently rewrite the caller's
    # array and would be applied before the float32 conversion.
    vectors = np.array(embeddings, dtype=np.float32, order='C', copy=True)

    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(vectors)

    # Create CPU index
    cpu_index = faiss.IndexFlatIP(vectors.shape[1])

    # Move to GPU (device 0)
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)

    # Add embeddings to index
    gpu_index.add(vectors)

    return gpu_index

# Create GPU index
# Builds the index over all product embeddings via create_gpu_faiss_index.
print("Creating GPU FAISS index...")
gpu_index = create_gpu_faiss_index(product_embeddings)

In [None]:
# Step 8: Perform Retrieval for Each Query
def retrieve_top_k(query_embeddings: np.ndarray, index: faiss.Index, k: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """Retrieve the top-k index entries for each query embedding.

    Args:
        query_embeddings: 2-D array with one row per query. It is not modified.
        index: FAISS index to search.
        k: Number of neighbours to return per query.

    Returns:
        ``(similarities, indices)`` exactly as returned by ``index.search``,
        one row per query and ``k`` columns each.
    """
    # One float32, C-contiguous copy serves both purposes: it protects the caller's
    # array from the in-place normalisation and guarantees the dtype before
    # normalize_L2 runs (previously the cast happened only afterwards, on a second copy).
    query_embeddings_normalized = np.array(query_embeddings, dtype=np.float32, order='C', copy=True)
    faiss.normalize_L2(query_embeddings_normalized)

    # Search
    similarities, indices = index.search(query_embeddings_normalized, k)

    return similarities, indices

# Retrieve top-100 products for each query
# `similarities` and `indices` are the two arrays returned by retrieve_top_k.
print("Performing retrieval...")
k = 100
similarities, indices = retrieve_top_k(query_embeddings, gpu_index, k)

In [None]:
# Step 9: Compute Evaluation Metrics
def compute_graded_metrics(train_df: pd.DataFrame, query_to_idx: Dict, product_to_idx: Dict,
                          indices: np.ndarray, unique_queries: np.ndarray,
                          unique_products: pd.DataFrame, k: int = 100) -> Dict:
    """Compute ranking metrics at cutoff ``k``, averaged over all queries.

    An item counts as relevant when its relevance score is > 0. Retrieved
    items without a judgment for the query are scored 0.

    Args:
        train_df: Judgments with columns ``query``, ``product_id`` and
            ``relevance_score``.
        query_to_idx: Unused; kept so existing callers keep working.
        product_to_idx: Maps product_id to its row position in the index.
        indices: Retrieved index positions, one row per entry of
            ``unique_queries`` (row i belongs to ``unique_queries[i]``).
        unique_queries: Query strings in the same order as ``indices`` rows.
        unique_products: Unused; kept so existing callers keep working.
        k: Rank cutoff. Precision is always divided by ``k`` even if fewer
            than ``k`` columns are available in ``indices``.

    Returns:
        Dict with keys ``avg_precision_at_k``, ``avg_recall_at_k``,
        ``avg_f1_at_k``, ``avg_map_scores``, ``avg_ndcg_scores`` and
        ``avg_mrr_scores``. Values are NaN when there are no queries.
        Average precision is normalised by the total number of relevant
        judged items for the query; nDCG uses linear gain.
    """
    # NOTE: a stray 'que' literal used to sit in front of the first key; Python
    # concatenated it into 'queprecision_at_k', so the append below raised KeyError.
    metrics = {
        'precision_at_k': [],
        'recall_at_k': [],
        'f1_at_k': [],
        'map_scores': [],
        'ndcg_scores': [],
        'mrr_scores': []
    }

    # Group the judgments by query in a single pass instead of filtering the whole
    # frame once per query. Later rows overwrite earlier ones for the same product.
    ground_truth_by_query = {}
    for judged_query, product_id, score in zip(
        train_df['query'], train_df['product_id'], train_df['relevance_score']
    ):
        product_idx = product_to_idx.get(product_id)
        if product_idx is not None:
            ground_truth_by_query.setdefault(judged_query, {})[product_idx] = score

    for query_idx, query in enumerate(tqdm(unique_queries, desc="Computing metrics")):
        ground_truth = ground_truth_by_query.get(query, {})

        # Relevance of each retrieved item, in rank order (unjudged -> 0)
        retrieved_indices = indices[query_idx][:k]
        relevance_scores = np.array(
            [ground_truth.get(idx, 0) for idx in retrieved_indices], dtype=float
        )
        is_relevant = relevance_scores > 0

        # Precision@k (binary: relevant if score > 0)
        num_relevant_retrieved = int(np.sum(is_relevant))
        precision = num_relevant_retrieved / k if k > 0 else 0

        # Recall@k
        total_relevant = sum(1 for score in ground_truth.values() if score > 0)
        recall = num_relevant_retrieved / total_relevant if total_relevant > 0 else 0

        # F1@k
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Average Precision: mean of precision at each relevant rank
        ap = 0
        num_relevant_seen = 0
        for rank, hit in enumerate(is_relevant, start=1):
            if hit:
                num_relevant_seen += 1
                ap += num_relevant_seen / rank
        ap = ap / total_relevant if total_relevant > 0 else 0

        # NDCG@k. Discounts are sized from the scores actually available so a
        # cutoff larger than the retrieved width does not cause a shape mismatch.
        discounts = np.log2(np.arange(2, len(relevance_scores) + 2))
        dcg = np.sum(relevance_scores / discounts)
        ideal_scores = np.array(
            sorted((s for s in ground_truth.values() if s > 0), reverse=True)[:k], dtype=float
        )
        idcg = np.sum(ideal_scores / np.log2(np.arange(2, len(ideal_scores) + 2)))
        ndcg = dcg / idcg if idcg > 0 else 0

        # MRR: reciprocal rank of the first relevant item, 0 if none
        hit_positions = np.flatnonzero(is_relevant)
        mrr = 1 / (hit_positions[0] + 1) if hit_positions.size else 0

        metrics['precision_at_k'].append(precision)
        metrics['recall_at_k'].append(recall)
        metrics['f1_at_k'].append(f1)
        metrics['map_scores'].append(ap)
        metrics['ndcg_scores'].append(ndcg)
        metrics['mrr_scores'].append(mrr)

    # Compute averages (NaN, without a numpy warning, when there were no queries)
    avg_metrics = {
        f'avg_{key}': np.mean(values) if values else float('nan')
        for key, values in metrics.items()
    }

    return avg_metrics

# Compute metrics
# Note the cutoff here is k=20; compute_graded_metrics slices each row of `indices` to the first k columns.
print("Computing evaluation metrics...")
metrics = compute_graded_metrics(
    train_df, query_to_idx, product_to_idx, indices,
    unique_queries, unique_products, k=20
)

In [None]:
# Step 10: Display Results
print("\n=== Evaluation Results ===")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# Step 11: Save Results
# Number of hits stored per query. The column names below say "top_20", so keep
# exactly 20 (the previous slice of 100 contradicted the column names).
RESULTS_TOP_N = 20

# Plain arrays allow one fancy-indexing call per query instead of a separate
# DataFrame .iloc row lookup for every single retrieved item.
product_id_lookup = unique_products['product_id'].to_numpy()
doc_text_lookup = unique_products['doc_text'].to_numpy()
top_indices = indices[:len(unique_queries), :RESULTS_TOP_N]
top_similarities = similarities[:len(unique_queries), :RESULTS_TOP_N]

results_df = pd.DataFrame({
    'query': unique_queries,
    'top_20_products': [product_id_lookup[row].tolist() for row in top_indices],
    'top_20_titles': [doc_text_lookup[row].tolist() for row in top_indices],
    'similarities': [row.tolist() for row in top_similarities]
})

In [None]:
# Save the per-query retrieval results to TSV
results_df.to_csv('gpu_esci_results.tsv', sep='\t', index=False)
print("\nResults saved to gpu_esci_results.tsv")

# Save the averaged metrics as a single-row TSV
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv('evaluation_metrics.tsv', sep='\t', index=False)
print("Metrics saved to evaluation_metrics.tsv")

# Step 12: Memory Cleanup
# Deleting these names means this cell cannot be re-run without re-creating them.
del model
del gpu_index
# Collect garbage first so objects kept alive only by reference cycles are freed,
# then release the now-unused cached GPU blocks.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Upload a tab-separated dump of results_df to Google Cloud Storage.
from google.cloud import storage

bucket_name = 'chanderiyer'
destination_blob_name = 'output/metrics/all_mpnet_base_query_eval_metrics.csv'
# Only used in the log message below; the visible code never writes a local file with this name.
source_file_name = 'all_mpnet_base_query_eval_metrics.csv'

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)

# NOTE(review): the payload is results_df (per-query retrieval output), serialised with
# tab separators, yet the blob lives under output/metrics/, ends in .csv and is uploaded
# as text/csv — confirm whether metrics_df and/or a TSV content type was intended.
# NOTE(review): the variable name mentions "minilml6" while model_name is all-mpnet-base-v2;
# this looks like a leftover from another notebook.
minilml6_query_eval_metrics = results_df.to_csv(sep = '\t', index=False)
blob.upload_from_string(minilml6_query_eval_metrics, content_type='text/csv')

print(f"Eval Metrics File {source_file_name} with model {model_name} uploaded to gs://{bucket_name}/{destination_blob_name}")