
# 📊 Offline Search Evaluation on Amazon ESCI

This notebook implements an **offline evaluation prototype** for e-commerce search using the **Amazon ESCI** dataset.
- Dataset: [amazon-science/esci-data](https://github.com/amazon-science/esci-data)
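- Algorithms: BM25 (keyword-based) and SBERT+FAISS (semantic); the cells in this notebook score query–product pairs with a Cross-Encoder (`cross-encoder/nli-deberta-v3-base`)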
- Metrics: nDCG@K, MAP@K, MRR, Precision/Recall@K


In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sentence_transformers import CrossEncoder
import torch
import gc
from tqdm import tqdm
import time
from collections import defaultdict
from torch.amp import autocast
import warnings
warnings.filterwarnings('ignore')

# Choose the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if cuda_ready else torch.device('cpu')
print(f"Using device: {device}")

# When a GPU is present, report the name and total memory of device 0 (memory in units of 1e9 bytes).
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_total_bytes / 1e9:.2f} GB")

In [None]:
import re

# Define text cleaning function
def clean_text(text):
    """
    Clean text by removing HTML tags, emojis, and special characters.

    Processing order:
      1. Parse the input as HTML and keep only its text. Text from different
         elements is joined with a space so that words in adjacent tags
         (e.g. consecutive ``<li>`` items) are not fused into one token.
      2. Replace every run of non-ASCII characters (emojis, accented letters,
         non-breaking spaces, ...) with a single space.
      3. Delete every remaining character that is not an ASCII letter, a
         digit, whitespace, or a colon.
      4. Collapse runs of whitespace and strip both ends.

    Args:
        text: Input raw text (string). ``None`` is treated as empty text.
    Returns:
        Cleaned text.
    """
    if text is None:
        return ""

    # Remove HTML tags; the separator keeps element boundaries as word boundaries.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove emojis and non-alphanumeric characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Non-ASCII runs become a space
    text = re.sub(r'[^a-zA-Z0-9\s:]', '', text)  # Retain letters, digits, whitespace, and colons

    # Remove extra spaces (including those introduced by the two steps above)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
import pandas as pd
df_examples = pd.read_parquet('gs://chanderiyer/datasets/esci_shopping/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('gs://chanderiyer/datasets/esci_shopping/shopping_queries_dataset_products.parquet')

# Select the judgments that are actually evaluated (reduced "small" version, US locale,
# train split) BEFORE any text processing, so HTML cleaning is not spent on products
# that the inner join below would discard anyway.
# NOTE(review): assumes `small_version` and `split` are columns of the examples file,
# as in the public ESCI release - confirm against the parquet schema.
examples_us_train = df_examples[
    (df_examples["small_version"] == 1)
    & (df_examples["product_locale"] == "us")
    & (df_examples["split"] == "train")
]

# Only US products referenced by the selected judgments can survive the inner join.
# `df_products` itself is left untouched; the cleaned subset lives in `products_us`.
is_needed = (
    (df_products["product_locale"] == "us")
    & df_products["product_id"].isin(examples_us_train["product_id"])
)
products_us = df_products.loc[is_needed].copy()

# Strip HTML / non-ASCII noise from the two free-text fields; missing values become "".
for c in ["product_description", "product_bullet_point"]:
    products_us[c] = products_us[c].apply(lambda x: clean_text(x) if not pd.isnull(x) else "")

# Make every text field a plain string so the concatenations below cannot produce NaN.
for c in ["product_title", "product_description", "product_bullet_point", "product_brand"]:
    products_us[c] = products_us[c].fillna("").astype(str)

# Combine title, bullet_point, description; the document text additionally appends the brand.
products_us["product_full_description"] = (
    products_us["product_title"] + " " + products_us["product_bullet_point"] + " " + products_us["product_description"]
)
products_us["doc_text"] = products_us["product_full_description"] + " " + products_us["product_brand"]

# One row per judged (query, product) pair, carrying the product's text columns.
df_examples_products = pd.merge(
    examples_us_train,
    products_us,
    how='inner',
    left_on=['product_locale', 'product_id'],
    right_on=['product_locale', 'product_id']
)

In [None]:
# Keep just the fields the evaluation needs and drop exact duplicate rows.
pair_columns = ["query_id", "query", "product_id", "doc_text", "esci_label"]
train_df = df_examples_products.loc[:, pair_columns].drop_duplicates()
train_df.head(30)

In [None]:
# Step 2: Map ESCI labels to numerical scores
esci_to_score = {
    'E': 3,  # Exact
    'S': 2,  # Substitute
    'C': 1,  # Complement
    'I': 0   # Irrelevant
}

# `Series.map` yields NaN for any label missing from the mapping. A NaN relevance would
# silently count as "not relevant" in the binary metrics and turn nDCG into NaN, so
# stop here instead of carrying it forward.
relevance_scores = train_df['esci_label'].map(esci_to_score)
unmapped = relevance_scores.isna()
if unmapped.any():
    bad_labels = sorted(train_df.loc[unmapped, 'esci_label'].astype(str).unique())
    raise ValueError(f"esci_label values without a relevance score: {bad_labels}")

train_df['relevance_score'] = relevance_scores
train_df.shape

In [None]:
# def create_query_product_pairs_with_sampling(df_examples: pd.DataFrame,
#                                             df_products: pd.DataFrame,
#                                             sample_size: int = 1000,
#                                             random_state: int = 42) -> pd.DataFrame:
#     """
#     Create query-product pairs with random sampling of irrelevant products.

#     Args:
#         df_examples: DataFrame with existing query-product judgments
#         df_products: DataFrame with all products
#         sample_size: Number of irrelevant products to sample per query
#         random_state: Random seed for reproducibility
#     """
#     np.random.seed(random_state)

#     # Get all unique products
#     all_products = set(df_products['product_id'].unique())

#     # Group existing judgments by query
#     query_groups = df_examples.groupby('query_id')

#     # Prepare lists for efficient concatenation
#     new_pairs = []

#     for query_id, group in query_groups:
#         # Get labeled products for this query
#         labeled_products = set(group['product_id'].unique())

#         # Get unlabeled products
#         unlabeled_products = list(all_products - labeled_products)

#         # Sample unlabeled products
#         if len(unlabeled_products) > sample_size:
#             sampled_products = np.random.choice(
#                 unlabeled_products,
#                 size=sample_size,
#                 replace=False
#             )
#         else:
#             sampled_products = unlabeled_products

#         # Create irrelevant pairs
#         if len(sampled_products) > 0:
#             irrelevant_pairs = pd.DataFrame({
#                 'query_id': query_id,
#                 'product_id': sampled_products,
#                 'esci_label': 'I',
#                 'relevance_score': 0.0
#             })
#             new_pairs.append(irrelevant_pairs)

#     # Combine existing and new pairs
#     df_existing = df_examples.copy()

#     # Add relevance scores to existing pairs
#     label_to_score = {'E': 1.0, 'S': 0.7, 'C': 0.3, 'I': 0.0}
#     df_existing['relevance_score'] = df_existing['esci_label'].map(label_to_score)

#     # Concatenate all DataFrames
#     if new_pairs:
#         df_all_pairs = pd.concat([df_existing] + new_pairs, ignore_index=True)
#     else:
#         df_all_pairs = df_existing

#     # Clean up
#     del new_pairs
#     gc.collect()

#     return df_all_pairs

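# sample_size = 1000
# SEED = 42
# NOTE: `df_train_small_us` is not defined anywhere in this notebook; supply the filtered
# judgments frame before enabling this call. Arguments after a keyword argument must be
# passed by keyword as well.
# df_test_small_us_sampled = create_query_product_pairs_with_sampling(df_examples=df_train_small_us, df_products=df_products, sample_size=sample_size, random_state=SEED)
# df_test_small_us_sampled.shape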

In [None]:
# Step 3: Load the Cross-Encoder model with GPU optimization
print("\nLoading Cross-Encoder model...")
model = CrossEncoder('cross-encoder/nli-deberta-v3-base', device=device)

# `use_fp16` records whether the underlying network was converted to half precision;
# the scoring function reads it to decide whether to run under CUDA autocast.
use_fp16 = False
if torch.cuda.is_available() and hasattr(model.model, 'half'):
    try:
        model.model = model.model.half()
        use_fp16 = True
    except Exception as exc:
        # Catch only ordinary errors (not KeyboardInterrupt/SystemExit) and say why
        # the conversion was skipped before falling back to full precision.
        print(f"FP16 conversion failed: {exc!r}")

if use_fp16:
    print("Using FP16 (half precision) for faster inference")
else:
    print("Using FP32 (full precision)")

In [None]:
# Step 4: Prepare data for cross-encoder evaluation
print("\nPreparing query-product pairs...")

# Collapse the pair table to one row per (query_id, query). Each collected column
# becomes a Python list with one entry per judged product of that query; later
# cells index the three lists in parallel.
group_keys = ['query_id', 'query']
collected_columns = ['doc_text', 'product_id', 'relevance_score']
grouped_pairs = train_df.groupby(group_keys).agg(dict.fromkeys(collected_columns, list))
query_products = grouped_pairs.reset_index()

print(f"Number of unique queries: {len(query_products)}")

In [None]:
def _find_label_index(model, label_name):
    """Return the output column labelled `label_name` in the model config, or None if unknown."""
    config = getattr(getattr(model, "model", None), "config", None)
    id2label = getattr(config, "id2label", None) or {}
    for idx, label in id2label.items():
        if str(label).lower() == label_name:
            return int(idx)
    return None


def score_batch_gpu(model, pairs, batch_size=32, relevance_index=None):
    """Score query-product pairs in batches with GPU optimization.

    Args:
        model: Cross-encoder exposing ``predict(pairs, batch_size=..., apply_softmax=...)``.
        pairs: Sequence of ``(query, product_text)`` tuples.
        batch_size: Number of pairs sent to the model per call. It is also forwarded
            to ``predict`` so the model does not re-split the batch with its own default.
        relevance_index: Column of the model output used as the relevance score when
            the output is 2-D. When ``None``, the column labelled ``"entailment"`` in
            ``model.model.config.id2label`` is used rather than a hard-coded position,
            because the class order is model-specific.

    Returns:
        1-D NumPy array with one raw (un-softmaxed) score per input pair, in input order.

    Raises:
        ValueError: If the model returns one score per class, no ``relevance_index``
            was given, and no ``"entailment"`` label can be found in the model config.
    """
    if relevance_index is None:
        relevance_index = _find_label_index(model, "entailment")

    scores = []

    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]

        with torch.no_grad():
            if use_fp16 and torch.cuda.is_available():
                with autocast('cuda'):
                    batch_logits = model.predict(batch, batch_size=batch_size, apply_softmax=False)
            else:
                batch_logits = model.predict(batch, batch_size=batch_size, apply_softmax=False)

        batch_logits = np.asarray(batch_logits)
        if batch_logits.ndim == 1:
            # Single-output models already return one score per pair.
            batch_relevance_scores = batch_logits
        else:
            if relevance_index is None:
                raise ValueError(
                    "Model returns one score per class but no 'entailment' label was found "
                    "in its config; pass relevance_index explicitly."
                )
            batch_relevance_scores = batch_logits[:, relevance_index]
        scores.extend(batch_relevance_scores)

        # Clear GPU cache periodically (every 10th batch)
        if i % (batch_size * 10) == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    return np.array(scores)

In [None]:
# Step 6: Evaluate with dynamic batch size optimization
print("\nPerforming cross-encoder evaluation with GPU optimization...")

# Dynamic batch size based on GPU memory
if torch.cuda.is_available():
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory: {gpu_memory:.2f} GB")
    if gpu_memory > 16:
        batch_size = 64
    elif gpu_memory > 8:
        batch_size = 32
    else:
        batch_size = 16
else:
    batch_size = 8
print(f"Using batch size: {batch_size}")

# Number of top-ranked products stored per query. Metrics computed later cannot use a
# cut-off deeper than this, because only these ranks are kept in `results`.
TOP_K = 20

results = []
total_time = 0.0

# Queries are walked in chunks so the GPU cache and Python garbage are cleared
# at a regular cadence rather than after every single query.
query_batch_size = 50
num_query_batches = (len(query_products) + query_batch_size - 1) // query_batch_size

for batch_idx in tqdm(range(num_query_batches), desc="Processing query batches"):
    start_idx = batch_idx * query_batch_size
    end_idx = min((batch_idx + 1) * query_batch_size, len(query_products))
    batch_queries = query_products.iloc[start_idx:end_idx]

    for row in batch_queries.itertuples(index=False):
        query = row.query
        query_id = row.query_id

        # Plain lists give consistent positional indexing for the parallel columns.
        products = list(row.doc_text)
        product_ids = list(row.product_id)
        true_scores = list(row.relevance_score)

        # Create query-product pairs
        pairs = [(query, product) for product in products]

        # Score all pairs; only the model call is timed.
        start_time = time.perf_counter()
        scores = score_batch_gpu(model, pairs, batch_size)
        scoring_time = time.perf_counter() - start_time
        total_time += scoring_time

        # Indices of the k highest scores, best first.
        k = min(TOP_K, len(products))
        top_k_indices = np.argsort(scores)[-k:][::-1]

        # Store results
        results.append({
            "query_id": query_id,
            "query": query,
            "num_products": len(products),
            "scoring_time": scoring_time,
            "top_k_product_ids": [product_ids[i] for i in top_k_indices],
            "top_k_scores": [scores[i] for i in top_k_indices],
            "top_k_relevance": [true_scores[i] for i in top_k_indices],
            "all_true_scores": true_scores
        })

    # Clear GPU cache after each chunk of queries
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

print(f"\nTotal scoring time: {total_time:.2f} seconds")
if results:
    print(f"Average time per query: {total_time/len(results):.4f} seconds")
else:
    # Avoid a ZeroDivisionError when the query table is empty.
    print("No queries were scored.")

In [None]:
# Preview the first 20 per-query result records as a table instead of dumping raw nested dicts.
pd.DataFrame(results[:20])

In [None]:
# Step 7: Compute evaluation metrics
print("\nComputing evaluation metrics...")

def compute_metrics(results, k=20):
    """Compute IR metrics per query and return their averages over all queries.

    Args:
        results: List of per-query dicts. Each must contain ``'top_k_relevance'``
            (graded relevance of the ranked products, best-ranked first) and
            ``'all_true_scores'`` (graded relevance of every judged product of the
            query). A grade greater than 0 counts as relevant for the binary metrics.
        k: Rank cut-off. Only the first ``k`` entries of ``'top_k_relevance'`` are
            used, so the effective cut-off cannot exceed the stored ranking length.

    Returns:
        Dict mapping ``'precision@k'``, ``'recall@k'``, ``'f1@k'``, ``'map@k'``,
        ``'ndcg@k'`` and ``'mrr'`` to their mean over ``results`` (empty dict when
        ``results`` is empty).

    Side effects:
        Each dict in ``results`` is updated in place with that query's own value
        under the same six keys.
    """
    def _dcg(relevances):
        # Exponential-gain DCG: (2^rel - 1) discounted by log2(rank + 1), ranks starting at 1.
        return sum((2**rel - 1) / np.log2(rank + 2) for rank, rel in enumerate(relevances))

    metrics = defaultdict(list)

    for result in results:
        top_k_relevance = result['top_k_relevance'][:k]
        all_relevance = result['all_true_scores']

        # Binary relevance (relevant if score > 0)
        top_k_binary = [1 if r > 0 else 0 for r in top_k_relevance]
        hits = sum(top_k_binary)
        total_relevant = sum(1 for r in all_relevance if r > 0)

        # Precision@k: the denominator is the number of ranks actually available (may be < k).
        precision = hits / len(top_k_binary) if top_k_binary else 0

        # Recall@k
        recall = hits / total_relevant if total_relevant > 0 else 0

        # F1@k
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        # AP@k: sum of precision at each relevant rank, normalised by the number of
        # relevant items that could have appeared in the top k. Normalising by the
        # number actually retrieved would not penalise relevant items that were missed.
        precision_sum = 0
        relevant_seen = 0
        for i, rel in enumerate(top_k_binary):
            if rel:
                relevant_seen += 1
                precision_sum += relevant_seen / (i + 1)
        ap_normaliser = min(k, total_relevant)
        map_score = precision_sum / ap_normaliser if ap_normaliser > 0 else 0

        # NDCG@k: DCG of the ranking divided by the DCG of the ideal ordering of all judged items.
        dcg = _dcg(top_k_relevance)
        idcg = _dcg(sorted(all_relevance, reverse=True)[:k])
        ndcg = dcg / idcg if idcg > 0 else 0

        # Reciprocal rank of the first relevant item (0 when none is in the top k).
        first_hit = next((i for i, rel in enumerate(top_k_binary) if rel), None)
        reciprocal_rank = 1 / (first_hit + 1) if first_hit is not None else 0

        metrics['precision@k'].append(precision)
        metrics['recall@k'].append(recall)
        metrics['f1@k'].append(f1)
        metrics['map@k'].append(map_score)
        metrics['ndcg@k'].append(ndcg)
        metrics['mrr'].append(reciprocal_rank)

        # Per-query values, stored on the result record itself.
        result['precision@k'] = precision
        result['recall@k'] = recall
        result['f1@k'] = f1
        result['ndcg@k'] = ndcg
        result['map@k'] = map_score
        result['mrr'] = reciprocal_rank

    # Average metrics
    avg_metrics = {metric: np.mean(scores) for metric, scores in metrics.items()}
    return avg_metrics

# Evaluate at a single rank cut-off; `all_metrics` is keyed by cut-off so that more
# values of k can be added alongside this one.
k_val = 20

print(f"\nMetrics for k={k_val}:")
metrics = compute_metrics(results, k_val)
all_metrics = {k_val: metrics}

# Report each averaged metric to four decimal places.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value:.4f}")

In [None]:
# Preview the first 20 per-query records (now carrying per-query metrics) as a table
# instead of dumping raw nested dicts.
pd.DataFrame(results[:20])

In [None]:
# Step 8: Save detailed results
print("\nSaving results...")

# Create detailed results DataFrame
# detailed_results = []
# for result in results[:100]:  # Save first 100 queries for inspection
#     for i, (product, score, relevance) in enumerate(zip(
#         result['top_k_product_ids'][:10],
#         result['top_k_scores'][:10],
#         result['top_k_relevance'][:10]
#     )):
#         detailed_results.append({
#             'query': result['query'],
#             'rank': i + 1,
#             'product_title': product,
#             'cross_encoder_score': score,
#             'true_relevance': relevance,
#             'scoring_time_ms': result['scoring_time'] * 1000
#         })

# detailed_df = pd.DataFrame(detailed_results)
# detailed_df.to_csv('cross_encoder_results_gpu.tsv', sep='\t', index=False)


# Save metrics summary
# Flatten {cut-off: {metric name: value}} into one record per (cut-off, metric) pair.
metrics_summary = [
    {'k': cutoff, 'metric': metric_name, 'value': metric_value}
    for cutoff, metrics_at_cutoff in all_metrics.items()
    for metric_name, metric_value in metrics_at_cutoff.items()
]

In [None]:
from google.cloud import storage

bucket_name = 'chanderiyer'
# The object names keep their historical ".csv" suffix so existing readers still find
# them, but the payload is tab-separated (see `_upload_frame_as_tsv`).
summary_destination_blob_name = 'output/metrics/debertav3base_summary_eval_metrics.csv'
query_destination_blob_name = 'output/metrics/debertav3base_query_eval_metrics.csv'

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)


def _upload_frame_as_tsv(frame, destination_blob_name):
    """Serialize `frame` as tab-separated text, upload it to `bucket`, and return its gs:// URI."""
    payload = frame.to_csv(sep='\t', index=False)
    blob = bucket.blob(destination_blob_name)
    # The content type matches the tab-separated payload actually written.
    blob.upload_from_string(payload, content_type='text/tab-separated-values')
    uri = f"gs://{bucket_name}/{destination_blob_name}"
    print(f"Uploaded {len(frame)} rows to {uri}")
    return uri


# Averaged metrics: one row per (k, metric).
metrics_df = pd.DataFrame(metrics_summary)
summary_uri = _upload_frame_as_tsv(metrics_df, summary_destination_blob_name)

# Per-query records: one row per query, including rankings and per-query metrics.
query_metrics_df = pd.DataFrame(results)
query_uri = _upload_frame_as_tsv(query_metrics_df, query_destination_blob_name)

print("\nResults saved to:")
print(f"- {query_uri} (detailed results)")
print(f"- {summary_uri} (metrics summary)")

# Performance comparison
print("\n" + "="*50)
print("PERFORMANCE SUMMARY")
print("="*50)
print(f"Total queries processed: {len(results)}")
print(f"Total time: {total_time:.2f} seconds")
# Guard both ratios so an empty or instantaneous run reports nothing instead of dividing by zero.
if results:
    print(f"Average time per query: {total_time/len(results):.4f} seconds")
if total_time > 0:
    print(f"Queries per second: {len(results)/total_time:.2f}")

if torch.cuda.is_available():
    print(f"\nGPU Memory Usage: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")