In [None]:
# Import necessary libraries
import concurrent.futures
import math
import pandas as pd
import requests
import time

# --- Evaluation configuration ---

# Search service under test.
API_ENDPOINT = "http://localhost:30200/search"
ALGO = "COMMERCE_AI_SEARCH.latest"  # sent as the "algo" request parameter
INDEX = "PRODUCT_ESCI"  # sent as the "searchIndex" request parameter

# Column name from the dataset containing the queries.
QUERY_PARAM = "query"

# Number of top results to fetch from the API (k for NDCG@k and MRR@k).
TOP_K = 5

# Execution settings.
MAX_WORKERS = 4  # number of parallel workers for REST queries
NUM_RETRIES = 5  # maximum number of fetch rounds per evaluation run

# Metrics calculation functions

def calculate_precision_recall_f1_score(predicted, true_labels, k):
    """
    Calculate set-based precision, recall and F1 for the top-k predictions.

    Args:
        predicted (list): Ranked predicted ids; only the first k are used.
        true_labels (list): Ground truth (relevant) ids.
        k (int): Number of top results (k).

    Returns:
        dict: Keys "precision", "recall" and "f1_score".
    """
    # Both sides are turned into sets, so a repeated id counts only once.
    retrieved = set(predicted[:k])
    relevant = set(true_labels)
    overlap = len(retrieved & relevant)

    precision = overlap / len(retrieved) if retrieved else 0
    recall = overlap / len(relevant) if relevant else 0

    denominator = precision + recall
    f1_score = 0 if denominator == 0 else 2 * (precision * recall) / denominator

    return {"precision": precision, "recall": recall, "f1_score": f1_score}

def calculate_graded_ndcg_at_k(predicted, ground_truth, labels, graded_ground_truth, true_labels, k):
    """
    Calculate graded Normalized Discounted Cumulative Gain (NDCG) @ k.

    The gain of a predicted item is the relevance grade of that item itself,
    looked up through the ``ground_truth``/``labels`` pairing, not the grade
    that happens to sit at the same list position as the prediction's rank.

    Args:
        predicted (list): Ranked predicted ids; only the first k are scored.
        ground_truth (list): Ground truth ids, aligned index-by-index with ``labels``.
        labels (list): Relevance grade of each id in ``ground_truth``.
        graded_ground_truth (list): Ground truth ids of the ideal ranking; only
            its length is used, to cap the ideal ranking.
        true_labels (list): Relevance grades used to build the ideal ranking
            (sorted in descending order here, so any input order is accepted).
        k (int): Number of top results (k).

    Returns:
        float: Graded NDCG@k, or 0.0 when the ideal DCG is not positive.
    """
    # Grade per id; if an id is listed more than once, keep its highest grade.
    gain_by_id = {}
    for item, gain in zip(ground_truth, labels):
        if item not in gain_by_id or gain > gain_by_id[item]:
            gain_by_id[item] = gain

    # Ideal ranking: the best grades first, capped at k and at the number of
    # ground truth items.
    cutoff = max(0, min(k, len(graded_ground_truth)))
    ideal_gains = sorted(true_labels, reverse=True)[:cutoff]
    idcg = sum(gain / math.log2(rank + 2) for rank, gain in enumerate(ideal_gains))
    if idcg <= 0:
        return 0.0

    dcg = 0.0
    credited = set()
    for rank, item in enumerate(predicted[:k]):
        # A repeated id keeps its rank slot but earns no extra gain; crediting
        # it again could push the score above 1.
        if item in credited:
            continue
        credited.add(item)
        dcg += gain_by_id.get(item, 0) / math.log2(rank + 2)

    return dcg / idcg

def calculate_ndcg_at_k(predicted, ground_truth, k):
    """
    Calculate Normalized Discounted Cumulative Gain (NDCG) @ k.

    Relevance is binary: a predicted item gains 1 when it appears in the
    ground truth. Each relevant id is credited at most once, so a result list
    that repeats the same id cannot score above 1.

    Args:
        predicted (list): List of predicted labels.
        ground_truth (list): List of ground truth labels.
        k (int): Number of top results (k).

    Returns:
        float: NDCG@k, or 0.0 when there is no ground truth or k is not positive.
    """
    relevant = set(ground_truth)  # O(1) membership; duplicates collapse
    idcg = sum(1 / math.log2(rank + 2) for rank in range(min(k, len(relevant))))
    if idcg <= 0:
        return 0.0

    dcg = 0.0
    credited = set()
    for rank, item in enumerate(predicted[:k]):
        # A repeated id keeps its rank slot but is only rewarded the first time.
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += 1 / math.log2(rank + 2)

    return dcg / idcg


def calculate_map_at_k(predicted, ground_truth, k):
    """
    Calculate Average Precision @ k for a single query.

    Precision is accumulated at every rank that holds a relevant item and the
    sum is divided by the number of distinct ground truth items. Each relevant
    id is credited at most once, so repeated ids in the result list cannot
    inflate the score above 1.

    Args:
        predicted (list): Ranked predicted ids; only the first k are scored.
        ground_truth (list): Ground truth (relevant) ids.
        k (int): Number of top results (k).

    Returns:
        float: AP@k, or 0.0 when there is no ground truth.
    """
    relevant = set(ground_truth)
    if not relevant:
        return 0.0

    hits = 0
    precision_sum = 0.0
    credited = set()
    for rank, item in enumerate(predicted[:k], start=1):
        # A repeated id keeps its rank slot but counts as a hit only once.
        if item in relevant and item not in credited:
            credited.add(item)
            hits += 1
            precision_sum += hits / rank

    return precision_sum / len(relevant)

def calculate_mrr_at_k(predicted, ground_truth, k):
    """
    Calculate the reciprocal rank of the first relevant result within the top k.

    Args:
        predicted (list): Ranked predicted ids; only the first k are inspected.
        ground_truth (list): Ground truth (relevant) ids.
        k (int): Number of top results (k).

    Returns:
        float: 1 / rank of the first relevant item, or 0 when none is found.
    """
    reciprocal_ranks = (
        1 / position
        for position, candidate in enumerate(predicted[:k], start=1)
        if candidate in ground_truth
    )
    # The generator is lazy, so only the first match (if any) is evaluated.
    return next(reciprocal_ranks, 0)

def sanitize_query(query):
    """
    Remove leading and trailing whitespace and special characters from a query.

    Whitespace and special characters are stripped repeatedly until neither is
    left at either end, so mixed runs such as ``" #foo# "`` or ``"( #foo )"``
    are fully cleaned. Characters inside the query are left untouched.

    Args:
        query (str): Raw search query.

    Returns:
        str: The query without surrounding whitespace or special characters.
    """
    special_chars = '#?\'"()*+,-/'
    previous = None
    # Each pass can only shorten the string, so this loop always terminates.
    while previous != query:
        previous = query
        query = query.strip().strip(special_chars)
    return query

# Updated fetch API results
def fetch_results(query, timeout=30):
    """
    Fetch the ranked product parent ids for one query from the search API.

    Failures never propagate to the caller: every exception and every non-200
    response is reported through a non-empty "error" entry, so the caller can
    tell a failed query apart from a query that returned no products.

    Args:
        query (str): Search query to send to API.
        timeout (float): Seconds to wait for the API before giving up. Without
            a timeout a stalled connection would block its worker indefinitely.

    Returns:
        dict: {"query": query, "results": [parent ids]} on success, or
        {"query": query, "results": [], "error": message} on failure.
    """
    try:
        # Sanitizing happens inside the try so a malformed query (e.g. a
        # non-string value) becomes an error record instead of an exception.
        params = {
            "query": sanitize_query(query),
            "algo": ALGO,
            "limit": TOP_K,
            "noAggs": "true",
            "searchIndex": INDEX
        }
        response = requests.get(API_ENDPOINT, params=params, timeout=timeout)
        if response.status_code != 200:
            return {"query": query, "results": [], "error": f"HTTP {response.status_code}"}

        products = response.json().get("products", [])
        product_ids = [product["parent_id"] for product in products]
        return {"query": query, "results": product_ids}
    except Exception as e:
        # Deliberately broad: this is the per-query boundary. The exception
        # type is included so the message is never empty (an empty "error"
        # would read as success to a truthiness check).
        return {"query": query, "results": [], "error": f"{type(e).__name__}: {e}"}

# Parallelized version to fetch results concurrently for all queries
def fetch_results_parallel(queries):
    """
    Perform parallel fetching of product results for the given queries using ThreadPoolExecutor.

    Every submitted query yields exactly one entry in the returned list. If a
    worker raises, the query is reported with an "error" entry instead of
    being dropped.

    Args:
        queries (list): List of queries to execute.

    Returns:
        list: A list of dictionaries containing query results and errors (if any),
        in completion order (not submission order).
    """
    results = []
    print(f"Starting parallel execution with {MAX_WORKERS} workers...")

    start_time = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_query = {executor.submit(fetch_results, query): query for query in queries}
        completed = concurrent.futures.as_completed(future_to_query)
        for completed_queries, future in enumerate(completed, start=1):
            try:
                results.append(future.result())
            except Exception as e:
                query = future_to_query[future]
                print(f"Error during query execution '{query}': {str(e)}")
                # Keep the failed query in the output so it is not silently lost.
                results.append({"query": query, "results": [], "error": f"{type(e).__name__}: {e}"})
            # Progress report every 200 completed queries.
            if (completed_queries % 200) == 0:
                elapsed_time = time.perf_counter() - start_time
                print(f"200 Queries processed with total queries processed size : {completed_queries} with processed time taken : {elapsed_time:.2f} seconds")

    print(f"Completed parallel execution for {len(queries)} queries.")
    return results

# Per-query ground truth lookup, built once before any REST call is made
def _build_ground_truth_index(df_train_small_us):
    """
    Group the dataset by query and precompute the ground truth of each query.

    Args:
        df_train_small_us (pd.DataFrame): DataFrame containing queries and ground truth.

    Returns:
        dict: Maps each query to a dict with "query_id", "product_ids" and
        "labels" (rows ordered by example_id) and "graded_product_ids" and
        "graded_labels" (rows ordered by descending esci_graded_label).
        Queries appear in order of first occurrence; rows with a missing
        query are skipped by the groupby.
    """
    ground_truth_by_query = {}
    for query, df_query in df_train_small_us.groupby(QUERY_PARAM, sort=False):
        df_by_example = df_query.sort_values(
            by=['example_id', 'query_id', 'esci_graded_label'], ascending=[True, True, False]
        )
        df_by_grade = df_by_example.sort_values(
            by=['query_id', 'esci_graded_label', 'example_id'], ascending=[True, False, True]
        )
        ground_truth_by_query[query] = {
            "query_id": df_by_example['query_id'].unique()[0],
            "product_ids": df_by_example['product_id'].tolist(),
            "labels": df_by_example['esci_graded_label'].tolist(),
            "graded_product_ids": df_by_grade['product_id'].tolist(),
            "graded_labels": df_by_grade['esci_graded_label'].tolist(),
        }
    return ground_truth_by_query


# Main evaluation loop with parallelized REST queries
def evaluate_queries_parallel(df_train_small_us):
    """
    Evaluate the queries from a dataset using API results and calculate metrics parallelly.

    Queries are fetched in up to NUM_RETRIES rounds; only queries whose fetch
    reported an error are sent again in the next round. A query that still
    fails in the last round is recorded with its error message in "predicted"
    and -1.0 in every metric column.

    Args:
        df_train_small_us (pd.DataFrame): DataFrame containing queries and ground truth.

    Returns:
         pd.DataFrame: DataFrame containing evaluation metrics for all queries,
         sorted by query_id (empty, with the result columns, when no query was evaluated).
    """
    result_columns = [
        "query_id", "query", "ground_truth", "predicted",
        "precision", "recall", "f1_score",
        "ndcg@k", "graded_ndcg@k", "map@k", "mrr@k",
    ]
    results = []

    # Group once up front instead of filtering and sorting the whole
    # DataFrame again for every query result in every retry round.
    ground_truth_by_query = _build_ground_truth_index(df_train_small_us)
    queries_to_execute = list(ground_truth_by_query)

    empty_result_cnt = 0
    error_cnt = 0
    start_time = time.perf_counter()
    for i in range(NUM_RETRIES):
        print(f"Total number of queries to execute {len(queries_to_execute)} at iteration {i}.")
        if len(queries_to_execute) == 0:
            break
        # Perform parallel query execution
        parallel_results = fetch_results_parallel(queries_to_execute)

        queries_to_execute = []
        for query_result in parallel_results:
            query = query_result.get("query")
            predicted_product_ids = query_result.get("results", [])
            error = query_result.get("error")
            truth = ground_truth_by_query[query]

            if error:
                if i == (NUM_RETRIES - 1):
                    # Out of retries: keep the query in the output, flagged
                    # with sentinel metrics.
                    results.append({
                        "query_id": truth["query_id"],
                        "query": query,
                        "ground_truth": truth["product_ids"],
                        "predicted": error,
                        "precision": -1.0,
                        "recall": -1.0,
                        "f1_score": -1.0,
                        "ndcg@k": -1.0,
                        "graded_ndcg@k": -1.0,
                        "map@k": -1.0,
                        "mrr@k": -1.0
                    })
                    error_cnt += 1
                else:
                    queries_to_execute.append(query)
                continue

            if not predicted_product_ids:
                empty_result_cnt += 1

            # Calculate metrics
            metrics = calculate_precision_recall_f1_score(predicted_product_ids, truth["product_ids"], TOP_K)
            ndcg_at_k = calculate_ndcg_at_k(predicted_product_ids, truth["product_ids"], TOP_K)
            graded_ndcg_at_k = calculate_graded_ndcg_at_k(
                predicted_product_ids,
                truth["product_ids"],
                truth["labels"],
                truth["graded_product_ids"],
                truth["graded_labels"],
                TOP_K,
            )
            map_at_k = calculate_map_at_k(predicted_product_ids, truth["product_ids"], TOP_K)
            mrr_at_k = calculate_mrr_at_k(predicted_product_ids, truth["product_ids"], TOP_K)

            # Add to results
            results.append({
                "query_id": truth["query_id"],
                "query": query,
                "ground_truth": truth["product_ids"],
                "predicted": predicted_product_ids,
                **metrics,
                "ndcg@k": ndcg_at_k,
                "graded_ndcg@k": graded_ndcg_at_k,
                "map@k": map_at_k,
                "mrr@k": mrr_at_k
            })
    print(f"Total Empty Results: {empty_result_cnt} and total Errors: {error_cnt}")
    elapsed_time = time.perf_counter() - start_time
    print(f"Parallel query execution completed in {elapsed_time:.2f} seconds.")

    # Return as DataFrame
    result_df = pd.DataFrame(results)
    if result_df.empty:
        # Without rows there is no 'query_id' column to sort by; return an
        # empty frame that still exposes the expected columns.
        return pd.DataFrame(columns=result_columns)
    return result_df.sort_values(by=['query_id'], ascending=[True])

# Data Loading
DATASET_DIR = '/Users/chiyer/misc/projects/honeyevaluation/datasets/golden/esci_shopping_queries'
OUTPUT_DIR = "/Users/chiyer/misc/projects/honeyevaluation/output"

df_examples = pd.read_parquet(f'{DATASET_DIR}/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet(f'{DATASET_DIR}/shopping_queries_dataset_products.parquet')
print(df_products.shape)

# Attach product attributes to every (query, product) example.
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='inner',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

# Keep the US-locale training rows of the small dataset version.
# .copy() detaches the selection from the merged frame so that the column
# assignment below writes to an independent DataFrame rather than a slice.
is_train_small_us = (
    (df_examples_products["split"] == 'train')
    & (df_examples_products["small_version"] == 1)
    & (df_examples_products["product_locale"] == 'us')
)
df_examples_products_train_small_us = df_examples_products[is_train_small_us].copy()

# Map the ESCI letter labels to numeric relevance grades.
ESCI_GRADED_MAP = {"E":3,"S":2,"C":1,"I":0}
df_examples_products_train_small_us['esci_graded_label'] = df_examples_products_train_small_us['esci_label'].map(ESCI_GRADED_MAP)
df_examples_products_train_small_us = df_examples_products_train_small_us[["example_id","query_id", "query", "product_id", "product_title", "esci_graded_label"]].drop_duplicates()
df_examples_products_train_small_us = df_examples_products_train_small_us[df_examples_products_train_small_us["esci_graded_label"] > 0]  #Filter out Irrelevant Labels (esci_graded_label > 0)
print(df_examples_products_train_small_us.shape)

# Evaluate queries with parallel REST calls
print("Running parallel evaluations on dataset...")
evaluation_results = evaluate_queries_parallel(df_examples_products_train_small_us)

# Persist the per-query metrics as a tab-separated file.
query_output_path = f"{OUTPUT_DIR}/{ALGO}_query_metrics_TOP{TOP_K}_v0.tsv"
evaluation_results.to_csv(query_output_path, sep='\t', index=False)
print(f"Evaluation results saved to {query_output_path}")

In [None]:
metric_columns = ['precision', 'recall', 'f1_score', 'ndcg@k', 'graded_ndcg@k', 'map@k', 'mrr@k']

# Queries that failed on every attempt are recorded with -1.0 in each metric
# column. Drop all of them (not only "HTTP 500" failures) so these sentinels
# do not distort the averages. .copy() makes the selection an independent
# frame that is safe to assign into.
is_error_row = pd.to_numeric(evaluation_results['precision'], errors='coerce') < 0
filtered_evaluation_results = evaluation_results[~is_error_row].copy()
for col in metric_columns:
    filtered_evaluation_results[col] = pd.to_numeric(filtered_evaluation_results[col], errors='coerce')

# Calculate the average of the metric columns
summary_metrics = filtered_evaluation_results[metric_columns].mean()
summary_output_path = f"/Users/chiyer/misc/projects/honeyevaluation/output/{ALGO}_summary_metrics_TOP{TOP_K}_v0.tsv"
# Write one row per metric with its name; dumping the Series with index=False
# would drop the metric names and leave an unlabeled column of numbers.
summary_table = summary_metrics.rename_axis('metric').reset_index(name='mean')
summary_table.to_csv(summary_output_path, sep='\t', index=False)
print(f"Evaluation results saved to {summary_output_path}")