In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [2]:
# Function to lookup query matches
def get_exact_matches_for_(query):

    labels = label_df[label_df['query_id'] == query]
    
    return labels[labels['label'] == 'Exact']['product_id'].values

# Function to get tf-idf results
def get_tfidf_products(x):

    return cosine_similarity(vec.transform([x]), matrix).flatten().argsort()[-10:][::-1]

#define functions for evaluating retrieval performance
def map_at_k(true_ids, predicted_ids, k = 10):
    """
    Calculate the Mean Average Precision at K (MAP@K).

    Parameters:
    true_ids (list): List of relevant product IDs.
    predicted_ids (list): List of predicted product IDs.
    k (int): Number of top elements to consider.
             NOTE: IF you wish to change top k, please provide a justification for choosing the new value

    Returns:
    float: MAP@K score.
    """
    #if either list is empty, return 0
    if not len(true_ids) or not len(predicted_ids):
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p_id in enumerate(predicted_ids[:k]):
        if p_id in true_ids and p_id not in predicted_ids[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(true_ids), k)

# Function to execute bm search
def execute_bm_search(q):

    return np.argsort(bm25.get_scores(q.split(' ')))[-10:]

In [3]:
# get search queries
query_df = pd.read_csv("/home/human/Documents/code/WANDS/dataset/query.csv", sep = '\t')
product_df = pd.read_csv("/home/human/Documents/code/WANDS/dataset/product.csv", sep = '\t')
label_df = pd.read_csv("/home/human/Documents/code/WANDS/dataset/label.csv", sep = '\t')

In [4]:
# Combine fields and process text
product_df['text'] = product_df['product_name'] + ' ' + product_df['product_description'].fillna('')
product_df['text'] = product_df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
product_df['text'] = product_df['text'].apply(lambda x: re.sub('\d+', '', x).lower().strip().replace('  ', ' '))

In [5]:
product_df.head()

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count,text
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0,solid wood platform bed good deep sleep can be...
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0,allclad qt slow cooker create delicious slowco...
2,2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0,allclad electrics qt slow cooker prepare homec...
3,3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0,allclad all professional tools pizza cutter th...
4,4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0,baldwin prestige alcott passage knob with roun...


In [6]:
label_df.head()

Unnamed: 0,id,query_id,product_id,label
0,0,0,25434,Exact
1,1,0,12088,Irrelevant
2,2,0,42931,Exact
3,3,0,2636,Exact
4,4,0,42923,Exact


In [7]:
query_df.head()

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables
2,2,dinosaur,Kids Wall Décor
3,3,turquoise pillows,Accent Pillows
4,4,chair and a half recliner,Recliners


In [8]:
# Calculate TF-IDF
vec = TfidfVectorizer()
tfidf = vec.fit(product_df['text'])
matrix = tfidf.transform(product_df['text'])

In [9]:
query_df['matches'] = query_df['query_id'].apply(get_exact_matches_for_)

In [10]:
query_df['suggestions'] = query_df['query'].apply(get_tfidf_products)

In [11]:
query_df['score'] = query_df.apply(lambda x: map_at_k(x['matches'], x['suggestions'], k = 10), axis = 1)

In [12]:
np.mean(query_df['score'])

0.27224461254409166

In [13]:
query_df.loc[:, 'score'].mean()

0.27224461254409166

In [14]:
# BM25
bm25 = BM25Okapi(product_df['text'].apply(lambda x: x.split(' ')))

In [15]:
query_df['bm_suggestions'] = query_df['query'].apply(execute_bm_search)

In [16]:
query_df['bm_score'] = query_df.apply(lambda x: map_at_k(x['matches'], x['bm_suggestions'], k = 10), axis = 1)

In [17]:
np.mean(query_df['bm_score'])

0.26119303902116403

In [27]:
# Prepare an embedding model
queries = []
passages = []

for idx, row in query_df.iterrows():

    input = f"{row['query']}"
    queries.append(input)

for idx, row in product_df.iterrows():

    input = f"{row['text']}"
    passages.append(input)