In [None]:
#! pip install rank-bm25 torch transformers sentence-transformers

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [None]:
# Function to mean-pool token states from the embedding model
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """
    Average the token vectors of each sequence over its unmasked positions.

    Parameters:
    last_hidden_states (Tensor): Token-level model outputs, pooled over dim 1
                                 (assumed shape (batch, seq_len, hidden) - TODO confirm).
    attention_mask (Tensor): Non-zero where a position should be kept
                             (assumed shape (batch, seq_len) - TODO confirm).

    Returns:
    Tensor: Sum of the kept token vectors divided by the per-row mask sum.
    """
    # Positions whose mask is zero are blanked so they contribute nothing to the sum
    is_masked_out = ~attention_mask[..., None].bool()
    kept_states = last_hidden_states.masked_fill(is_masked_out, 0.0)

    # The trailing axis lets the per-row count broadcast over the last dimension
    kept_count = attention_mask.sum(dim=1)[..., None]

    return kept_states.sum(dim=1) / kept_count

# Function to look up the ground-truth products of a query
def get_exact_matches_for_(query):
    """
    Return the product ids labelled 'Exact' for one query.

    Reads the notebook-level `label_df` (columns used: 'query_id', 'label', 'product_id').

    Parameters:
    query: A query id, compared against label_df['query_id'].

    Returns:
    numpy.ndarray: The 'product_id' values of the matching rows, in label_df row order.
    """
    # One combined mask: rows that belong to this query AND carry the 'Exact' label
    is_exact_for_query = (label_df['query_id'] == query) & (label_df['label'] == 'Exact')

    return label_df.loc[is_exact_for_query, 'product_id'].values

# Function to get tf-idf results
def get_tfidf_products(x):
    """
    Rank the products for a query string by TF-IDF cosine similarity.

    Reads the notebook-level fitted vectorizer `vec` and the product matrix `matrix`.

    Parameters:
    x (str): Query text.

    Returns:
    numpy.ndarray: Row positions in `matrix` of the (up to) 10 most similar products,
                   most similar first.
    NOTE(review): these are row positions; they are later scored against product_id
    values, which presumably coincide with row positions - confirm.
    """
    query_vector = vec.transform([x])
    similarities = cosine_similarity(query_vector, matrix).flatten()

    # argsort is ascending: take the last ten, then flip so the best match comes first
    ascending_positions = similarities.argsort()
    top_ten = ascending_positions[-10:]

    return top_ten[::-1]

# Define functions for evaluating retrieval performance
def map_at_k(true_ids, predicted_ids, k = 10):
    """
    Calculate the average precision at K for a single ranked list.

    The notebook averages this value over all queries to obtain MAP@K.

    Parameters:
    true_ids (list): Relevant product IDs.
    predicted_ids (list): Predicted product IDs, best first.
    k (int): Number of top elements to consider.
             NOTE: IF you wish to change top k, please provide a justification for choosing the new value

    Returns:
    float: Sum of precision values at each new relevant hit within the top k,
           divided by min(len(true_ids), k); 0.0 when either list is empty.
    """
    # Nothing to score when there are no relevant items or no predictions
    if len(true_ids) == 0 or len(predicted_ids) == 0:
        return 0.0

    hit_count = 0.0
    precision_total = 0.0

    for rank, candidate in enumerate(predicted_ids[:k], start = 1):

        # Irrelevant predictions add nothing
        if candidate not in true_ids:
            continue

        # A relevant id that already appeared at a better rank is counted only once
        if candidate in predicted_ids[:rank - 1]:
            continue

        hit_count += 1.0
        precision_total += hit_count / rank

    return precision_total / min(len(true_ids), k)

# Function to execute bm search
def execute_bm_search(q):
    """
    Rank the products for a query string with BM25.

    Reads the notebook-level `bm25` index. The query is tokenised by splitting on
    single spaces, the same way the corpus is tokenised when the index is built.

    Parameters:
    q (str): Query text.

    Returns:
    numpy.ndarray: Corpus positions of the (up to) 10 highest-scoring products,
                   best match first.
    """
    scores = bm25.get_scores(q.split(' '))

    # argsort is ascending, so the last ten entries are the best ten but in
    # worst-to-best order; reverse them so rank 1 is the top hit. The evaluation
    # metric is order-sensitive and the TF-IDF helper returns best-first too.
    return np.argsort(scores)[-10:][::-1]

# Function to perform reranking
def execute_reranking(data, query):
    """
    Retrieve candidates with the bi-encoder, then reorder them with the cross-encoder.

    Relies on the notebook-level `biencoder`, `crossencoderembeddingmodel`,
    `corpus_embeddings` and `topk`.

    Parameters:
    data: Product texts, indexable by the corpus ids that the search returns.
    query (str): Search query text.

    Returns:
    list: Corpus ids of up to 10 candidates, highest cross-encoder score first.
    """
    # Stage 1: embed the query and fetch the `topk` nearest products
    query_vector = biencoder.encode(query, convert_to_tensor = True)
    candidates = util.semantic_search(query_vector, corpus_embeddings, top_k = topk)[0]

    # Stage 2: score every (query, product text) pair with the cross-encoder
    pairs = [[query, data[candidate['corpus_id']]] for candidate in candidates]
    pair_scores = crossencoderembeddingmodel.predict(pairs)

    # Order candidate positions by cross-encoder score (stable, best first), keep ten
    best_positions = sorted(
        range(len(pair_scores)),
        key = lambda position: pair_scores[position],
        reverse = True,
    )[:10]

    return [candidates[position]['corpus_id'] for position in best_positions]

In [None]:
# Fetch the WANDS dataset repository into ./WANDS (the CSV files read below live in WANDS/dataset)
!git clone https://github.com/wayfair/WANDS.git

Cloning into 'WANDS'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 40 (delta 7), reused 23 (delta 3), pack-reused 0[K
Receiving objects: 100% (40/40), 33.32 MiB | 19.18 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [None]:
# Load the three tab-separated WANDS tables: search queries, products, and query-product labels
query_df = pd.read_csv("WANDS/dataset/query.csv", sep = '\t')
product_df = pd.read_csv("WANDS/dataset/product.csv", sep = '\t')
label_df = pd.read_csv("WANDS/dataset/label.csv", sep = '\t')

In [None]:
# Combine fields and process text
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def _normalize_text(text):
    """Drop punctuation and digits, lower-case, and leave exactly one space between tokens."""
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text).lower()
    # split()/join collapses whitespace runs of any length and trims both ends.
    # A single replace('  ', ' ') pass leaves double spaces behind, which turn
    # into empty-string tokens when the text is later tokenised with split(' ').
    return ' '.join(text.split())

# fillna on both fields so one missing value cannot turn the whole text into NaN
product_df['text'] = (
    product_df['product_name'].fillna('') + ' ' + product_df['product_description'].fillna('')
).apply(_normalize_text)

In [None]:
# Calculate TF-IDF
vec = TfidfVectorizer()

# fit_transform learns the vocabulary/IDF weights and vectorises the products in
# one pass, instead of analysing the whole corpus twice (fit, then transform)
matrix = vec.fit_transform(product_df['text'])

# `tfidf` stays available as an alias of the fitted vectorizer
tfidf = vec

In [None]:
# Ground truth: for every query, the product ids labelled 'Exact'
query_df['matches'] = query_df['query_id'].map(get_exact_matches_for_)

In [None]:
# TF-IDF retrieval: top-10 product positions for every query text
query_df['suggestions'] = query_df['query'].map(get_tfidf_products)

In [None]:
# Score each TF-IDF ranking against the query's exact matches
query_df['score'] = [
    map_at_k(relevant, ranked, k = 10)
    for relevant, ranked in zip(query_df['matches'], query_df['suggestions'])
]

In [None]:
# MAP@10 of the TF-IDF baseline (mean of the per-query scores)
query_df['score'].mean()

0.2726724674823633

In [None]:
# BM25 index over the product texts, tokenised by splitting on single spaces
bm25 = BM25Okapi([document.split(' ') for document in product_df['text']])

In [None]:
# BM25 retrieval: ten product positions for every query text
query_df['bm_suggestions'] = query_df['query'].map(execute_bm_search)

In [None]:
# Score each BM25 ranking against the query's exact matches
query_df['bm_score'] = [
    map_at_k(relevant, ranked, k = 10)
    for relevant, ranked in zip(query_df['matches'], query_df['bm_suggestions'])
]

In [None]:
# MAP@10 of the BM25 baseline (mean of the per-query scores)
query_df['bm_score'].mean()

0.2607000041335979

In [None]:
# Prepare the inputs for an embedding model
# Each text gets a "query: " / "passage: " prefix; these appear intended for the
# E5 checkpoints referenced in the disabled cells further down.
# Comprehensions replace the iterrows loops, whose loop variable shadowed the builtin `input`.
queries = [f"query: {query_text}" for query_text in query_df['query']]
passages = [f"passage: {product_text}" for product_text in product_df['text']]

# Queries first, then passages
input_texts = queries + passages

In [None]:
# Preview the first ten prefixed queries
queries[:10]

['query: salon chair',
 'query: smart coffee table',
 'query: dinosaur',
 'query: turquoise pillows',
 'query: chair and a half recliner',
 'query: sofa with ottoman',
 'query: acrylic clear chair',
 'query: driftwood mirror',
 'query: home sweet home sign',
 'query: coffee table fire pit']

In [None]:
# Number of distinct query strings
len(set(queries))

480

In [None]:
# Preview the last ten prefixed product passages
passages[-10:]

['passage: random sized mosaic tile',
 'passage: jalapa medallion floral room darkening thermal grommet curtain panels this jalapa curtain adds an accent style to your house the pattern is medallions a very classic design that will match all home styles also we used sand gray and beige colors to imply bohemian style as well as national style a very unique design can not find in other places our fabric is environmentally friendly without dye substance  none chemical smell high quality for your house decoration but using much cheaper price',
 'passage: hein floral thermal rod pocket single curtain panel this wyn linen blend curtain is designed to bring an elegant and soft feeling to your room the curtain with flower and branch pattern will bring a natural and enthusiastic feeling into your home the vintage style makes your home full of romantic atmosphere these classic designs will decorate your home with great effect keep privacy and perfectly match contemporary traditional vintage rust

In [None]:
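## Crashes - NOTE(review): this disabled cell tokenizes and runs all of input_texts through e5-large-v2 in one un-batched call, which presumably exhausts memory; confirm before re-enabling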

#%%time

#tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')
#model = AutoModel.from_pretrained('intfloat/e5-large-v2')

# Perform tokenization
#batch_dict = tokenizer(input_texts, max_length = 512, padding = True, truncation = True, return_tensors = 'pt')
#outputs = model(**batch_dict)
#embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# Normalize
#embeddings = F.normalize(embeddings, p = 2, dim = 1)

In [None]:
# Employ an embedding model
#model = SentenceTransformer('intfloat/e5-large-v2')
#%time query_embeddings = model.encode(queries, normalize_embeddings = True)
#%time passage_embeddings = model.encode(passages, normalize_embeddings = True)

# Compute cosine-similarities
#cosine_scores = util.cos_sim(query_embeddings, passage_embeddings)

In [None]:
# Init model: load the all-MiniLM-L6-v2 sentence encoder used for the embedding-only retrieval run
embedder = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Encode every product passage with the MiniLM embedder (timed)
# NOTE(review): `passages` still carry the "passage: " prefix - confirm that is intended for this model
%time corpus_embeddings = embedder.encode(passages, convert_to_tensor = True)

CPU times: user 36.2 s, sys: 1.29 s, total: 37.5 s
Wall time: 18.2 s


In [None]:
# Encode every query with the MiniLM embedder (timed)
# NOTE(review): `queries` still carry the "query: " prefix - confirm that is intended for this model
%time query_embeddings = embedder.encode(queries, convert_to_tensor = True)

CPU times: user 147 ms, sys: 7.93 ms, total: 155 ms
Wall time: 112 ms


In [None]:
# Place the product embeddings on the encoder's device and L2-normalise them
# (embedder.device replaces a hard-coded "cuda", which raises on CPU-only runtimes)
corpus_embeddings = corpus_embeddings.to(embedder.device)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

In [None]:
# Place the query embeddings on the encoder's device and L2-normalise them
# (embedder.device replaces a hard-coded "cuda", which raises on CPU-only runtimes)
query_embeddings = query_embeddings.to(embedder.device)
query_embeddings = util.normalize_embeddings(query_embeddings)

In [None]:
# Top-10 products per query; both embedding sets were passed through
# util.normalize_embeddings beforehand, and the scoring function is the dot product
hits = util.semantic_search(query_embeddings, corpus_embeddings, score_function = util.dot_score, top_k = 10)

In [None]:
# Inspect the hits (corpus id + score) returned for the first query
hits[0]

[{'corpus_id': 7465, 'score': 0.7541677355766296},
 {'corpus_id': 33689, 'score': 0.7429394125938416},
 {'corpus_id': 40996, 'score': 0.7371830344200134},
 {'corpus_id': 25431, 'score': 0.7352949976921082},
 {'corpus_id': 33690, 'score': 0.7233842015266418},
 {'corpus_id': 33691, 'score': 0.7232968807220459},
 {'corpus_id': 20026, 'score': 0.715502917766571},
 {'corpus_id': 25432, 'score': 0.683279812335968},
 {'corpus_id': 40997, 'score': 0.6817553043365479},
 {'corpus_id': 6168, 'score': 0.6804269552230835}]

In [None]:
# One hit list per query
len(hits)

480

In [None]:
# For every query, keep only the corpus ids of its hits (ranking order preserved)
sims = [[hit['corpus_id'] for hit in query_hits] for query_hits in hits]

In [None]:
# Corpus ids retrieved for the first query
sims[0]

[7465, 33689, 40996, 25431, 33690, 33691, 20026, 25432, 40997, 6168]

In [None]:
# Store the retrieved corpus ids per query; the list is assigned positionally,
# so `sims` must be in the same order as the rows of query_df
query_df['semantic_suggestions'] = sims

In [None]:
# Score each embedding-based ranking against the query's exact matches
query_df['semantic_score'] = [
    map_at_k(relevant, ranked, k = 10)
    for relevant, ranked in zip(query_df['matches'], query_df['semantic_suggestions'])
]

In [None]:
# MAP@10 of the embedding-only retrieval (mean of the per-query scores)
query_df['semantic_score'].mean()

0.3233098522009112

In [None]:
# Number of bi-encoder candidates handed to the cross-encoder
topk = 100

# Retrieval stage: bi-encoder that embeds queries and products separately
biencoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set sequence length
biencoder.max_seq_length = 512

# Re-ranking stage: cross-encoder that scores (query, product text) pairs
crossencoderembeddingmodel = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Perform initial encoding
# .tolist() hands encode() a plain list, so its internal positional indexing
# does not depend on the Series index labels
corpus_embeddings = biencoder.encode(product_df['text'].tolist(), convert_to_tensor = True, show_progress_bar = True)

# No explicit .cuda(): with convert_to_tensor=True the result is already on the
# bi-encoder's device (same as the corpus embeddings), and a hard-coded .cuda()
# raises on CPU-only runtimes
query_embeddings = biencoder.encode(query_df['query'].iloc[0], convert_to_tensor = True)

Batches:   0%|          | 0/1344 [00:00<?, ?it/s]

In [None]:
# Retrieve the `topk` closest products; only one query was encoded, so keep its hit list
hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k = topk)[0]

In [None]:
# Re-Rank the products: pair the first query with each candidate's text
cross_inp = [
    [query_df['query'].iloc[0], product_df['text'][hit['corpus_id']]]
    for hit in hits
]
cross_scores = crossencoderembeddingmodel.predict(cross_inp)

# Overwrite each hit's retrieval score with its cross-encoder score
for hit, cross_score in zip(hits, cross_scores):

    hit['score'] = cross_score

In [None]:
# Order the candidates by cross-encoder score, best first (in-place, stable)
hits.sort(key = lambda candidate: candidate['score'], reverse = True)

print(f"Query Selection- {query_df['query'].iloc[0]}")
print("Top-3 Re-Ranked Hits")

# Show the score and product name of the three best candidates
for hit in hits[:3]:

    print("\t{:.3f}\t{}".format(hit['score'], product_df['product_name'][hit['corpus_id']].replace("\n", " ")))

Query Selection- salon chair
Top-3 Re-Ranked Hits
	8.191	barberpub salon massage chair
	8.156	hair salon chair
	7.957	reclining faux leather massage chair


In [None]:
# Retrieve-and-rerank: ten corpus ids per query, ordered by cross-encoder score
query_df['reranked'] = [
    execute_reranking(product_df['text'], query_text)
    for query_text in query_df['query']
]

In [None]:
# Re-ranked corpus ids for the first query
query_df['reranked'].iloc[0]

[25431, 7465, 7467, 24010, 7466, 25433, 24009, 42329, 24008, 33690]

In [None]:
# Score each re-ranked list against the query's exact matches
query_df['reranking_score'] = [
    map_at_k(relevant, ranked, k = 10)
    for relevant, ranked in zip(query_df['matches'], query_df['reranked'])
]

In [None]:
# MAP@10 of the retrieve-and-rerank pipeline (mean of the per-query scores)
query_df['reranking_score'].mean()

0.4374410181510876