# Part 1: Setting up a basic semantic search system

A big thank you to [David on GitHub](https://github.com/gypsydave5) for finding a bug in my analysis code and bringing it to my attention. The bug was resolved!

Some numbers might be different from what is reported in the book/video course but the overall gist is the same: re-ranking helps our semantic search and fine-tuning the re-ranking cross encoder yielded even better results.

In [None]:
%pip install pinecone openai sentence-transformers tiktoken datasets

In [None]:
from openai import OpenAI
from datetime import datetime, timezone
import hashlib
import re
import os
from sentence_transformers import CrossEncoder


from tqdm import tqdm
import numpy as np
from torch import nn

import logging
from pinecone import Pinecone, ServerlessSpec

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)


In [None]:
pinecone_key = os.environ.get('PINECONE_API_KEY')
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

INDEX_NAME = 'semantic-search-test'
NAMESPACE = 'default'
ENGINE = 'text-embedding-3-large'  # has vector size 3072

pc = Pinecone(
    api_key=pinecone_key
)

In [None]:
# helper functions to get lists of embeddings from the OpenAI API
def get_embeddings(texts, engine=ENGINE):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: The strings to embed; sent to the API in a single request.
        engine: Name of the embedding model to use.

    Returns:
        A list holding the ``embedding`` of every item in the response's
        ``data`` field, in the order the response lists them.
    """
    response = client.embeddings.create(input=texts, model=engine)

    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors

def get_embedding(text, engine=ENGINE):
    """Embed one string by delegating to ``get_embeddings``.

    Args:
        text: The string to embed.
        engine: Name of the embedding model to use.

    Returns:
        The first (and only requested) embedding returned for ``text``.
    """
    vectors = get_embeddings([text], engine)
    return vectors[0]

len(get_embedding('hi')), len(get_embeddings(['hi', 'hello']))

In [None]:
# Create the Pinecone index only if one with this name does not exist yet,
# so re-running the cell does not try to create it twice.
if INDEX_NAME not in pc.list_indexes().names():
    print(f'Creating index {INDEX_NAME}')
    pc.create_index(
        name=INDEX_NAME,  # The name of the index
        dimension=3072,  # Must equal the vector size of ENGINE (3072 for text-embedding-3-large)
        metric='cosine',  # The similarity metric to use when searching the index
        # Serverless index hosted on AWS in us-east-1
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Handle to the index; the upload, query and delete helpers below all use this global
index = pc.Index(name=INDEX_NAME)
index

In [None]:
def my_hash(s):
    """Return the hexadecimal MD5 digest of the string ``s``.

    The string is encoded with ``str.encode()``'s default encoding before
    hashing. The digest is used in this notebook as a deterministic
    document ID.
    """
    digest = hashlib.md5()
    digest.update(s.encode())
    return digest.hexdigest()

my_hash('I love to hash it')

In [None]:
def prepare_for_pinecone(texts, engine=ENGINE):
    """Turn raw strings into ``(id, embedding, metadata)`` tuples for upserting.

    Args:
        texts: The strings to embed.
        engine: Name of the embedding model passed on to ``get_embeddings``.

    Returns:
        One tuple per input string, in input order:
        ``(my_hash(text), embedding, {'text': text, 'date_uploaded': <iso timestamp>})``.
    """
    # A single timezone-aware UTC timestamp is shared by every record in this call
    uploaded_at = datetime.now(timezone.utc).isoformat()

    # One embedding per input string, produced by the requested engine
    vectors = get_embeddings(texts, engine=engine)

    records = []
    for text, vector in zip(texts, vectors):
        # The ID is the hash of the text, so identical strings map to the same ID
        metadata = dict(text=text, date_uploaded=uploaded_at)
        records.append((my_hash(text), vector, metadata))
    return records


In [None]:
texts = ['hi']

In [None]:
_id, embedding, metadata = prepare_for_pinecone(texts)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

In [None]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False):
    """Embed ``texts`` and upsert them into the Pinecone index in batches.

    Args:
        texts: Sequence of strings to embed and upload.
        namespace: Pinecone namespace to upsert into.
        batch_size: Number of texts per embed/upsert round trip. A falsy
            value (the default) sends all texts in a single batch.
        show_progress_bar: When true, wrap the batch loop in a tqdm bar.

    Returns:
        The sum of the ``upserted_count`` values reported for each batch
        (0 when ``texts`` is empty).
    """
    # Empty input: nothing to embed or upsert. Without this guard batch_size
    # would become 0 and range() would raise "arg 3 must not be zero".
    if len(texts) == 0:
        return 0

    if not batch_size:
        batch_size = len(texts)

    total_upserted = 0
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts) if show_progress_bar else batch_starts:
        batch = texts[start: start + batch_size]

        # Embed the batch and build the (id, embedding, metadata) tuples
        prepared_texts = prepare_for_pinecone(batch)

        # Upsert the prepared batch and accumulate the count Pinecone reports
        total_upserted += index.upsert(
            vectors=prepared_texts,
            namespace=namespace
        )['upserted_count']

    return total_upserted

# Call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)


In [None]:
def query_from_pinecone(query, top_k=3, include_metadata=True, namespace=NAMESPACE):
    """Embed ``query`` and return its nearest neighbours from the Pinecone index.

    Args:
        query: The query string.
        top_k: Number of matches to request.
        include_metadata: Whether matches should carry their stored metadata
            (dates, text, etc).
        namespace: Pinecone namespace to search. Defaults to the same
            namespace the upload/delete helpers use.

    Returns:
        The ``'matches'`` entry of the query response.
    """
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query, engine=ENGINE)

    return index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=include_metadata,
    ).get('matches')

In [None]:
def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    """Delete from Pinecone the vectors whose IDs are the hashes of ``texts``.

    Args:
        texts: The original strings whose vectors should be removed.
        namespace: Pinecone namespace to delete from.

    Returns:
        Whatever ``index.delete`` returns.
    """
    # IDs must be derived exactly as prepare_for_pinecone derives them, so
    # reuse my_hash instead of repeating the hashing logic here.
    hashes = [my_hash(text) for text in texts]

    # The ids parameter is used to specify the list of IDs (hashes) to delete
    return index.delete(ids=hashes, namespace=namespace)

In [None]:
from datasets import load_dataset

# Load the "MLQA.en.en" configuration of the "xtreme" dataset
dataset = load_dataset("xtreme", "MLQA.en.en")

# Re-label the splits for this chapter: the original 'test' split becomes our
# 'train' split and the original 'validation' split becomes our 'test' split.
# 'train' is assigned first so the original 'test' data is not overwritten.
dataset['train'] = dataset['test']
dataset['test'] = dataset['validation']
del dataset['validation']

# Display the resulting splits
dataset

In [None]:
dataset['train'][0], dataset['train'][1]

In [None]:
# De-duplicate the test contexts so each passage is embedded and stored once
unique_passages = list(set(dataset['test']['context']))

# upload_texts_to_pinecone already batches and can show a progress bar, so
# let it do both instead of slicing into batches of 32 by hand here.
total_uploaded = upload_texts_to_pinecone(unique_passages, batch_size=32, show_progress_bar=True)


In [None]:
len(unique_passages)

In [None]:
index.describe_index_stats()

In [None]:
dataset['test'][0]

In [None]:
query_from_pinecone('Does an infection for Sandflies go away over time?')

# Part 2: Making results more relevant with a cross-encoder

In [None]:
# if you didn't import before

from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
from torch import nn

In [None]:
from copy import copy

def get_results_from_pinecone(query, top_k=3, re_rank_model=None, verbose=True, correct_hash=None):
    """Retrieve passages for ``query`` and optionally re-rank them with a cross-encoder.

    Args:
        query: The query string.
        top_k: Number of passages to retrieve from Pinecone.
        re_rank_model: Optional model exposing
            ``predict(pairs, activation_fct=...)``; when given, the retrieved
            passages are re-ordered by its scores (highest first).
        verbose: Print the query and a table of results.
        correct_hash: Optional ID of the known-correct passage, used to
            report its position before and after re-ranking.

    Returns:
        ``[]`` when Pinecone returns no matches. Otherwise a dict with
        ``final_results``, ``retrieved_correct_position`` and
        ``reranked_correct_position`` (positions are 0-based, ``None`` when
        the correct passage was not retrieved). When re-ranking, the dict
        also contains ``results_from_pinecone`` and ``top_re_rank_score``,
        and each entry in ``final_results`` is
        ``{'score': <cross-encoder score>, 'id': ..., 'metadata': ...}``.
    """
    results_from_pinecone = query_from_pinecone(query, top_k=top_k)

    # NOTE: this early return is a list while every other path returns a dict;
    # kept as-is so existing callers that check for an empty result still work.
    if not results_from_pinecone:
        return []

    if verbose:
        print("Query:", query)

    final_results = []

    # Where the correct passage sits in the raw retrieval order (if at all)
    retrieved_correct_position, reranked_correct_position = None, None
    if correct_hash:
        for position, result in enumerate(results_from_pinecone):
            if result['id'] == correct_hash:
                retrieved_correct_position = position

    if re_rank_model is not None:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

        sentence_combinations = [[query, result['metadata']['text']] for result in results_from_pinecone]

        # Cross-encoder score for each (query, passage) pair, in retrieval order
        similarity_scores = re_rank_model.predict(sentence_combinations, activation_fct=nn.Sigmoid())

        # Retrieval-order indices sorted by decreasing cross-encoder score
        sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
        top_re_rank_score = similarity_scores[sim_scores_argsort[0]]

        for new_rank, original_rank in enumerate(sim_scores_argsort):
            result = results_from_pinecone[original_rank]
            # similarity_scores is in retrieval order, so it must be indexed by
            # the passage's original position, not by its new rank.
            ce_score = similarity_scores[original_rank]
            if correct_hash and retrieved_correct_position == original_rank:
                reranked_correct_position = new_rank
            final_results.append({'score': ce_score, 'id': result['id'], 'metadata': result['metadata']})
            if verbose:
                print(f"{result['id']}\t{result['score']:.2f}\t{ce_score:.6f}\t{result['metadata']['text'][:50]}")
        return {'final_results': final_results, 'retrieved_correct_position': retrieved_correct_position, 'reranked_correct_position': reranked_correct_position, 'results_from_pinecone': results_from_pinecone, 'top_re_rank_score': top_re_rank_score}

    # No re-ranker: return the matches in retrieval order
    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result in results_from_pinecone:
        final_results.append(result)
        if verbose:
            print(f"{result['id']}\t{result['score']:.2f}\t{result['metadata']['text'][:50]}")

    return {'final_results': final_results, 'retrieved_correct_position': retrieved_correct_position, 'reranked_correct_position': reranked_correct_position}

In [None]:
# Load the pre-trained cross-encoder with a single output label
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', num_labels=1)

# Map every test question to the ID (hash) of its context passage.
# If a question occurs more than once, the last occurrence wins.
q_to_hash = dict(zip(dataset['test']['question'], map(my_hash, dataset['test']['context'])))

In [None]:
unique_inputs = list(set(dataset['test']['question']))
len(unique_inputs)

In [None]:
query = unique_inputs[0]
print(query)

for t in dataset['test']:
    if t['question'] == query:
        print(t['context'])

In [None]:
query_result = get_results_from_pinecone(
    query,
    top_k=2, # grab 2 results
    re_rank_model=cross_encoder,
    correct_hash=q_to_hash[query],
    verbose=False
    )

query_result['retrieved_correct_position'], query_result['reranked_correct_position']

In [None]:
query_result  # the right context isn't there!

In [None]:
# Same query as before, but with a much larger candidate pool for the re-ranker
query_result = get_results_from_pinecone(
    query,
    top_k=100, # grab 100 results
    re_rank_model=cross_encoder, correct_hash=q_to_hash[query],
    verbose=False
    )

# Position of the correct passage before and after re-ranking (None = not retrieved)
query_result['retrieved_correct_position'], query_result['reranked_correct_position']

In [None]:
test_sample = dataset['test']

In [None]:
TOP_K=50

In [None]:
# Silence library logging while the evaluation loop runs
logger.setLevel(logging.CRITICAL)

predictions = []

# Retrieve TOP_K passages per test question and re-rank them with the
# pre-trained cross-encoder, recording where the correct passage lands.
for question in tqdm(test_sample['question']):
    r = get_results_from_pinecone(
        question, top_k=TOP_K, re_rank_model=cross_encoder, correct_hash=q_to_hash[question],
        verbose=False
        )
    predictions.append(r)

    # Report running top-1 accuracy every 100 questions
    if len(predictions) % 100 == 0:
        retrieved_accuracy = sum(p['retrieved_correct_position'] == 0 for p in predictions) / len(predictions)
        re_ranked_accuracy = sum(p['reranked_correct_position'] == 0 for p in predictions) / len(predictions)

        print(f'Accuracy without re-ranking: {retrieved_accuracy}')
        print(f'Accuracy with re-ranking: {re_ranked_accuracy}')


In [None]:
retrieved_accuracy = sum([_['retrieved_correct_position'] == 0 for _ in predictions])/len(predictions)
re_ranked_accuracy = sum([_['reranked_correct_position'] == 0 for _ in predictions])/len(predictions)

print(f'Accuracy without re-ranking: {retrieved_accuracy}')
print(f'Accuracy with re-ranking: {re_ranked_accuracy}')

In [None]:
import pandas as pd

predictions_df = pd.DataFrame(predictions)
predictions_df.head()

In [None]:
predictions_df[['retrieved_correct_position', 'reranked_correct_position']].mean()  # lower is better

In [None]:
# Recall@k for every cutoff in X: the fraction of queries whose correct passage
# appears at a 0-based position < k. Queries where the passage was never
# retrieved have no position and therefore never count as a hit.
X = [1, 3, 5, 10, 25, 50]
OPENAI_RETRIEVAL, OLD_CROSS_ENCODER = [], []
num_queries = len(predictions_df)

for k in X:
    embedding_only_recall = int((predictions_df['retrieved_correct_position'] < k).sum())
    reranked_recall = int((predictions_df['reranked_correct_position'] < k).sum())
    print(k, embedding_only_recall, reranked_recall)
    OPENAI_RETRIEVAL.append(embedding_only_recall / num_queries)
    OLD_CROSS_ENCODER.append(reranked_recall / num_queries)

## OPEN SOURCE ALTERNATIVE TO EMBEDDING

In [None]:
from sentence_transformers import SentenceTransformer
# load up our open source embedding model
bi_encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# bi_encoder = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L-6-v3")

In [None]:
#Encode query and documents
docs = dataset['test']['context']
doc_emb = bi_encoder.encode(docs, batch_size=32, show_progress_bar=True)

In [None]:
doc_emb.shape

In [None]:
from sentence_transformers.util import semantic_search

# Function to find most similar document
def find_most_similar(embedder, text, embeddings, documents, k=3):
    """Return the ``k`` documents most similar to ``text``.

    Args:
        embedder: Model whose ``encode`` method embeds the query text.
        text: The query string.
        embeddings: Pre-computed corpus embeddings to search.
        documents: Corpus items, indexable by the ``corpus_id`` values that
            ``semantic_search`` returns.
        k: Number of hits to request.

    Returns:
        A list of ``(document, score, corpus_id)`` tuples in the order
        ``semantic_search`` returns them.
    """
    query_embedding = embedder.encode([text], show_progress_bar=False)

    # semantic_search returns one hit list per query; only one query was sent
    hits = semantic_search(query_embedding, embeddings, top_k=k)[0]

    ranked = []
    for hit in hits:
        corpus_id = hit['corpus_id']
        ranked.append((documents[corpus_id], hit['score'], corpus_id))
    return ranked

In [None]:
from random import sample

query = sample(dataset['test']['question'], 1)[0]
print(query)

In [None]:
def eval_ranking_open_source(embedder, doc_emb, query, top_k=3, re_rank_model=None):
    """Locate the correct passage for ``query`` before and after re-ranking.

    Args:
        embedder: Bi-encoder used to embed the query.
        doc_emb: Pre-computed embeddings of the global ``docs`` corpus.
        query: A question that is a key of the global ``q_to_hash`` mapping.
        top_k: Number of passages to retrieve.
        re_rank_model: Optional model exposing
            ``predict(pairs, activation_fct=...)`` used to re-order the
            retrieved passages (highest score first).

    Returns:
        A dict with ``retrieved_correct_position`` and, when a re-ranker is
        given, ``reranked_correct_position``. Positions are 0-based and
        refer to the best-ranked occurrence of the correct passage; ``None``
        means it was not among the retrieved passages.
    """
    ans = {'retrieved_correct_position': None}
    correct_hash = q_to_hash[query]
    results = find_most_similar(embedder, query, doc_emb, docs, k=top_k)

    # docs is built from the raw context column without de-duplication, so the
    # correct passage may appear more than once. Stop at the first hit so the
    # reported position is its best rank rather than its last occurrence.
    for idx, (passage, _score, _doc_idx) in enumerate(results):
        if my_hash(passage) == correct_hash:
            ans['retrieved_correct_position'] = idx
            break

    if re_rank_model is not None:
        ans['reranked_correct_position'] = None
        if results:
            sentence_combinations = [(query, passage) for passage, _score, _doc_idx in results]

            # Cross-encoder score for each (query, passage) pair, in retrieval order
            similarity_scores = re_rank_model.predict(sentence_combinations, activation_fct=nn.Sigmoid())

            # Retrieval-order indices sorted by decreasing cross-encoder score
            sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
            for new_rank, original_idx in enumerate(sim_scores_argsort):
                if my_hash(results[original_idx][0]) == correct_hash:
                    ans['reranked_correct_position'] = new_rank
                    break  # best rank of the correct passage after re-ranking

    return ans

In [None]:
eval_ranking_open_source(bi_encoder, doc_emb, query, top_k=TOP_K, re_rank_model=cross_encoder)

In [None]:
logger.setLevel(logging.CRITICAL)
os_predictions = []

for i, question in tqdm(enumerate(test_sample), total=len(test_sample)):
    os_predictions.append(eval_ranking_open_source(bi_encoder, doc_emb, question['question'], top_k=TOP_K, re_rank_model=cross_encoder))

In [None]:
os_predictions_df = pd.DataFrame(os_predictions)
os_predictions_df.head()

In [None]:
raw_accuracy = sum([p['retrieved_correct_position'] == 0 for p in os_predictions])/len(os_predictions)
reranked_accuracy = sum([p['reranked_correct_position'] == 0 for p in os_predictions])/len(os_predictions)

print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


In [None]:
# Recall@k for every cutoff in X, for the open-source bi-encoder alone and
# for the bi-encoder followed by the pre-trained cross-encoder.
OPEN_SOURCE_RETRIEVAL, OPEN_SOURCE_RETRIEVAL_PLUS_PRE_CE = [], []
num_os_queries = len(os_predictions_df)

for k in X:
    embedding_only_recall = int((os_predictions_df['retrieved_correct_position'] < k).sum())
    reranked_recall = int((os_predictions_df['reranked_correct_position'] < k).sum())
    print(k, embedding_only_recall, reranked_recall)
    OPEN_SOURCE_RETRIEVAL.append(embedding_only_recall / num_os_queries)
    OPEN_SOURCE_RETRIEVAL_PLUS_PRE_CE.append(reranked_recall / num_os_queries)

In [None]:
import matplotlib.pyplot as plt

# Creating the plot: recall@k for each retrieval setup evaluated so far
plt.figure(figsize=(10, 6))
plt.plot(X, OPENAI_RETRIEVAL, label='OAI Retrieval Only', marker='o')
plt.plot(X, OLD_CROSS_ENCODER, label='OAI + Pretrained CE', marker='s')

plt.plot(X, OPEN_SOURCE_RETRIEVAL, label='OS Retrieval Only', marker='*')
plt.plot(X, OPEN_SOURCE_RETRIEVAL_PLUS_PRE_CE, label='OS + Pretrained CE', marker='^')


# Adding titles and labels
# NOTE(review): the title mentions a fine-tuned CE, but only pre-trained CE
# curves are plotted in this cell.
plt.title('Comparing embedding models + pre-trained vs fine-tuned CE (all retrieved 50 results then re-ranked)')
plt.xlabel('Recall @')
plt.ylabel('Performance')
plt.xticks(X)
plt.yticks([i/100 for i in range(70, 101, 5)])  # y-ticks from 0.70 to 1.00 in steps of 0.05

# Adding legend
plt.legend()

# Turn on the grid
plt.grid(True)

## Advanced: Fine-tuning the re-ranker

In [None]:
dataset['train'][0]

In [None]:
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import shuffle

In [None]:
unique_train_passages = list(set(dataset['train']['context']))
len(unique_train_passages), len(dataset['train']['context'])

In [None]:
len(unique_train_passages), doc_emb.shape

In [None]:
# use sentence_transformers.util.semantic_search
train_doc_embed = bi_encoder.encode(unique_train_passages, batch_size=32, show_progress_bar=True)

In [None]:
unique_train_passages = np.array(unique_train_passages)

# Example usage
print(unique_train_passages[0])

find_most_similar(bi_encoder, unique_train_passages[0], train_doc_embed, unique_train_passages)

In [None]:
# Negative example mining: for every training question, take the passages the
# bi-encoder ranks highest and label each one 1 if it is the question's own
# context, 0 otherwise. The retrieved non-matching passages act as negatives.
# Note that the correct context only becomes a positive example when the
# bi-encoder actually retrieves it.
train_samples = []

for train_example in tqdm(dataset['train']):
    candidates = find_most_similar(bi_encoder, train_example['question'], train_doc_embed, unique_train_passages)
    for passage, _score, _corpus_idx in candidates:
        is_correct = int(passage == train_example['context'])
        train_samples.append(
            InputExample(texts=[train_example['question'], passage], label=is_correct)
        )

# Shuffle so positives and negatives are mixed before the train/eval split
shuffle(train_samples)


In [None]:
import pandas as pd
pd.Series([t.label for t in train_samples]).value_counts().plot(kind='bar')

In [None]:
dataset['train'][0]

In [None]:
train_samples[2].__dict__

In [None]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator, CEBinaryClassificationEvaluator
import math
import torch
from random import sample

logger.setLevel(logging.DEBUG)  # just to get some logs

# Number of passes over the training data
num_epochs = 1

# Directory the fine-tuned cross-encoder is written to (passed as output_path to fit)
model_save_path = './fine_tuned_ir_cross_encoder'

# First 80% of train_samples is used for training, in shuffled batches of 16
train_dataloader = DataLoader(train_samples[:int(len(train_samples)*.8)], shuffle=True, batch_size=16)

# Evaluator built from the remaining 20% of train_samples (held out from the dataloader above)
evaluator = CECorrelationEvaluator.from_input_examples(train_samples[int(len(train_samples)*.8):], name='test')

# Rule of thumb for warmup steps
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of the total training batches for warm-up
print(f"Warmup-steps: {warmup_steps}")

In [None]:
for t in train_samples:
    if t.label == 1:
        print('Example of label 1')
        print(t.__dict__, cross_encoder.predict(t.texts, activation_fct=nn.Sigmoid()))
        break
for t in train_samples:
    if t.label == 0:
        print('Example of label 0')
        print(t.__dict__, cross_encoder.predict(t.texts, activation_fct=nn.Sigmoid()))
        break

In [None]:
evaluator(cross_encoder)

In [None]:
# Train the model
from sentence_transformers import InputExample, losses, evaluation

# you may turn on debug for more logs here e.g. logger.setLevel(logging.DEBUG)
# NOTE(review): fit() presumably updates `cross_encoder` in place, so after this
# cell the variable no longer holds the purely pre-trained model - confirm
# before re-running earlier "pre-trained" evaluation cells.
cross_encoder.fit(
    train_dataloader=train_dataloader,
    loss_fct=nn.BCEWithLogitsLoss(),  # this is the default loss if num_labels is 1 otherwise CrossEntropyLoss
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,  # the later cell reloads the model from this path
    use_amp=True
)

In [None]:
evaluator(cross_encoder)

In [None]:
finetuned = CrossEncoder(model_save_path)

print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Sigmoid()))
print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Identity()))

In [None]:
# Silence library logging while the evaluation loop runs
logger.setLevel(logging.CRITICAL)

ft_predictions = []

# Same evaluation as for the pre-trained model, but re-ranking with the
# fine-tuned cross-encoder loaded from disk.
for question in tqdm(test_sample['question']):
    r = get_results_from_pinecone(
        question, top_k=TOP_K, re_rank_model=finetuned, correct_hash=q_to_hash[question],
        verbose=False
        )
    ft_predictions.append(r)

    # Report running top-1 accuracy every 100 questions
    if len(ft_predictions) % 100 == 0:
        retrieved_accuracy = sum(p['retrieved_correct_position'] == 0 for p in ft_predictions) / len(ft_predictions)
        re_ranked_accuracy = sum(p['reranked_correct_position'] == 0 for p in ft_predictions) / len(ft_predictions)

        print(f'Accuracy without re-ranking: {retrieved_accuracy}')
        print(f'Accuracy with re-ranking: {re_ranked_accuracy}')


In [None]:
retrieved_accuracy = sum([_['retrieved_correct_position'] == 0 for _ in ft_predictions])/len(ft_predictions)
re_ranked_accuracy = sum([_['reranked_correct_position'] == 0 for _ in ft_predictions])/len(ft_predictions)

print(f'Accuracy without re-ranking: {retrieved_accuracy}')
print(f'Accuracy with re-ranking: {re_ranked_accuracy}')

In [None]:
# Re-ranking got slightly better after 1 epoch

In [None]:
ft_predictions_df = pd.DataFrame(ft_predictions)
ft_predictions_df.head()

In [None]:
ft_predictions_df[['retrieved_correct_position', 'reranked_correct_position']].isnull().sum()

In [None]:
ft_predictions_df[['retrieved_correct_position', 'reranked_correct_position']].mean()

In [None]:
# Recall@k for every cutoff in X, for OpenAI retrieval alone and for OpenAI
# retrieval followed by the fine-tuned cross-encoder. OPENAI_RETRIEVAL is
# rebuilt here from ft_predictions_df.
OPENAI_RETRIEVAL, OPENAI_RETRIEVAL_PLUS_FT_CE = [], []
num_ft_queries = len(ft_predictions_df)

for k in X:
    embedding_only_recall = int((ft_predictions_df['retrieved_correct_position'] < k).sum())
    reranked_recall = int((ft_predictions_df['reranked_correct_position'] < k).sum())
    print(k, embedding_only_recall, reranked_recall)
    OPENAI_RETRIEVAL.append(embedding_only_recall / num_ft_queries)
    OPENAI_RETRIEVAL_PLUS_FT_CE.append(reranked_recall / num_ft_queries)

In [None]:
logger.setLevel(logging.CRITICAL)
os_predictions = []

for i, question in tqdm(enumerate(test_sample), total=len(test_sample)):
    os_predictions.append(eval_ranking_open_source(bi_encoder, doc_emb, question['question'], top_k=TOP_K, re_rank_model=finetuned))

os_predictions_df = pd.DataFrame(os_predictions)
os_predictions_df.head()

In [None]:
raw_accuracy = sum([p['retrieved_correct_position'] == 0 for p in os_predictions])/len(os_predictions)
reranked_accuracy = sum([p['reranked_correct_position'] == 0 for p in os_predictions])/len(os_predictions)

print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


In [None]:
# Recall@k for every cutoff in X, for the open-source bi-encoder alone and
# for the bi-encoder followed by the fine-tuned cross-encoder.
OPEN_SOURCE_RETRIEVAL, OPEN_SOURCE_RETRIEVAL_PLUS_FT_CE = [], []
num_os_queries = len(os_predictions_df)

for k in X:
    embedding_only_recall = int((os_predictions_df['retrieved_correct_position'] < k).sum())
    reranked_recall = int((os_predictions_df['reranked_correct_position'] < k).sum())
    print(k, embedding_only_recall, reranked_recall)
    OPEN_SOURCE_RETRIEVAL.append(embedding_only_recall / num_os_queries)
    OPEN_SOURCE_RETRIEVAL_PLUS_FT_CE.append(reranked_recall / num_os_queries)

In [None]:
import matplotlib.pyplot as plt

# Creating the plot: recall@k for every retrieval / re-ranking combination
plt.figure(figsize=(10, 6))
plt.plot(X, OPENAI_RETRIEVAL, label='OAI Retrieval Only', marker='o')
plt.plot(X, OPEN_SOURCE_RETRIEVAL, label='OS Retrieval Only', marker='*')
plt.plot(X, OPEN_SOURCE_RETRIEVAL_PLUS_PRE_CE, label='OS + Pretrained CE', marker='^')

plt.plot(X, OPEN_SOURCE_RETRIEVAL_PLUS_FT_CE, label='OS + Finetuned CE', marker='v')
plt.plot(X, OLD_CROSS_ENCODER, label='OAI + Pretrained CE', marker='s')
plt.plot(X, OPENAI_RETRIEVAL_PLUS_FT_CE, label='OAI + Finetuned CE', marker='d')

# Adding titles and labels
plt.title('Comparing embedding models + pre-trained vs fine-tuned CE (all retrieved 50 results then re-ranked)')
plt.xlabel('Recall @')
plt.ylabel('Performance')
plt.xticks(X)
plt.yticks([i/100 for i in range(70, 101, 5)])  # y-ticks from 0.70 to 1.00 in steps of 0.05

# Adding legend
plt.legend()

# Turn on the grid
plt.grid(True)
# plt.show()

# Write the figure to disk at high resolution
plt.savefig('recall_at_k.png', dpi=1000)

In [None]:
# show results as a table: one row per recall cutoff, one column per method.
# The cutoffs come from X (the same list the recall loops iterated over) so the
# column always lines up with the recall lists.
results_df = pd.DataFrame({
    'RECALL @': X,
    'OS_Retrieval_Only': OPEN_SOURCE_RETRIEVAL,
    'OS_Retrieval_Plus_Finetuned_CE': OPEN_SOURCE_RETRIEVAL_PLUS_FT_CE,
    'OAI_Retrieval_Only': OPENAI_RETRIEVAL,
    'OAI_Retrieval_Plus_Pretrained_CE': OLD_CROSS_ENCODER,
    'OAI_Retrieval_Plus_Finetuned_CE': OPENAI_RETRIEVAL_PLUS_FT_CE,
})
results_df.sort_values(by='RECALL @')

In [None]:
import matplotlib.pyplot as plt

# Data: one bar per method, labelled with the column names used in results_df
methods = [
    "OS_Retrieval_Only",
    "OS_Retrieval_Plus_Finetuned_CE",
    "OAI_Retrieval_Only",
    "OAI_Retrieval_Plus_Pretrained_CE",
    "OAI_Retrieval_Plus_Finetuned_CE"
]

# NOTE(review): these values are hard-coded rather than read from results_df;
# presumably recall@1 from an earlier run - re-check them after re-running
# the notebook.
recalls = [0.501742, 0.642857, 0.753484, 0.833624, 0.890244]

# Create a bar plot
plt.figure(figsize=(8, 5))
bars = plt.bar(methods, recalls, color="skyblue")

# Add value labels on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,  # horizontally centred on the bar
        height + 0.005,  # slightly above the top of the bar
        f"{height:.3f}",
        ha="center",
        va="bottom",
        fontsize=9
    )

# Labeling and aesthetics
plt.title("Recall@1 Across Different Methods")
plt.ylabel("Recall@1")
plt.xticks(rotation=25, ha="right")  # Rotate x-axis labels so the long method names do not overlap
plt.ylim([0, 1])                    # Since recall values typically range [0,1]
plt.tight_layout()

# Display the plot
plt.show()
