In [19]:
# calculate hitrate for the scope
import os
import json
import lancedb
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer



In [7]:
# not a very big dataset, 87k rows and <16mb
train = load_dataset("rajpurkar/squad", split="train")
validation = load_dataset("rajpurkar/squad", split="validation")
question_df = pd.concat([pd.DataFrame(train), pd.DataFrame(validation)])
print(question_df.shape)
question_df.head()

(98169, 5)


Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [10]:
scope = json.load(open("scopes/scopes-001_cosine.json"))
scope

{'directory': '/Users/enjalot/latent-scope-data',
 'scope_id': 'scopes-001',
 'dataset': 'squad',
 'metric': 'cosine',
 'db_uri': '/Users/enjalot/latent-scope-data/squad/lancedb',
 'table_name': 'scopes-001_cosine',
 'embedding_id': 'embedding-002',
 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}

In [11]:
db = lancedb.connect(scope["db_uri"])
table = db.open_table(scope["table_name"])

In [16]:
table.schema

index: int64
title: string
context: string
x: float
y: float
cluster: int64
raw_cluster: int64
label: string
vector: fixed_size_list<item: float>[384]
  child 0, item: float

In [20]:
model = SentenceTransformer(scope["model_name"], trust_remote_code=True, device="mps")


In [23]:
results = []
topk = 5

for index, row in tqdm(question_df.iterrows(), total=question_df.shape[0]):
    question = row['question']
    question_embedding = model.encode(question)
    search_results = table.search(question_embedding).metric(scope["metric"]).limit(topk).to_list()
    # print("RESULTS", search_results)
    search_results_contexts = [r["context"] for r in search_results]
    match_index = -1
    for i, context in enumerate(search_results_contexts):
        if context == row["context"]:
            match_index = i
            break

    results.append({
        "question": question,
        "results": [s['index'] for s in search_results],
        "match_index": match_index
    })

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)
results_df.head()


100%|██████████| 98169/98169 [23:19<00:00, 70.15it/s]  


Unnamed: 0,question,results,match_index
0,To whom did the Virgin Mary allegedly appear i...,"[0, 6913, 12247, 17590, 6931]",0
1,What is in front of the Notre Dame Main Building?,"[38, 29, 28, 0, 7]",3
2,The Basilica of the Sacred heart at Notre Dame...,"[28, 38, 0, 17, 10608]",2
3,What is the Grotto at Notre Dame?,"[38, 28, 0, 29, 17]",2
4,What sits on top of the Main Building at Notre...,"[38, 29, 27, 28, 2]",-1


In [27]:
results_df.tail(20)

Unnamed: 0,question,results,match_index
98149,What is resposible for speeding up or slowing ...,"[10912, 20953, 13405, 20919, 10910]",1
98150,What is the only form potential energy can cha...,"[14667, 14650, 14646, 14648, 14670]",-1
98151,What is the only form kinetic energy can chang...,"[14667, 14646, 14650, 14669, 14672]",-1
98152,What is preserved in a closed system of forces...,"[20954, 20956, 20923, 20926, 20935]",0
98153,What is the force between two locations relate...,"[20926, 20935, 20954, 20948, 20950]",2
98154,What is the force called rgarding a potential ...,"[20954, 20932, 20955, 20926, 20950]",0
98155,What is sometimes impossible to model?,"[1554, 19147, 19143, 16586, 19172]",-1
98156,Why are some forces due to that are impossible...,"[20955, 20926, 20924, 20950, 20935]",0
98157,What do electrostatic gradiient potentials cre...,"[20955, 20954, 14674, 20945, 20932]",0
98158,"Tension, compression, and drag are what kind o...","[20950, 20932, 20926, 20925, 20949]",-1


In [35]:
# Count the number of results where match_index is greater than 0
num_matches = (results_df['match_index'] >= 0).sum()

print(f"Number of results where match_index >= 0: {num_matches}")


Number of results where match_index >= 0: 75577


In [25]:
if not os.path.exists("results"):
    os.makedirs("results")
results_df.to_parquet(f"results/{scope['table_name']}.parquet")


In [36]:
question_df.iloc[98161]

id                                   5737a9afc3c5551400e51f63
title                                                   Force
context     The connection between macroscopic nonconserva...
question        What is the exchange of heat associated with?
answers     {'text': ['nonconservative forces', 'nonconser...
Name: 10562, dtype: object