In [146]:
# calculate hitrate for the scope
import os
import json
import lancedb
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer



In [177]:
# not a very big dataset, 87k rows and <16mb
train = load_dataset("rajpurkar/squad", split="train")
validation = load_dataset("rajpurkar/squad", split="validation")
question_df = pd.concat([pd.DataFrame(train), pd.DataFrame(validation)])
print(question_df.shape)
question_df.head(20)

(98169, 5)


Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."
6,5733bf84d058e614000b61bf,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",How often is Notre Dame's the Juggler published?,"{'text': ['twice'], 'answer_start': [441]}"
7,5733bf84d058e614000b61c0,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",What is the daily student paper at Notre Dame ...,"{'text': ['The Observer'], 'answer_start': [598]}"
8,5733bf84d058e614000b61bd,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...,"{'text': ['three'], 'answer_start': [126]}"
9,5733bf84d058e614000b61c1,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...,"{'text': ['1987'], 'answer_start': [908]}"


In [148]:
question_df.iloc[12].answers

{'text': ['Old College'], 'answer_start': [234]}

In [149]:
scope_id = "scopes-007"
metric = "cosine"

In [150]:
scope = json.load(open(f"scopes/{scope_id}_{metric}.json"))
scope

{'directory': '/Users/enjalot/latent-scope-data',
 'scope_id': 'scopes-007',
 'dataset': 'squad',
 'metric': 'cosine',
 'db_uri': '/Users/enjalot/latent-scope-data/squad/lancedb',
 'table_name': 'scopes-007_cosine',
 'embedding_id': 'embedding-001',
 'model_name': 'BAAI/bge-base-en-v1.5'}

In [151]:
db = lancedb.connect(scope["db_uri"])
table = db.open_table(scope["table_name"])

In [152]:
table.schema

index: int64
title: string
context: string
x: float
y: float
cluster: int64
raw_cluster: int64
label: string
vector: fixed_size_list<item: float>[768]
  child 0, item: float

In [153]:
model = SentenceTransformer(scope["model_name"], trust_remote_code=True, device="mps")


In [154]:
results = []
embeddings = []
topk = 10

for index, row in tqdm(question_df.iterrows(), total=question_df.shape[0]):
    question = row['question']
    question_embedding = model.encode(question)
    embeddings.append(question_embedding)

    # for scopes-006 where we truncated to 64 dimensions
    # Truncate to 64 dimensions and normalize
    # truncated_embedding = question_embedding[:64]
    # normalized_embedding = truncated_embedding / np.linalg.norm(truncated_embedding)
    # question_embedding = normalized_embedding

    search_results = table.search(question_embedding).metric(scope["metric"]).limit(topk).to_list()
    # print("RESULTS", search_results)
    search_results_contexts = [r["context"] for r in search_results]
    match_index = -1
    for i, context in enumerate(search_results_contexts):
        if context == row["context"]:
            match_index = i
            break

    results.append({
        "question": question,
        "results": [s['index'] for s in search_results],
        "match_index": match_index,

    })

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)
results_df.head()


100%|██████████| 98169/98169 [28:47<00:00, 56.82it/s]


Unnamed: 0,question,results,match_index
0,To whom did the Virgin Mary allegedly appear i...,"[0, 6915, 17590, 268, 6907, 6913, 14154, 6929,...",0
1,What is in front of the Notre Dame Main Building?,"[29, 38, 12630, 0, 12632, 12914, 3701, 50, 17,...",3
2,The Basilica of the Sacred heart at Notre Dame...,"[38, 28, 0, 14156, 14202, 11426, 8646, 3685, 1...",2
3,What is the Grotto at Notre Dame?,"[0, 38, 28, 17, 3721, 4591, 9, 29, 40, 7239]",0
4,What sits on top of the Main Building at Notre...,"[38, 29, 17, 7, 9, 9560, 0, 11436, 30, 27]",6


In [155]:
results_df.tail(20)

Unnamed: 0,question,results,match_index
98149,What is resposible for speeding up or slowing ...,"[10910, 10907, 10912, 9330, 5049, 10918, 5390,...",8
98150,What is the only form potential energy can cha...,"[14646, 14667, 14670, 14650, 14645, 2496, 1464...",-1
98151,What is the only form kinetic energy can chang...,"[14646, 14667, 14670, 14650, 14672, 14645, 146...",-1
98152,What is preserved in a closed system of forces...,"[20954, 20956, 20923, 20924, 14647, 20955, 209...",0
98153,What is the force between two locations relate...,"[20925, 20935, 20954, 20926, 20951, 20932, 209...",2
98154,What is the force called rgarding a potential ...,"[20932, 20925, 20954, 20955, 20942, 20935, 146...",2
98155,What is sometimes impossible to model?,"[13736, 19166, 14429, 19171, 19154, 20955, 191...",5
98156,Why are some forces due to that are impossible...,"[20955, 20925, 20924, 20949, 20923, 14783, 209...",0
98157,What do electrostatic gradiient potentials cre...,"[20955, 10930, 14650, 11565, 14674, 15940, 209...",0
98158,"Tension, compression, and drag are what kind o...","[20955, 20949, 20951, 20924, 20929, 20925, 209...",0


In [156]:
# Count the number of results where match_index is greater than 0
num_matches = (results_df['match_index'] >= 0).sum()

print(f"Number of results where match_index >= 0: {num_matches}")


Number of results where match_index >= 0: 82020


In [157]:
if not os.path.exists("results"):
    os.makedirs("results")
results_df.to_parquet(f"results/{scope['table_name']}.parquet")


In [158]:
question_df.iloc[98161]

id                                   5737a9afc3c5551400e51f63
title                                                   Force
context     The connection between macroscopic nonconserva...
question        What is the exchange of heat associated with?
answers     {'text': ['nonconservative forces', 'nonconser...
Name: 10562, dtype: object

In [159]:
# if a saved umap model is available, we can calculate the x,y positions of the questions to save along side

In [160]:
import pickle
import os

# Construct the path to the UMAP pickle file
scopes_meta = json.load(open(os.path.join(scope["directory"], scope["dataset"], "scopes", f"{scope['scope_id']}.json")))
umap_id = scopes_meta["umap_id"]

umap_file_path = os.path.join(scope["directory"], scope["dataset"], "umaps", f"{umap_id}.pkl")

print(f"loading UMAP model from: {umap_file_path}")

# Load the UMAP model
with open(umap_file_path, 'rb') as f:
    umap_model = pickle.load(f)

print(f"UMAP model loaded")



loading UMAP model from: /Users/enjalot/latent-scope-data/squad/umaps/umap-007.pkl
Mon Oct 28 13:11:41 2024 Building and compiling search function
UMAP model loaded


In [161]:
xy = umap_model.transform(embeddings)

Epochs completed:   0%|            0/30 [00:00]

	completed  0  /  30 epochs
	completed  3  /  30 epochs
	completed  6  /  30 epochs
	completed  9  /  30 epochs
	completed  12  /  30 epochs
	completed  15  /  30 epochs
	completed  18  /  30 epochs
	completed  21  /  30 epochs
	completed  24  /  30 epochs
	completed  27  /  30 epochs


In [162]:
# Read in the UMAP metadata JSON
umap_metadata_path = os.path.join(scope["directory"], scope["dataset"], "umaps", f"{umap_id}.json")
with open(umap_metadata_path, 'r') as f:
    umap_metadata = json.load(f)

print("UMAP metadata loaded:")
print(json.dumps(umap_metadata, indent=2))


UMAP metadata loaded:
{
  "id": "umap-007",
  "embedding_id": "embedding-001",
  "neighbors": 25,
  "min_dist": 0.1,
  "min_values": [
    -1.246901512145996,
    -3.9628655910491943
  ],
  "max_values": [
    15.80892562866211,
    9.830828666687012
  ]
}


In [163]:
min_values = np.array(umap_metadata['min_values'])
max_values = np.array(umap_metadata['max_values'])
normalized_points = (xy - min_values) / (max_values - min_values)
normalized_points = 2 * normalized_points - 1
results_df["x"] = normalized_points[:, 0]
results_df["y"] = normalized_points[:, 1]
results_df.head()

Unnamed: 0,question,results,match_index,x,y
0,To whom did the Virgin Mary allegedly appear i...,"[0, 6915, 17590, 268, 6907, 6913, 14154, 6929,...",0,0.779924,0.529662
1,What is in front of the Notre Dame Main Building?,"[29, 38, 12630, 0, 12632, 12914, 3701, 50, 17,...",3,-0.062185,-0.242264
2,The Basilica of the Sacred heart at Notre Dame...,"[38, 28, 0, 14156, 14202, 11426, 8646, 3685, 1...",2,0.068487,-0.030443
3,What is the Grotto at Notre Dame?,"[0, 38, 28, 17, 3721, 4591, 9, 29, 40, 7239]",0,-0.076982,-0.317033
4,What sits on top of the Main Building at Notre...,"[38, 29, 17, 7, 9, 9560, 0, 11436, 30, 27]",6,-0.081414,-0.346759


In [164]:
results_df.to_parquet(f"results/{scope['table_name']}.parquet")

In [178]:
# make sure the results are saved as int32
scope_id = "scopes-007"
metric = "cosine"
scope = json.load(open(f"scopes/{scope_id}_{metric}.json"))

In [179]:
test_df = pd.read_parquet(f"results/{scope['table_name']}.parquet")

In [180]:
test_df.head()

Unnamed: 0,question,results,match_index,x,y,answer_index
0,To whom did the Virgin Mary allegedly appear i...,"[0, 6915, 17590, 268, 6907, 6913, 14154, 6929,...",0,0.779924,0.529662,0
1,What is in front of the Notre Dame Main Building?,"[29, 38, 12630, 0, 12632, 12914, 3701, 50, 17,...",3,-0.062185,-0.242264,0
2,The Basilica of the Sacred heart at Notre Dame...,"[38, 28, 0, 14156, 14202, 11426, 8646, 3685, 1...",2,0.068487,-0.030443,0
3,What is the Grotto at Notre Dame?,"[0, 38, 28, 17, 3721, 4591, 9, 29, 40, 7239]",0,-0.076982,-0.317033,0
4,What sits on top of the Main Building at Notre...,"[38, 29, 17, 7, 9, 9560, 0, 11436, 30, 27]",6,-0.081414,-0.346759,0


In [181]:
test_df['results'] = test_df['results'].apply(lambda x: np.array(x, dtype=np.int32))
test_df.head()


Unnamed: 0,question,results,match_index,x,y,answer_index
0,To whom did the Virgin Mary allegedly appear i...,"[0, 6915, 17590, 268, 6907, 6913, 14154, 6929,...",0,0.779924,0.529662,0
1,What is in front of the Notre Dame Main Building?,"[29, 38, 12630, 0, 12632, 12914, 3701, 50, 17,...",3,-0.062185,-0.242264,0
2,The Basilica of the Sacred heart at Notre Dame...,"[38, 28, 0, 14156, 14202, 11426, 8646, 3685, 1...",2,0.068487,-0.030443,0
3,What is the Grotto at Notre Dame?,"[0, 38, 28, 17, 3721, 4591, 9, 29, 40, 7239]",0,-0.076982,-0.317033,0
4,What sits on top of the Main Building at Notre...,"[38, 29, 17, 7, 9, 9560, 0, 11436, 30, 27]",6,-0.081414,-0.346759,0


In [182]:
test_df.to_parquet(f"results/{scope['table_name']}.parquet")

In [169]:
# add the answer index for the question

context_df = pd.read_parquet(f"{scope["directory"]}/{scope['dataset']}/scopes/{scope_id}-input.parquet")
context_df.head()

Unnamed: 0,index,title,context,x,y,cluster,raw_cluster,label
0,0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",0.779956,0.54428,137,137,Mary Jesus God
1,1,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",-0.0431,-0.280547,252,-1,University city College
2,2,University_of_Notre_Dame,The university is the major seat of the Congre...,-0.049068,-0.314146,248,248,University university students
3,3,University_of_Notre_Dame,The College of Engineering was established in ...,-0.080649,-0.305042,248,248,University university students
4,4,University_of_Notre_Dame,All of Notre Dame's undergraduate students are...,-0.087783,-0.317861,248,248,University university students


In [170]:
# Create a list to store the matching indices
# TODO: this could have just been i % 5 or something since the questions are in order and there are 5 questions per context
matching_indices = []

# Iterate through each row in test_df
for i, test_row in test_df.iterrows():
    # Get the question's context
    question_context = question_df.iloc[i]['context']
    
    # Find matching index in context_df
    match_idx = context_df[context_df['context'] == question_context].index
    
    # If match found, append the index, otherwise append -1
    if len(match_idx) > 0:
        matching_indices.append(match_idx[0])
    else:
        matching_indices.append(-1)




In [171]:
# matching_indices

In [172]:
# add answer_index
scope_id = "scopes-007"
metric = "cosine"
scope = json.load(open(f"scopes/{scope_id}_{metric}.json"))
test_df = pd.read_parquet(f"results/{scope['table_name']}.parquet")

In [173]:
# Add the matching indices as a new column
test_df['answer_index'] = matching_indices

In [174]:
test_df.head(10)

Unnamed: 0,question,results,match_index,x,y,answer_index
0,To whom did the Virgin Mary allegedly appear i...,"[0, 6915, 17590, 268, 6907, 6913, 14154, 6929,...",0,0.779924,0.529662,0
1,What is in front of the Notre Dame Main Building?,"[29, 38, 12630, 0, 12632, 12914, 3701, 50, 17,...",3,-0.062185,-0.242264,0
2,The Basilica of the Sacred heart at Notre Dame...,"[38, 28, 0, 14156, 14202, 11426, 8646, 3685, 1...",2,0.068487,-0.030443,0
3,What is the Grotto at Notre Dame?,"[0, 38, 28, 17, 3721, 4591, 9, 29, 40, 7239]",0,-0.076982,-0.317033,0
4,What sits on top of the Main Building at Notre...,"[38, 29, 17, 7, 9, 9560, 0, 11436, 30, 27]",6,-0.081414,-0.346759,0
5,When did the Scholastic Magazine of Notre dame...,"[1, 8555, 38, 9832, 12548, 45, 26, 15, 9833, 9]",0,-0.082958,-0.327221,1
6,How often is Notre Dame's the Juggler published?,"[1, 21, 38, 15, 12628, 8555, 17, 8557, 16515, ...",0,-0.097537,-0.362245,1
7,What is the daily student paper at Notre Dame ...,"[1, 8555, 10591, 16515, 10516, 7946, 38, 10590...",0,-0.051903,-0.286697,1
8,How many student news papers are found at Notr...,"[1, 15, 8555, 9, 8, 10516, 10591, 38, 16515, 40]",0,-0.056738,-0.311108,1
9,In what year did the student paper Common Sens...,"[1, 38, 8555, 8557, 29, 35, 616, 20248, 4, 6]",0,-0.082132,-0.316825,1


In [175]:
test_df.to_parquet(f"results/{scope['table_name']}.parquet")

In [187]:
print([round(x, 3) for x in embeddings[0][0:50].tolist()])

[-0.046, -0.02, -0.02, -0.001, 0.042, 0.066, 0.089, 0.086, -0.043, -0.067, 0.004, -0.025, 0.041, 0.042, -0.015, 0.049, 0.018, 0.03, -0.012, -0.062, 0.012, 0.021, -0.003, 0.004, -0.028, -0.016, 0.002, -0.007, -0.037, 0.001, 0.017, -0.011, 0.015, -0.05, -0.008, 0.0, -0.003, -0.044, -0.027, 0.02, -0.005, -0.057, 0.015, 0.061, -0.026, -0.025, 0.034, 0.058, -0.035, 0.046]
