In [1]:
import pickle
import sys
import time
from datetime import datetime

import numpy as np
from scipy.spatial import distance
from tqdm import tqdm

# The project root and its src/ directory must be on sys.path before the
# project-local import below can be resolved.
sys.path.append('..')
sys.path.append('../src/')
from src.graph_class import Graph

In [4]:
# Reader for the .fvecs file format used by the SIFT data sets.
def read_fvecs(fp):
    """Load an ``.fvecs`` file and return its vectors as a float32 matrix.

    The file is read as a flat int32 stream whose first value is taken as the
    vector dimension ``d``. The stream is reshaped into rows of ``d + 1``
    values, the leading value of every row is dropped, and the remaining bits
    are reinterpreted as float32.

    Args:
        fp: File name (or open file) accepted by ``np.fromfile``.

    Returns:
        A 2-D float32 array with one row per stored vector and ``d`` columns.
    """
    raw = np.fromfile(fp, dtype='int32')
    dim = raw[0]
    rows = raw.reshape(-1, dim + 1)
    # Copy after slicing so the data is contiguous before it is re-viewed
    # with a different dtype.
    payload = rows[:, 1:].copy()
    return payload.view('float32')

def read_ivecs(fname):
    """Load an ``.ivecs`` file and return its vectors as an int32 matrix.

    The file is read as a flat int32 stream whose first value is taken as the
    vector dimension ``d``. The stream is reshaped into rows of ``d + 1``
    values and the leading value of every row is dropped.

    Args:
        fname: File name (or open file) accepted by ``np.fromfile``.

    Returns:
        A 2-D int32 array with one row per stored vector and ``d`` columns,
        copied so it does not alias the raw buffer.
    """
    raw = np.fromfile(fname, dtype='int32')
    dim = raw[0]
    rows = raw.reshape(-1, dim + 1)
    return rows[:, 1:].copy()

In [6]:
def calculate_recall(predicted_neighbors, actual_neighbors):
    """Return the mean per-query recall of the predicted neighbours.

    For each (predicted, actual) pair the recall is
    ``|set(predicted) & set(actual)| / |set(actual)|``; a query whose actual
    set is empty contributes 0. The per-query values are summed and divided
    by ``len(actual_neighbors)``.

    Pairs are formed with ``zip``, so if the two inputs differ in length the
    surplus entries are ignored; because the divisor is always
    ``len(actual_neighbors)``, queries without a prediction effectively
    count as recall 0.

    Args:
        predicted_neighbors: Sequence of iterables of hashable neighbour ids,
            one per query.
        actual_neighbors: Sized sequence of iterables of hashable ground-truth
            ids, one per query.

    Returns:
        The average recall as a float; ``0.0`` when ``actual_neighbors`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    num_queries = len(actual_neighbors)
    if num_queries == 0:
        # Nothing to evaluate: avoid dividing by zero below.
        return 0.0

    total_recall = 0.0
    for pred, actual in zip(predicted_neighbors, actual_neighbors):
        relevant = set(actual)
        if relevant:
            total_recall += len(set(pred) & relevant) / len(relevant)

    return total_recall / num_queries

In [7]:
# Read in the siftsmall data set.
# Paths are relative to the notebook's working directory (the same convention
# as the '../graphs/...' pickle paths used below) instead of an absolute,
# user-specific location, so the notebook can run on another machine.
# NOTE(review): '../data/siftsmall' is inferred from the previous absolute
# paths (<project>/data/siftsmall); adjust DATA_DIR if the data lives elsewhere.
DATA_DIR = '../data/siftsmall'

# Base vectors: the collection we will search through (the small SIFT subset,
# not the 1M-vector set).
base = read_fvecs(DATA_DIR + '/siftsmall_base.fvecs')
# Query vectors to search with.
query = read_fvecs(DATA_DIR + '/siftsmall_query.fvecs')

# Ground-truth neighbours for the queries; sliced per query further down.
groundtruth = read_ivecs(DATA_DIR + '/siftsmall_groundtruth.ivecs')

In [120]:
# Load the pre-built "fixed" graph object; the cells below use its .graph
# attribute and its .greedy_search / .beam_search methods.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# graph files that were produced by this project.
with open('../graphs/fixed-graph-set-k3-m3.pkl', 'rb') as f:
    ggraph = pickle.load(f)

In [40]:
# Load the pre-built "beam" graph object; the benchmark cell further down uses
# its .graph attribute and its .greedy_search / .beam_search methods.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# graph files that were produced by this project.
with open('../graphs/beam-graph-set-k3-m1.pkl', 'rb') as f:
    bgraph = pickle.load(f)

In [121]:
# Benchmark greedy vs. beam search on `ggraph`: for every query vector, time
# each search separately and collect element [1] of every returned result
# (presumably the neighbour id -- confirm against Graph.greedy_search).
k = 10  # number of neighbours requested per query
results_greedy = []
results_beam = []
greedytime = 0.0  # accumulated seconds spent on greedy search
beamtime = 0.0    # accumulated seconds spent on beam search
for q in tqdm(query):
    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; datetime.now() is wall-clock time and can be
    # coarse or jump when the system clock is adjusted.
    start = time.perf_counter()
    g = [r[1] for r in ggraph.greedy_search(ggraph.graph, q, k=k, m=10)[0]]
    greedytime += time.perf_counter() - start

    start = time.perf_counter()
    b = [r[1] for r in ggraph.beam_search(ggraph.graph, q, k=k, m=10)[0]]
    beamtime += time.perf_counter() - start

    results_greedy.append(g)
    results_beam.append(b)

100%|██████████| 100/100 [00:11<00:00,  8.81it/s]


In [119]:
# Report the accumulated search times in seconds: greedy first, then beam,
# one value per line.
print(greedytime, beamtime, sep='\n')

5.498977999999996
6.101148999999999


In [64]:
# Average neighbourhood size per vertex of `graph`.
# NOTE(review): `graph` is not defined in any cell visible in this notebook
# (only `ggraph` and `bgraph` are loaded); it must be bound to the graph object
# to inspect before this cell runs on a fresh kernel.
# The accumulator is no longer called `sum`, which shadowed the builtin for
# the rest of the kernel session.
num_vertices = len(graph.graph)
total_edges = sum(len(graph.graph[i].neighborhood) for i in range(num_vertices))

print("Average number of edges per vertex: ", total_edges / num_vertices)

Average number of edges per vertex:  5.9988


In [84]:
# Keep only the first k ground-truth columns of every row so each row has as
# many entries as the k neighbours requested from the searches.
# Requires `k` to have been set by a benchmark cell beforehand.
true = groundtruth[:, :k]

In [122]:
# Mean recall of the greedy-search results against the truncated ground truth.
average_recall = calculate_recall(results_greedy, true)
print(average_recall)

0.8630000000000004


In [123]:
# Mean recall of the beam-search results against the truncated ground truth.
# Note this rebinds `average_recall` from the greedy cell.
average_recall = calculate_recall(results_beam, true)
print(average_recall)

0.8880000000000001


In [48]:
# Benchmark greedy vs. beam search on `bgraph`: for every query vector, time
# each search separately and collect element [1] of every returned result
# (presumably the neighbour id -- confirm against Graph.greedy_search).
# This rebinds k, results_greedy, results_beam, greedytime and beamtime.
k = 10  # number of neighbours requested per query
results_greedy = []
results_beam = []
greedytime = 0.0  # accumulated seconds spent on greedy search
beamtime = 0.0    # accumulated seconds spent on beam search
for q in tqdm(query):
    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; datetime.now() is wall-clock time and can be
    # coarse or jump when the system clock is adjusted.
    start = time.perf_counter()
    g = [r[1] for r in bgraph.greedy_search(bgraph.graph, q, k=k, m=1)[0]]
    greedytime += time.perf_counter() - start

    start = time.perf_counter()
    b = [r[1] for r in bgraph.beam_search(bgraph.graph, q, k=k, m=1)[0]]
    beamtime += time.perf_counter() - start

    results_greedy.append(g)
    results_beam.append(b)

100%|██████████| 100/100 [00:01<00:00, 93.27it/s]


In [12]:
def test():
    """Build three NSWGraph instances of increasing size and time 100 searches on each.

    The graphs are created first, then filled with 1000, 2000 and 4000 random
    10-dimensional points respectively, and finally each one is timed. For
    every graph the node count, the elapsed time of the 100 greedy searches
    and a blank line are printed.

    NOTE(review): NSWGraph is not defined in the visible part of this notebook;
    it must be available in the kernel for this function to run.
    """
    def report_timing(graph):
        # Time 100 greedy searches with fresh random query points.
        began = datetime.now()
        for _ in range(100):
            graph.greedy_search(np.random.rand(10))
        finished = datetime.now()
        print(len(graph.nodes))
        print(finished - began)
        print()

    def populate(graph, node_count):
        # Insert `node_count` random 10-dimensional points into the graph.
        for _ in range(node_count):
            graph.add_node(np.random.rand(10))

    sizes = (1000, 2000, 4000)

    # Same order as before: construct every graph, then fill them all, then
    # time them all.
    graphs = [NSWGraph() for _ in sizes]
    for nsw, size in zip(graphs, sizes):
        populate(nsw, size)
    for nsw in graphs:
        report_timing(nsw)

test()

1000
0:00:00.108000

2000
0:00:00.214994

4000
0:00:00.432016

