In [1]:
import sys
# Extend the module search path with a local checkout; presumably this is what
# lets the project packages imported in the next cell resolve -- TODO confirm.
sys.path.append('/fsx/matzeni/duck')

In [2]:
import torch
import h5py
import json
from pathlib import Path
import pickle
from tqdm import tqdm
import logging
from transformers import AutoTokenizer
import numpy as np
import faiss

from mblink.utils.utils import EntityCatalogue
from duck.datamodule import RelationCatalogue
from duck.common.utils import load_json
from duck.preprocessing.duck_index import DuckIndex

logger = logging.getLogger()

In [32]:
# Load the entity catalogue, the relation catalogue and the entity -> relations
# mapping. Every artefact lives in the same directory, so the directory is
# named once here; point DATA_DIR elsewhere to run against another copy.
DATA_DIR = Path('/fsx/matzeni/data/duck')

# Each catalogue is built from an .h5 file plus a companion *_idx.txt file.
# Paths are passed as plain strings, exactly as before.
ent_catalogue = EntityCatalogue(
    str(DATA_DIR / 'bert_ent_256.h5'),
    str(DATA_DIR / 'bert_ent_256_idx.txt')
)
rel_catalogue = RelationCatalogue(
    str(DATA_DIR / 'bert_rel_256.h5'),
    str(DATA_DIR / 'bert_rel_256_idx.txt')
)
# Later cells iterate over ent_to_rel and index it by entity to obtain that
# entity's relations.
ent_to_rel = load_json(
    str(DATA_DIR / 'ent_to_rel.json')
)

In [35]:
# Materialise the catalogue keys as lists and hand them, together with the
# entity -> relations mapping, to a CPU-side DuckIndex.
entities = [*ent_catalogue.idx.keys()]
relations = [*rel_catalogue.idx.keys()]
duck_index = DuckIndex(entities, relations, ent_to_rel, gpu=False)

In [36]:
# Load a previously saved index file into duck_index (the file name suggests a
# flat FAISS index -- TODO confirm against DuckIndex.load).
duck_index.load("/fsx/matzeni/data/duck/duck_index_flat.faiss")

In [43]:
# Qualitative check: ask for the k=4 nearest neighbours of a handful of
# well-known entities. The recorded output is a dict mapping each query
# string to a list of four neighbour names.
duck_index.search([
    "Italy",
    "Donald Trump",
    "Cristiano Ronaldo",
    "Justin Bieber",
    "London",
    "Lion"
], k=4)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:17<00:00, 17.46s/it]


{'Italy': ['Portugal', 'Norway', 'Poland', 'Spain'],
 'Donald Trump': ['Trump Donald',
  "Donald J. Trump's",
  'Joe Biden',
  'Clinton hillary'],
 'Cristiano Ronaldo': ['Lionel Messi', 'Gerard Piqué', 'Neymar', 'Ronaldinho'],
 'Justin Bieber': ['Bieber Justin',
  'Harry Styles',
  'Chris Brown',
  'Ed Sheeran'],
 'London': ['Istanbul', 'Madrid', 'Stockholm', 'Istanbul Province'],
 'Lion': ['Tiger', 'Leopard', 'Cheetah', 'Jaguar']}

In [9]:
# GPU resources handle; later cells pass `res` to faiss.index_cpu_to_gpu and
# to faiss.GpuIndexBinaryFlat.
res = faiss.StandardGpuResources()

In [54]:
# Experiment: try to move a binary HNSW index to the GPU.
# Every index_cpu_to_gpu overload expects a (float) faiss::Index, so passing an
# IndexBinaryHNSW is rejected with a TypeError. The failure is the point of
# this cell, so it is caught and reported instead of aborting "Run All".
index = faiss.IndexBinaryHNSW(8888, 16)
try:
    faiss.index_cpu_to_gpu(res, 0, index)
except TypeError as err:
    print(f"IndexBinaryHNSW cannot be moved to the GPU: {err}")

TypeError: Wrong number or type of arguments for overloaded function 'index_cpu_to_gpu'.
  Possible C/C++ prototypes are:
    faiss::gpu::index_cpu_to_gpu(faiss::gpu::GpuResourcesProvider *,int,faiss::Index const *,faiss::gpu::GpuClonerOptions const *)
    faiss::gpu::index_cpu_to_gpu(faiss::gpu::GpuResourcesProvider *,int,faiss::Index const *)


In [63]:
# Binary index backed by a float HNSW index.
# `d` used to be picked up from a cell further down the notebook (`d = 888`),
# so this cell raised NameError on a fresh kernel run top to bottom. It is
# defined here with that same value.
d = 888  # vector dimensionality in bits (a multiple of 8)
index_hnsw_float = faiss.IndexHNSWFlat(d, 16)
index_hnsw_ref = faiss.IndexBinaryFromFloat(index_hnsw_float)

In [31]:
# Build a binary index from a factory string.
d = 888  # dimensionality in bits; 888 is a multiple of 8 (111 bytes)
# NOTE(review): "BHash4x32" presumably selects a multi-hash binary index with
# 4 hash tables of 32 bits each -- confirm against the faiss factory docs.
index = faiss.index_binary_factory(d, "BHash4x32")

In [None]:
# Sanity check: True when the catalogue's index values are exactly
# 0, 1, ..., len(ent_catalogue) - 1 in iteration order.
[*ent_catalogue.idx.values()] == [*range(len(ent_catalogue))]

In [None]:
# Build a binary FAISS index on the GPU in which every entity is represented
# by a bit vector with one bit per relation (bit set <=> the entity has that
# relation according to ent_to_rel).
BATCH_SIZE = 5000     # number of entities sent to the index per add() call
MAX_ENTITIES = 30000  # only the first MAX_ENTITIES entities are indexed

nrels = len(rel_catalogue)
# Round the dimension up to the next multiple of 8 so that each row packs
# into a whole number of bytes.
dim = nrels
if nrels % 8 != 0:
    dim = nrels + 8 - (nrels % 8)
base_index = faiss.IndexBinaryFlat(dim)
index = faiss.GpuIndexBinaryFlat(res, base_index)

# Row i of `eye` is the one-hot vector of relation i. Allocating it as uint8
# directly avoids the dim x dim float64 temporary of np.eye(dim).astype(...).
eye = np.eye(dim, dtype=np.uint8)
one_hots = []
repr_to_ent = {}

for i, e in tqdm(enumerate(ent_to_rel), total=min(len(ent_to_rel), MAX_ENTITIES)):
    rels = ent_to_rel[e]
    indexes = [rel_catalogue[r][0] for r in rels]
    # Summing the selected one-hot rows yields the multi-hot relation vector.
    one_hot = np.sum(eye[indexes], axis=0)
    one_hots.append(one_hot)
    # NOTE(review): flagged by the original author; presumably entities with
    # identical relation sets share the same key, so later entities overwrite
    # earlier ones -- confirm before relying on repr_to_ent.
    repr_to_ent[one_hot.tobytes()] = e # this is a bug
    if (i + 1) % BATCH_SIZE == 0:
        data = np.stack(one_hots)
        data = np.packbits(data, axis=-1)
        index.add(data)
        one_hots = []
    if i + 1 >= MAX_ENTITIES:
        break

# Flush the final partial batch. Previously these vectors were silently
# dropped whenever the number of processed entities was not a multiple of
# BATCH_SIZE (e.g. when ent_to_rel holds fewer than MAX_ENTITIES entries).
if one_hots:
    data = np.stack(one_hots)
    data = np.packbits(data, axis=-1)
    index.add(data)
    one_hots = []

In [None]:
# Persist the index to disk.
# The vectors were added to the GPU-side `index`, not to `base_index`, so copy
# them back to the CPU index first; otherwise an empty index is written.
index.copyTo(base_index)
# faiss.write_index only accepts float indexes (faiss::Index); binary indexes
# are serialised with write_index_binary.
faiss.write_index_binary(base_index, "index.faiss")

In [None]:
# Copy the contents of the GPU-side `index` into the CPU-side `base_index`.
index.copyTo(base_index)

In [None]:
# Number of vectors currently stored in the CPU-side index.
base_index.ntotal

In [None]:
# Entity identifiers in the mapping's iteration order (the same order used
# when looping over ent_to_rel).
all_ents = list(ent_to_rel)

In [None]:
# Encode one entity the same way as the indexed vectors and look up its four
# nearest neighbours; the cell displays the neighbour ids.
ent = all_ents[3]
print(ent)
rels = ent_to_rel[ent]
indexes = [rel_catalogue[r][0] for r in rels]
one_hot = eye[indexes].sum(axis=0)
# Pack the bits into bytes and add a leading batch axis: shape (1, n_bytes).
key = np.packbits(one_hot)[np.newaxis, ...]

D, I = index.search(key, k=4)
I

In [None]:
# Synthetic benchmark data: random bit matrices for the database and queries.
d = 1000                           # dimension
nb = 500000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
# Draw the bits as uint8 directly. The default integer dtype followed by
# .astype(np.uint8) allocates an 8-byte-per-bit temporary (about 4 GB for xb).
# The drawn values differ from the int64-then-cast stream for the same seed,
# but remain reproducible.
xb = np.random.randint(2, size=(nb, d), dtype=np.uint8)
xq = np.random.randint(2, size=(nq, d), dtype=np.uint8)

In [None]:
# Pack every group of 8 bits along the last axis into one byte.
# Note: xb and xq are rebound to their packed versions, so running this cell
# a second time would pack the already-packed bytes again.
xb, xq = (np.packbits(bits, axis=-1) for bits in (xb, xq))

In [None]:
# `gpu_index_flat` was not created by any cell of this notebook, so this cell
# (and the add/search cell below) failed with NameError on a fresh kernel.
# Create it here as a flat binary GPU index over d-bit vectors, built the same
# way as the entity index above, then show its dimensionality.
gpu_index_flat = faiss.GpuIndexBinaryFlat(res, faiss.IndexBinaryFlat(d))
gpu_index_flat.d

In [None]:
# Shape of the database matrix (rows x bytes once the bits have been packed).
xb.shape

In [None]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Shell escape: show the GPU status reported by the nvidia-smi tool.
!nvidia-smi

In [None]:
# Index the packed database vectors and report how many are stored.
gpu_index_flat.add(xb)
print(gpu_index_flat.ntotal)

# Retrieve the k nearest neighbours of every query, then print the neighbour
# ids of the first five and the last five queries.
k = 4
D, I = gpu_index_flat.search(xq, k)
print(I[:5], I[-5:], sep="\n")

In [None]:
# Take three rows of a 10x10 identity matrix, round-trip them through plain
# Python lists, re-assemble them into one 2-D array and print each row's shape.
a = np.eye(10)[[1, 3, 8]].tolist()
a = np.stack(list(map(np.array, a)))
for r in a:
    print(r.shape)

In [None]:
# Map a one-hot representation to an entity name.
# A NumPy array is unhashable and cannot be used as a dict key (TypeError), so
# the array's raw bytes are used as the key instead.
onehot_to_ent = {}
onehot_to_ent[a.tobytes()] = 'ciao'

In [None]:
# Raw byte representation of the first row of a 5x5 identity matrix.
b = np.identity(5)[0].tobytes()

In [None]:
# Display the bytes object produced in the previous cell.
b