-
Notifications
You must be signed in to change notification settings - Fork 6
/
create_cosine_index.py
60 lines (51 loc) · 1.63 KB
/
create_cosine_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import opensearch_py_ml as oml
import pandas as pd
from config import DATA_PATH, VECTOR_NAME, VECTOR_SIZE, client
from tqdm import tqdm
# Name of the index this script creates and populates.
INDEX_NAME = "arxiv-cosine"

# Index-creation payload, assembled from the innermost piece outwards.
# k-NN method for the vector field: HNSW with cosine similarity on nmslib.
knn_method = {
    "name": "hnsw",
    "space_type": "cosinesimil",
    "engine": "nmslib",
    "parameters": {"ef_construction": 128, "m": 24},
}
# Mapping of the embedding field; its name and dimension come from config.
vector_field = {
    "type": "knn_vector",
    "dimension": VECTOR_SIZE,
    "method": knn_method,
}
# Index-level settings: enable k-NN and set the search-time ef parameter.
index_settings = {"index": {"knn": "true", "knn.algo_param.ef_search": 100}}
body = {
    "settings": index_settings,
    "mappings": {"properties": {VECTOR_NAME: vector_field}},
}

# Create the index and echo the server's response.
print(f"Creating index={INDEX_NAME}")
response = client.indices.create(INDEX_NAME, body=body)
print(response)
# Load the data file that supplies the fields to add to the index.
# Missing values are replaced by empty strings and the row index is
# renumbered from 0 (it is later used as the document id).
df = pd.read_csv(DATA_PATH)
df = df.fillna("")
df = df.reset_index(drop=True)

# Load the cache of embedding vectors; the insert loop looks entries up by
# abstract text.
# NOTE(review): despite the .jsonl extension the file is parsed as a single
# JSON document -- confirm it is not actually line-delimited.
with open("cache.jsonl", "r") as cache_file:
    cache = json.loads(cache_file.read())
# Insert each row one at a time into the document index. This is deliberately
# best-effort: a row that fails is reported and skipped so the remaining rows
# still get indexed.
# `total` is passed explicitly because iterrows() is a generator with no
# length; without it tqdm cannot show a percentage or ETA.
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row.abstract

    # The embedding is looked up by abstract text. A miss is reported by row
    # number, because the KeyError message would just be the whole abstract.
    try:
        embed = cache[text]
    except KeyError:
        print(f"[ERROR]: row {i}: no cached embedding for this abstract")
        continue

    try:
        # Named `doc` so the index-creation payload in `body` is not clobbered.
        doc = {
            VECTOR_NAME: embed,
            "text": text,
            "title": row.title,
            "arxiv_id": row.id,
            "doi": row.doi,
        }
        # The row number doubles as the document id.
        response = client.index(index=INDEX_NAME, id=i, body=doc)
    except Exception as e:
        # Broad on purpose: any failure for a single row is logged with its
        # row number and the loop moves on to the next row.
        print(f"[ERROR]: row {i}: {e}")
# Sanity-check the inserted records.
# Refresh the index first: OpenSearch makes newly indexed documents visible
# to search/count only after a refresh, so reading the shape immediately
# after the insert loop could otherwise under-report what was just written.
client.indices.refresh(index=INDEX_NAME)
oml_df = oml.DataFrame(client, INDEX_NAME)
print(f"Shape of records inserted into index {INDEX_NAME} = {oml_df.shape}")