In [1]:
import os
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
import sys
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np 

sys.path.append('..')

from main import VectorDatabase, BiEncoder, SimpleSentenceChunker

  from tqdm.autonotebook import tqdm


## Initialize Database

In [2]:
# Read the Pinecone key from the environment so the secret never lives in the notebook.
# NOTE(review): a key that was ever committed in a notebook should be treated as leaked and rotated.
API_KEY = os.environ.get("PINECONE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this notebook."
    )

# Connect and list the indexes visible to this key.
pc = Pinecone(api_key=API_KEY)
indexes = pc.list_indexes()
print(indexes)

[{
    "name": "test",
    "dimension": 312,
    "metric": "dotproduct",
    "host": "test-im26dq4.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}]


In [3]:
# Settings for the Pinecone index used throughout this notebook.
INDEX_NAME = 'test'
DIMENSION = 312  # matches the 312-dim `test` index listed above and the `..._312D` model name used below
CLOUD = 'aws'
# The existing `test` index is reported in us-east-1 by `pc.list_indexes()`; keep the config in
# agreement with it. NOTE(review): 'us-west-1' does not appear to be an AWS serverless region
# offered by Pinecone — confirm against the Pinecone region list.
REGION = 'us-east-1'

In [4]:
# Build the project's VectorDatabase wrapper, then call start_db with the index settings
# defined above; the returned object is used as `handler` by the cells that follow.
vector_db = VectorDatabase(api_key=API_KEY)
handler = vector_db.start_db(
    index_name=INDEX_NAME,
    dimension=DIMENSION,
    cloud=CLOUD,
    region=REGION,
)

In [5]:
# NOTE(review): presumably removes every vector from the index before re-ingesting —
# destructive; confirm what delete_all() does in `main` before running against shared data.
handler.delete_all()

## Read Data

In [None]:
# Load the FinDER queries and corpus; the first CSV column is used as the index.
# Path components are passed separately so no separator is hardcoded in a string.
data_folder_path = os.path.join('..', 'data')
finder_folder_path = os.path.join(data_folder_path, 'FinDER')
query_df = pd.read_csv(os.path.join(finder_folder_path, 'queries.csv'), index_col=0)
document_df = pd.read_csv(os.path.join(finder_folder_path, 'corpus.csv'), index_col=0)

# Preview only the first rows instead of rendering the whole corpus.
document_df.head()

## Encode and Save to DB

In [6]:
# The tokenizer and the model weights are loaded from the same checkpoint name.
checkpoint = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [7]:
# Wrap the tokenizer and model in the project's BiEncoder (imported from `main`);
# later cells call its encode / encode_batch methods.
encoder = BiEncoder(tokenizer, model)

In [8]:
# Replace missing document text with empty strings, store that back on the frame,
# and keep a plain list of strings for encoding.
filled_text = document_df["text"].fillna("")
document_df["text"] = filled_text
texts = filled_text.astype(str).tolist()

In [None]:
# Number of documents encoded and upserted per request.
# `texts` is already prepared by the preceding cell, so it is not rebuilt here.
batch_size = 10

def batch_upsert(texts, batch_size, ids=None, text_encoder=None, index=None):
    """Encode `texts` in batches and upsert the resulting vectors.

    Args:
        texts: Sequence of strings to encode; must support slicing and `len`.
        batch_size: Positive number of texts encoded and upserted per request.
        ids: Sequence of identifiers aligned one-to-one with `texts`. Defaults to
            the notebook-level `document_df.index`.
        text_encoder: Object exposing `encode_batch(list_of_texts)`. Defaults to
            the notebook-level `encoder`.
        index: Object exposing `upsert(vectors=...)`. Defaults to the
            notebook-level `handler.index`.

    Raises:
        ValueError: If `batch_size` is not positive, or if `ids` and `texts`
            differ in length (which would otherwise silently drop or misalign rows).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Fall back to the notebook globals only when the caller did not supply a value.
    ids = document_df.index if ids is None else ids
    text_encoder = encoder if text_encoder is None else text_encoder
    index = handler.index if index is None else index

    if len(ids) != len(texts):
        raise ValueError(
            f"ids and texts must have the same length (got {len(ids)} and {len(texts)})"
        )

    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        stop = start + batch_size
        batch_texts = texts[start:stop]
        batch_ids = ids[start:stop]

        # Encode the current batch
        embeddings = text_encoder.encode_batch(batch_texts)

        # Pair each id (as a string) with its embedding converted via .tolist()
        vectors = [
            (str(doc_id), embedding.tolist())
            for doc_id, embedding in zip(batch_ids, embeddings)
        ]

        # Perform batch upsert
        index.upsert(vectors=vectors)
        print(f"Upserted batch {batch_number}")

# Encode and upsert the whole corpus using the batch size configured above.
batch_upsert(texts=texts, batch_size=batch_size)

In [16]:
# Take the text of the first query row as the test query.
test_query = query_df["text"].iat[0]

In [17]:
# Encode straight from the source text rather than from `test_query` itself, so re-running
# this cell never feeds an already-encoded vector back into the encoder.
test_query = encoder.encode(query_df["text"].iloc[0])

In [None]:
# Display the encoded query as a list for inspection; nothing is assigned here.
test_query.tolist()

In [None]:
# Cast the encoded query to a float32 NumPy array, then to a plain list for the query call.
test_query = np.array(test_query, dtype=np.float32)
test_query_list = test_query.tolist()

# Query through the handler; top_k=5 requests five results
# (presumably the best-scoring matches — confirm in `main`).
results = handler.query_vector(
    test_query_list,
    top_k=5,
)

# Print the id and score of each returned match.
for result in results:
    print(f"Document ID: {result['id']}, Score: {result['score']}")