## DB Initialization

In this notebook we design the process to initialize the vector DB. Since we are designing this process with a future cloud deployment in mind, we start by uploading the files to cloud storage and then index the files within a directory.

In [1]:
import os
import sys
import json
import hashlib
import tempfile

from tqdm import tqdm
from dotenv import load_dotenv

# Parent of the notebook's working directory; it is added to sys.path so the
# `src` package imported below can be resolved.
project_path = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not add the same entry again.
if project_path not in sys.path:
    sys.path.append(project_path)

from src.preprocess import  extract_text_from_pdf, get_sequential_semantic_chunks
from src.storage import StorageManager
from src.db import ElasticsearchManager

# Values from the .env file take precedence over variables already set in the process.
load_dotenv(override=True)


def _require_env(name: str) -> str:
    """Returns the value of the environment variable ``name``.

    Args:
        name (str): Name of the environment variable.

    Raises:
        RuntimeError: If the variable is not set.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


def _csv_env(name: str) -> list[str]:
    """Returns the comma-separated variable ``name`` as a list of names.

    Surrounding whitespace is stripped and empty entries are dropped, so
    ``"title, text,"`` yields ``['title', 'text']``.
    """
    return [item.strip() for item in _require_env(name).split(',') if item.strip()]


# These stay None when unset, exactly as os.getenv returns them.
GCP_PROJECT_ID = os.getenv('GCP_PROJECT_ID')
BUCKET = os.getenv('BUCKET')
INDEX_NAME = os.getenv('INDEX_NAME')
ELASTICSEARCH_HOST = os.getenv('ELASTICSEARCH_HOST')
ELASTICSEARCH_PORT = os.getenv('ELASTICSEARCH_PORT')
EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')

# These are parsed right away, so a missing value must fail with a readable
# message instead of an AttributeError/TypeError raised on None.
TEXT_FIELDS = _csv_env('TEXT_FIELDS')
KEYWORD_FIELDS = _csv_env('KEYWORD_FIELDS')
WORKERS = int(_require_env('WORKERS'))

  from tqdm.autonotebook import tqdm, trange


We begin by uploading the docs to the bucket.

In [2]:
# Storage manager built from the GCP project id and bucket name read from the environment.
storage_manager = StorageManager(GCP_PROJECT_ID, BUCKET)



In [3]:
# Local `docs` directory at the project root.
docs_path = os.path.join(project_path, 'docs')
# Every entry directly under `docs` is used as a document category by the next cell.
doc_categories = os.listdir(docs_path)

In [4]:
# First pass: hand each category directory to storage_manager.upload_dir with
# the destination `docs/<category>`.
for category in doc_categories:
    category_path = os.path.join(docs_path, category)
    storage_manager.upload_dir(category_path, f'docs/{category}')

Let's clear the uploaded docs and build a function to upload the documents:

In [5]:
# Clear the "docs" directory before re-uploading through upload_documents.
# NOTE(review): presumably removes every object under the docs/ prefix — confirm in StorageManager.
storage_manager.delete_dir("docs")

In [6]:
def upload_documents(docs_path:str, storage_manager:StorageManager) -> None:
    """Uploads every category directory under ``docs_path`` to the GCP bucket.

    Each immediate sub-directory of ``docs_path`` is treated as a category and
    passed to ``storage_manager.upload_dir`` with the destination
    ``docs/<category>``. Entries that are not directories (for example a stray
    ``.DS_Store`` file) are ignored.

    Args:
        docs_path (str): Path to the local documents directory.
        storage_manager (StorageManager): Storage manager object.
    """
    # Sorted so the upload order does not depend on the file system's listing order.
    for category in sorted(os.listdir(docs_path)):
        category_path = os.path.join(docs_path, category)
        if not os.path.isdir(category_path):
            # A loose file is not a category; skip it instead of passing it to upload_dir.
            continue
        storage_manager.upload_dir(category_path, f'docs/{category}')

In [7]:
# Upload the local docs again, this time through the reusable function.
upload_documents(docs_path, storage_manager)

Now, let's index our documents:

In [6]:
# Elasticsearch manager built from the host, port and embedding model name read from the environment.
elasticsearch_manager = ElasticsearchManager(ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, EMBEDDING_MODEL)

In [7]:
# We create an index first, so the documents indexed later have a destination.
# The index name and the text/keyword field lists come from the environment configuration.
elasticsearch_manager.create_index(
    index_name=INDEX_NAME,
    text_fields=TEXT_FIELDS,
    keyword_fields=KEYWORD_FIELDS
)

Index documentor_project created.


In [8]:
def index_document_from_blob(
    blob, 
    doc_index:int, 
    elasticsearch_manager:ElasticsearchManager,
    index_name:str
) -> None:
    """Downloads a PDF blob, splits its text into chunks and indexes them.

    Args:
        blob (Blob): GCP bucket blob whose name has the form
            ``<prefix>/<category>/<paper>``.
        doc_index (int): Position of the document in the listing. It is part
            of the hashed document ID, so the ID changes if the position does.
        elasticsearch_manager (ElasticsearchManager): Elasticsearch manager object.
        index_name (str): Name of the index that receives the chunks.

    Raises:
        ValueError: If the blob name does not consist of exactly three
            ``/``-separated parts.
    """
    name_parts = blob.name.split('/')
    if len(name_parts) != 3:
        raise ValueError(
            f"Expected a blob name like '<prefix>/<category>/<paper>', got {blob.name!r}"
        )
    _, category, paper = name_parts

    doc_id = hashlib.sha256(
        f'{category}-{paper}-{doc_index}'.encode('utf-8')
    ).hexdigest()

    # Download into a private temporary directory: nothing is shared between
    # calls and the PDF is removed as soon as its text has been extracted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, 'paper.pdf')
        blob.download_to_filename(pdf_path)
        pdf_text = extract_text_from_pdf(pdf_path)

    doc_chunks = get_sequential_semantic_chunks(pdf_text, doc_id, WORKERS)

    # Double-quoted f-string so the nested ['chunk'] lookup also parses on
    # Python versions older than 3.12.
    docs = [
        {
            'id': f"{doc_id}-{doc_chunk['chunk']}",
            'category': category,
            'paper': paper,
            'text': doc_chunk['text']
        }
        for doc_chunk in doc_chunks
    ]

    elasticsearch_manager.index_documents(
        docs=docs,
        index_name=index_name
    )

In [7]:
# Index every blob listed under the 'docs' prefix. tqdm wraps the blob iterator
# itself (inside enumerate), as the tqdm documentation recommends, and
# enumerate(start=1) yields the 1-based document index directly.
blobs = storage_manager.bucket.list_blobs(prefix='docs')
for doc_index, blob in enumerate(tqdm(blobs, desc='Indexing documents'), start=1):
    index_document_from_blob(
        blob=blob,
        doc_index=doc_index,
        elasticsearch_manager=elasticsearch_manager,
        index_name=INDEX_NAME
    )
    

0it [00:00, ?it/s]

In [10]:
from elasticsearch import Elasticsearch

# Raw Elasticsearch client for the same host and port that were passed to ElasticsearchManager.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
es_client = Elasticsearch(f"http://{ELASTICSEARCH_HOST}:{ELASTICSEARCH_PORT}")                       

In [11]:
from sentence_transformers import SentenceTransformer

# Load the model named by EMBEDDING_MODEL; it is used below to encode the search query.
emb_model = SentenceTransformer(EMBEDDING_MODEL)



In [13]:
# Smoke-test the index with a hybrid search that receives both the query text and its embedding.
elasticsearch_manager.hybrid_search(
    index_name=INDEX_NAME,
    query="machine learning",
    field_names=TEXT_FIELDS,
    # The same query string is encoded, so the text and the vector describe the same query.
    vector=emb_model.encode("machine learning"),
)

[{'paper': 'metra_scalable_unsupervised_rl_with_metric_aware_abstraction.pdf',
  'id': 'bf9d5038e3b94a5e4df68bd3140008123eb773a52547f06b71520d78e2029474-22',
  'text': 'Count-based explo-\nration with neural density models.\nIn International Conference on Machine Learning (ICML) ,\n2017.\nSherjil Ozair, Corey Lynch, Yoshua Bengio, A ¨aron van den Oord, Sergey Levine, and Pierre Ser-\nmanet.\nWasserstein dependency measure for representation learning.\nIn Neural Information\nProcessing Systems (NeurIPS) , 2019.\nSeohong Park and Sergey Levine.\nPredictable mdp abstraction for unsupervised model-based rl.\nIn\nInternational Conference on Machine Learning (ICML) , 2023.\nSeohong Park, Jongwook Choi, Jaekyeom Kim, Honglak Lee, and Gunhee Kim.\nLipschitz-\nconstrained unsupervised skill discovery.\nIn International Conference on Learning Represen-\ntations (ICLR) , 2022.\n13Published as a conference paper at ICLR 2024\nSeohong Park, Dibya Ghosh, Benjamin Eysenbach, and Sergey Levine.\nHiql: