In [39]:
import configparser
import os
import uuid
import cloudpickle
import faiss

import chromadb
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.chains.query_constructor.base import load_query_constructor_chain
from langchain.chains import ConversationalRetrievalChain, RetrievalQA

import pandas as pd 

In [2]:
# Root directory for every persisted vector-store artifact in this notebook:
# the FAISS index file, the pickled docstore / id-map, the per-chunk text
# files and the Chroma collection are all written underneath it.
DB_PATH = "../data/embeddings/"

In [3]:
def get_openai_key():
    """Return the OpenAI API key used by this notebook.

    The key is read from the ``OpenAI_API_Key`` option in the ``[DEFAULT]``
    section of ``config.ini`` (looked up in the current working directory).
    If the file or the option is missing, or the option is blank, the
    ``OPENAI_API_KEY`` environment variable is used instead.

    Returns:
        str: The API key with surrounding whitespace removed.

    Raises:
        KeyError: If neither ``config.ini`` nor the environment provides a key.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open, so a missing
    # config.ini surfaces as an absent option rather than as an I/O error.
    config.read('config.ini')

    api_key = config['DEFAULT'].get('OpenAI_API_Key', fallback='').strip()
    if not api_key:
        api_key = os.environ.get('OPENAI_API_KEY', '').strip()
    if not api_key:
        raise KeyError(
            "OpenAI API key not found: set OpenAI_API_Key under [DEFAULT] in "
            "config.ini or export OPENAI_API_KEY"
        )
    return api_key

def initialize_openai_embeddings():
    """Build the embedding object handed to the vector stores.

    Returns:
        Embeddings: A ``CustomEmbeddings`` instance that forwards
        ``embed_documents`` and ``embed_query`` to an ``OpenAIEmbeddings``
        client created with the key from ``get_openai_key()`` and the
        ``text-embedding-3-large`` model.
    """

    class CustomEmbeddings(Embeddings):
        """Pass-through ``Embeddings`` implementation around an OpenAI client."""

        openai_embeddings: OpenAIEmbeddings  # the wrapped client

        def __init__(self, openai_embeddings):
            self.openai_embeddings = openai_embeddings

        def embed_documents(self, texts):
            """Delegate to the wrapped client's ``embed_documents``."""
            return self.openai_embeddings.embed_documents(texts)

        def embed_query(self, text):
            """Delegate to the wrapped client's ``embed_query``."""
            return self.openai_embeddings.embed_query(text)

    client = OpenAIEmbeddings(api_key=get_openai_key(), model="text-embedding-3-large")
    return CustomEmbeddings(client)

def save_database_components(db, DB_PATH, embedding_dim=3072):
    """Write the FAISS-style components of ``db`` to files under ``DB_PATH``.

    Three files are produced: ``faiss.index`` (via ``faiss.write_index``),
    ``docstore.pkl`` and ``index_to_docstore_id.pkl`` (both via cloudpickle).
    For any of the attributes ``index``, ``docstore`` or
    ``index_to_docstore_id`` that ``db`` does not provide, an empty
    placeholder is written instead. ``db`` itself is not modified.

    Args:
        db: Vector-store object whose components should be saved.
        DB_PATH: Target directory; created if it does not exist.
        embedding_dim: Dimension of the placeholder ``IndexFlatL2`` created
            when ``db`` has no ``index``. Defaults to 3072, the default output
            size of text-embedding-3-large, and matches the dimension used by
            ``load_vector_db`` for a fresh index.
    """
    os.makedirs(DB_PATH, exist_ok=True)

    # Use local fallbacks rather than attaching attributes to the caller's
    # object: a save routine should not change the store it is saving.
    index = getattr(db, 'index', None)
    if index is None:
        index = faiss.IndexFlatL2(embedding_dim)
    faiss.write_index(index, os.path.join(DB_PATH, "faiss.index"))

    docstore = getattr(db, 'docstore', None)
    if docstore is None:
        docstore = {}
    with open(os.path.join(DB_PATH, "docstore.pkl"), "wb") as f:
        cloudpickle.dump(docstore, f)

    index_to_docstore_id = getattr(db, 'index_to_docstore_id', None)
    if index_to_docstore_id is None:
        index_to_docstore_id = {}
    with open(os.path.join(DB_PATH, "index_to_docstore_id.pkl"), "wb") as f:
        cloudpickle.dump(index_to_docstore_id, f)

    print("Database components saved successfully.")


def convert_to_list(doc, field):
    """Replace a comma-separated metadata string with a list of its items.

    The conversion happens in place on ``doc.metadata[field]``. Each item is
    stripped of surrounding whitespace and empty items are dropped, so an
    empty or blank string becomes ``[]`` and stray commas (``"a,,b,"``) do not
    produce ``''`` entries.

    Args:
        doc: Object exposing a ``metadata`` mapping.
        field: Metadata key to convert.

    Returns:
        None. Nothing is changed when ``field`` is absent or its value is not
        a string.
    """
    if field not in doc.metadata:
        return
    value = doc.metadata[field]
    if not isinstance(value, str):
        return

    stripped_items = (item.strip() for item in value.split(','))
    doc.metadata[field] = [item for item in stripped_items if item]

In [4]:
# Read the probe catalogue and replace every missing value with an empty
# string, then preview the first rows.
probes_df = pd.read_csv('../data/probes.csv').fillna("")
probes_df.head()

Unnamed: 0,Probe_ID,Probe_Name,Manufacturer,Compatible_Systems,Probe_Type,Frequency_Range,Stock,Description
0,1,C3,ATL,HDI 5000,Curved Array,,0,The ATL C3 is a convex curved array ultrasound...
1,2,C4-2,ATL,HDI 5000,Curved Array,2-4 MHz,2,The ATL C4-2 is a convex ultrasound transducer...
2,3,C5-2,ATL,"HDI 1500, HDI 3000, HDI 3500, HDI 5000",Curved Array,2-5 MHz,7,The ATL C5-2 Curved Array transducer is a vers...
3,4,C5-IVT,ATL,"UM9 HDI, HDI 1500, HDI 3000, HDI 3500, HDI 5000",Intracavitary,,0,The ATL C5-IVT curved linear ultrasound transd...
4,5,C7-4,ATL,"UM9 HDI, HDI 1500, HDI 3000, HDI 5000",Curved Array,4-7 MHz,0,The ATL C7-4 curved linear ultrasound transduc...


In [7]:
# Embedding function shared by the vector stores in this notebook.
embeddings = initialize_openai_embeddings()

# Make sure the artifact directory exists. DB_PATH comes from the configuration
# cell near the top of the notebook; it is deliberately not re-declared here so
# there is a single place to change it.
os.makedirs(DB_PATH, exist_ok=True)

# Chroma collection persisted in a sub-directory of DB_PATH.
db = Chroma(
    persist_directory=os.path.join(DB_PATH, "chromaDB"),
    embedding_function=embeddings,
    collection_name='my_collection',
)

def populate_vector_db(probes_df, db, DB_PATH):
    """Chunk each probe description and add the chunks to ``db``.

    For every row with a non-empty ``Description`` the text is split into
    chunks of up to 256 characters. Each chunk is written to its own
    ``<DB_PATH>/chunks/<uuid>.txt`` file and added to ``db`` together with the
    row's metadata. Rows without a description are skipped.

    Args:
        probes_df: DataFrame with the columns ``Probe_ID``, ``Probe_Name``,
            ``Manufacturer``, ``Compatible_Systems``, ``Probe_Type``,
            ``Frequency_Range``, ``Stock`` and ``Description``.
        db: Vector store exposing ``add_texts(texts, metadatas)``.
        DB_PATH: Directory under which the ``chunks`` folder is created.

    Returns:
        The same ``db`` object that was passed in.
    """
    # Loop-invariant setup: one splitter and one directory for all rows.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256)
    # os.path.join works whether or not DB_PATH ends with a separator.
    chunks_dir = os.path.join(DB_PATH, "chunks")
    os.makedirs(chunks_dir, exist_ok=True)

    for _, row in probes_df.iterrows():
        # Ensure page_content is a string; NaN becomes an empty string.
        page_content = str(row['Description']) if pd.notna(row['Description']) else ""
        if not page_content:
            continue

        # Row-level metadata attached to every chunk, used for filtering.
        metadata = {
            "Probe_ID": row['Probe_ID'],
            "Probe_Name": row['Probe_Name'],
            "Manufacturer": row['Manufacturer'],
            # Already a comma-separated string in the CSV; stored unchanged.
            "Compatible_Systems": row['Compatible_Systems'],
            "Probe_Type": row['Probe_Type'],
            "Frequency_Range": row['Frequency_Range'],
            "Stock": row['Stock']
        }

        texts = []
        metadatas = []
        for chunk in text_splitter.split_text(page_content):
            chunk_id = str(uuid.uuid4())
            chunk_file_path = os.path.join(chunks_dir, f"{chunk_id}.txt")
            # Explicit encoding so the files do not depend on the platform locale.
            with open(chunk_file_path, "w", encoding="utf-8") as file:
                file.write(chunk)

            texts.append(chunk)
            metadatas.append({
                'id': chunk_id,
                # NOTE(review): despite the key name this holds the full
                # description text, not a path; key kept for compatibility.
                'probe_description_path': page_content,
                'chunk_file_path': chunk_file_path,
                **metadata
            })

        # The splitter may return no chunks (e.g. whitespace-only text);
        # avoid calling the store with empty lists.
        if texts:
            db.add_texts(texts, metadatas)

    return db

# Embed every probe description into the store, then write the FAISS-style
# component files under DB_PATH.
# NOTE(review): `db` is the Chroma store created earlier, while
# save_database_components looks for `index` / `docstore` /
# `index_to_docstore_id` attributes and writes empty placeholders when they are
# absent - confirm the saved files actually contain the embedded vectors.
db = populate_vector_db(probes_df=probes_df, db=db, DB_PATH=DB_PATH)
save_database_components(db=db, DB_PATH=DB_PATH)

Database components saved successfully.


In [19]:
def load_vector_db(DB_PATH="../data/embeddings/", embedding_dim=3072):
    """Build a FAISS vector store from the files saved under ``DB_PATH``.

    If ``faiss.index``, ``docstore.pkl`` and ``index_to_docstore_id.pkl`` are
    all present they are loaded; if none of them is present an empty store
    with a new ``IndexFlatL2`` is created.

    Args:
        DB_PATH: Directory holding the saved components.
        embedding_dim: Dimension of the index created when nothing has been
            saved yet. Defaults to 3072, the default output size of
            text-embedding-3-large.

    Returns:
        FAISS: Store wired to the embeddings from
        ``initialize_openai_embeddings()``.

    Raises:
        FileNotFoundError: If only some of the three component files exist,
            since the remaining ones cannot be reconstructed.
    """
    index_path = os.path.join(DB_PATH, "faiss.index")
    docstore_path = os.path.join(DB_PATH, "docstore.pkl")
    id_map_path = os.path.join(DB_PATH, "index_to_docstore_id.pkl")
    component_paths = (index_path, docstore_path, id_map_path)
    missing = [path for path in component_paths if not os.path.isfile(path)]

    if not missing:
        # SECURITY: unpickling can execute arbitrary code; only load component
        # files that this notebook (or another trusted source) produced.
        with open(docstore_path, "rb") as f:
            docstore = cloudpickle.load(f)
        with open(id_map_path, "rb") as f:
            index_to_docstore_id = cloudpickle.load(f)
        index = faiss.read_index(index_path)
    elif len(missing) == len(component_paths):
        # Nothing saved yet: start from an empty store.
        index = faiss.IndexFlatL2(embedding_dim)
        docstore = None
        index_to_docstore_id = None
    else:
        raise FileNotFoundError(
            f"Incomplete vector database in {DB_PATH!r}; missing: {', '.join(missing)}"
        )

    # A plain dict (or nothing) is wrapped; an already-built docstore object
    # is passed through so it is not wrapped a second time.
    if docstore is None or isinstance(docstore, dict):
        docstore = InMemoryDocstore(docstore)
    # The id map must be a real mapping, never None, when handed to FAISS.
    if index_to_docstore_id is None:
        index_to_docstore_id = {}

    # Load the OpenAI embeddings
    embeddings = initialize_openai_embeddings()

    return FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embeddings
    )

# Rebuild the FAISS store from the component files saved under DB_PATH.
vectorstore = load_vector_db(DB_PATH=DB_PATH)

