In [None]:
%pip install langchain openai chromadb tiktoken

import os
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Take the OpenAI key from the environment so no secret is ever stored in the
# notebook. The previous unconditional `os.environ[...] = ''` assignment also
# replaced a key that was already exported with an empty string.
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    # Only report the problem here (no prompt), so headless "Run All" executions
    # still get past this cell and fail later with the library's own error.
    print(
        "OPENAI_API_KEY is not set. Export it before starting Jupyter "
        "(or set it with `%env OPENAI_API_KEY=...` in a cell you do not commit)."
    )

In [1]:
# function that initializes the vector database
def initialize_db(file_path, chunk_size=1000, chunk_overlap=0, persist_directory="./chroma_db"):
    """Build a Chroma vector store from a CSV file.

    Pipeline: load the CSV with ``CSVLoader``, pass the loaded documents
    through a ``CharacterTextSplitter``, then embed them with
    ``OpenAIEmbeddings`` and store them via ``Chroma.from_documents``.

    Args:
        file_path: Path of the CSV file handed to ``CSVLoader``.
        chunk_size: ``chunk_size`` forwarded to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``CharacterTextSplitter``.
        persist_directory: Directory forwarded to ``Chroma.from_documents``.

    Returns:
        The object returned by ``Chroma.from_documents``.

    NOTE(review): calling this repeatedly with the same ``persist_directory``
    presumably adds the documents to the existing collection again - confirm
    against the installed Chroma/LangChain versions.
    """
    # load: read the source CSV into documents
    source_documents = CSVLoader(file_path=file_path).load()

    # transform: split the documents with the requested chunk settings
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(source_documents)

    # embed + store: the embedding model is created only after splitting succeeded
    return Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )

In [None]:
# function that retrieves relevant documents based on the query from the database
def retrieve(query, db, k=20):
    """Return the text of the documents retrieved for ``query`` as one string.

    Args:
        query: Query passed to the retriever's ``get_relevant_documents``.
        db: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``,
            e.g. the store returned by ``initialize_db``.
        k: Value sent to the retriever as ``search_kwargs["k"]``.

    Returns:
        The ``page_content`` of every retrieved document, joined with
        newlines in the order the retriever returned them (an empty string
        when nothing is returned).
    """
    # similarity search configured with the requested k
    search_options = {"k": k}
    matches = db.as_retriever(
        search_type="similarity",
        search_kwargs=search_options,
    ).get_relevant_documents(query)

    # flatten the hits into a single newline-separated string
    return '\n'.join(match.page_content for match in matches)