In [4]:
# Import necessary libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain_chroma import Chroma
from PyPDF2 import PdfReader

In [5]:
# PDF documents to ingest into the vector store (paths start with './').
FILES = ['./train_files/academic-policies-and-procedures.pdf']

In [6]:
# Identifier of the HuggingFace model used to embed text chunks.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# Embedding function handed to the vector store below.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Chroma collection backed by the './data/chroma' directory.
vector_store = Chroma(
    persist_directory='./data/chroma',
    embedding_function=embeddings,
    collection_name='howard_information',
)

# Splitter configuration is the same for every file, so build it once
# instead of once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                               chunk_overlap=10)

# For each PDF: extract its text, split it into chunks, and add the chunks
# to the vector store.
for file_path in FILES:
    # Open the PDF in binary mode; the context manager closes the handle
    # even if parsing fails part-way through.
    with open(file_path, 'rb') as file:
        pdf = PdfReader(file)
        # Extraction must happen while the file is still open. Join the
        # per-page text in one pass; `or ""` guards against a page yielding
        # no text (NOTE(review): whether extract_text() can return None
        # depends on the PyPDF2 version -- confirm).
        pdf_text = "".join(page.extract_text() or "" for page in pdf.pages)

    # Split the full document text into chunked Document objects.
    documents = text_splitter.create_documents([pdf_text])

    # Nothing to index when no text could be extracted (e.g. a scanned PDF);
    # skip the call rather than hand the store an empty batch.
    # NOTE(review): presumably the store rejects an empty list -- confirm.
    if documents:
        # Embeds the chunks via the store's embedding function and saves them.
        vector_store.add_documents(documents)