In [1]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
import os, shutil

Loading PDF documents

In [2]:
# Loader pointed at the ./papers/ directory (relative to the notebook's working directory).
document_loader = PyPDFDirectoryLoader('./papers/')
# `file` holds everything the loader returns; it is passed to split_documents() in the
# splitting cell, and the chunks derived from it expose 'source' and 'page' metadata.
file = document_loader.load()

Splitting documents into chunks

In [4]:
# Chunking parameters. Lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 400

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)

# Split the loaded documents; `chunks` is what the id-assignment and
# database cells iterate over.
chunks = splitter.split_documents(file)

In [5]:
#Assigning ids to chunks from source and page+chunk number
# Id format: '<pdf file name>-<page>:<n>', where n is the 1-based position of
# the chunk within its page. The id is written to chunk.metadata['id'] and
# collected, in chunk order, in `ids`.
ids = []
curr_key = None  # (source, page) of the previously processed chunk
chunk_num = 0
for chunk in chunks:
    # Treat '\' and '/' alike so the file name is extracted correctly on any
    # OS and for PDFs that live in sub-directories of the papers folder.
    source = os.path.basename(chunk.metadata.get('source').replace('\\', '/'))
    page = chunk.metadata.get('page')
    # Restart the counter whenever the page OR the document changes; comparing
    # the page alone would continue the numbering across two consecutive
    # documents that happen to share a page number.
    if (source, page) == curr_key:
        chunk_num += 1
    else:
        curr_key = (source, page)
        chunk_num = 1
    # Named chunk_id rather than id so the builtin id() is not shadowed.
    chunk_id = f'{source}-{page}:{chunk_num}'
    chunk.metadata['id'] = chunk_id
    ids.append(chunk_id)

In [6]:
# Embedding function that is handed to the Chroma store in the database cell.
# Uses the 'nomic-embed-text' model through Ollama.
# NOTE(review): presumably needs a reachable Ollama server with this model pulled - confirm.
embeddings = OllamaEmbeddings(model='nomic-embed-text')

Creating embeddings and vector database

In [7]:
#Optional reset: uncomment the next line to delete the previous database directory and rebuild it from scratch
#shutil.rmtree('database', ignore_errors=True)

In [12]:
#Initializing database
# Chroma store persisted under ./database; texts are embedded with the
# `embeddings` object configured earlier in the notebook.
database = Chroma(
    embedding_function=embeddings,
    persist_directory='./database',
)

Adding documents to the database

In [None]:
%%capture
#Checking for existing entries and updating the database
existing = database.get()['ids']
new_chunks = []
new_ids = []
for chunk in chunks:
    curr_id = chunk.metadata.get('id')
    if curr_id not in existing:
        new_chunks.append(chunk)
        new_ids.append(curr_id)
database.add_documents(new_chunks, ids=new_ids)

In [15]:
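#Alternative: creating a database directly from documents (note: this targets the 'papers' collection, whereas the Chroma(...) call above sets no collection_name)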
#database = Chroma.from_documents(documents=chunks, collection_name='papers', persist_directory='./database', embedding=embeddings)