In [17]:
import os

import tqdm
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [2]:
# Extract data from the pdf:
def load_pdf(data):
    """Load the PDF files found under the directory ``data``.

    A ``DirectoryLoader`` is created for ``data`` with the ``*.pdf`` glob and
    ``PyPDFLoader`` as the per-file loader class; the result of its
    ``load()`` call is returned unchanged.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
# Load the source PDFs from the local "data/" directory.
extracted_data = load_pdf(data="data/")

In [4]:
#extracted_data

In [5]:
# Create text chunks:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by the splitter. May be empty when there was
        nothing to split.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    text_chunks = text_splitter.split_documents(extracted_data)
    # Print the first chunk's attributes as a quick sanity check. Guarded
    # because an empty result (e.g. no PDFs were loaded) would otherwise make
    # the `[0]` lookup raise IndexError.
    if text_chunks:
        print(vars(text_chunks[0]))
    return text_chunks

In [6]:
# Chunk the loaded documents, then report how many chunks came back.
text_chunks = text_split(extracted_data=extracted_data)
print("Length of my chunk: ", len(text_chunks))

{'page_content': 'TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', 'metadata': {'source': 'data\\Medicine_Encyclopedia.pdf', 'page': 0}, 'type': 'Document'}
Length of my chunk:  6983


In [7]:
# Download embedding model:
def download_hf_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the Hugging Face embedding model used by this notebook.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to the MiniLM model that used to be hard-coded here, so
            existing zero-argument calls behave exactly as before.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    # NOTE(review): presumably the model weights are fetched on first use and
    # cached afterwards -- confirm against the library docs.
    return HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Build the embedding model once; the later cells (the `embed_query` check and
# the Chroma vector store) reuse this single instance.
embeddings = download_hf_embeddings()

In [9]:
#embeddings

In [10]:
# Smoke-test the embedding model: embed a short string and print the length of
# the returned result.
query_result = embeddings.embed_query("Hello world")
print("Length: ", len(query_result))

Length:  384


In [16]:
# Initializing the chromadb:
# Build the vector store only once. `Chroma.from_documents` embeds every chunk
# and adds it to the collection stored in `persist_directory`, so re-running
# this cell against an already-populated directory would re-embed everything
# and index the same chunks a second time. When the directory already holds
# data, reopen the existing store instead. Delete the directory to force a
# rebuild after the PDFs or the chunking parameters change.
# NOTE(review): this path uses "Data" while the PDFs are loaded from "data/";
# on a case-sensitive filesystem these are different directories -- confirm
# that is intended.
chroma_dir = './Data/chromadb'
if os.path.isdir(chroma_dir) and os.listdir(chroma_dir):
    chromadb = Chroma(
                persist_directory=chroma_dir,
                embedding_function=embeddings)
else:
    chromadb = Chroma.from_documents(
                documents=text_chunks,
                embedding=embeddings,
                persist_directory=chroma_dir)