In [15]:
import os

import pinecone
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

In [2]:
# Pinecone connection settings are read from the environment so that no secret
# is stored in the notebook. Export PINECONE_API_KEY (and, optionally,
# PINECONE_ENVIRONMENT) before starting the kernel.
# NOTE: any key that was ever committed in this cell should be treated as
# leaked and rotated.
PINE_CONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINE_CONE_ENV = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

if not PINE_CONE_API_KEY:
    # Warn rather than raise so this configuration cell always runs; the
    # message points at the real cause before any Pinecone call is attempted.
    print("WARNING: PINECONE_API_KEY is not set; export it before initializing Pinecone.")

#### Reading the PDF data

In [3]:
# Read the PDF files of a directory through LangChain's directory loader
def data_extractor(dir_path):
    """Load every file in ``dir_path`` that matches the ``*.pdf`` glob.

    Each matching file is handed to ``PyPDFLoader``; the directory loader is
    created with its progress display and multithreading flags switched on.

    Args:
        dir_path: Directory passed to ``DirectoryLoader`` as the place to
            look for PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    pdf_loader_options = {
        "glob": "*.pdf",              # only pdf files
        "loader_cls": PyPDFLoader,    # loader used for each matched file
        "show_progress": True,
        "use_multithreading": True,
    }
    return DirectoryLoader(dir_path, **pdf_loader_options).load()

In [4]:
# Load the PDFs found in the local data/ folder into `pdf_data`.
pdf_data = data_extractor(dir_path="data/")

100%|██████████| 1/1 [00:26<00:00, 26.02s/it]


#### Splitting corpus into text chunks

In [5]:
# Create text chunks
def text_splitter(document, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        document: The documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size forwarded to the splitter
            (presumably measured in characters, the splitter's default
            length function; confirm if a custom one is introduced).
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks forwarded to the
            splitter. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(document)

In [6]:
# Split the loaded PDF documents and report how many chunks were produced.
text_chunks = text_splitter(document=pdf_data)
print(
    'Length of text chunks : ',
    len(text_chunks),
)

Length of text chunks :  7020


#### Converting text to vectors

In [7]:
# Hugging Face identifier of the sentence-transformers model used to embed chunks.
EMBD_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Build the embedding wrapper; the model files are fetched from Hugging Face
# when they are not already available locally.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBD_MODEL_NAME,
)

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 167kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 31.3kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 1.25MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 59.7kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 11.5kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 2.17MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:05<00:00, 17.9MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 3.58kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 9.56kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.26MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<?, ?B/s] 
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 3.79MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 13.6MB/s]
modules.json: 100%|██████████| 349/349 [0

In [17]:
# Initialize the Pinecone client with the credentials from the config cell.
pinecone.init(api_key=PINE_CONE_API_KEY, environment=PINE_CONE_ENV)
index_name = "medical-chat-bot-llama-2"

# Embed every chunk and store it in the Pinecone index.
# `from_documents` uploads each chunk's metadata together with its text, so
# retrieved results keep whatever the PDF loader recorded about their origin;
# passing only `page_content` to `from_texts` would discard that metadata.
# NOTE(review): assumes the index named above already exists; confirm.
# NOTE(review): re-running this cell uploads all chunks again; confirm whether
# that duplicates vectors in the index before re-executing.
doc_search = Pinecone.from_documents(
    text_chunks,            # chunked documents (text + metadata)
    embedding_model,        # embedding model
    index_name=index_name,  # pinecone index name
)