# Dependencies

In [92]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import OpenAIEmbeddings 
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate 
from dotenv import load_dotenv
load_dotenv()
import os
import time

In [59]:
# Credentials come from the process environment (load_dotenv() runs in the
# imports cell); a variable that is not set yields None.
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV = (
    os.environ.get(variable)
    for variable in ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_API_ENV")
)

# Loading and chunking the PDF Data
Here, we will use the `PyPDFDirectoryLoader` from LangChain to load the PDF data, which we will chunk in the next step. The first step is to point the loader at the directory containing the PDF and convert it into a list of documents. Each document corresponds to a single page of the PDF and carries metadata holding the source file path and the page number.

In [60]:
# Point the loader at the local "data" directory that holds the PDF.
loader = PyPDFDirectoryLoader("data")
# load() returns a list of Document objects; the output below shows one per page,
# each with `source` and `page` metadata.
data = loader.load()
# Bare last expression so the notebook renders the loaded documents.
data

[Document(page_content='Project\nReport\nSPG\nGroup\nProject\nJan\n2024\nCommunication\nand\nListening\nSkills\nTeam\n5\nTeam\nMembers\nHari\nPrapan\n(21f3002087)\nPragya\nSingh\n(21f3001204)\nVisist\nTallam\n(21f2001553)\nUllas\nKumar\n(21f3002619)\nDhruv\nPamneja\n(21f1001719)\n', metadata={'source': 'data/Team_5_SPG_GP2_Report_Jan_2024.pdf', 'page': 0}),
 Document(page_content="Abstract\nThe\nproject\nfocuses\non\nexploring\nthe\ncritical\nimportance\nof\ncommunication\nand\nlistening\nskills\nwithin\nthe\ncorporate\nworld.\nIt\ndelves\ninto\nthe\nfoundational\nrole\nthese\nskills\nplay\nin\norganizational\nsuccess,\nimpacting\nareas\nsuch\nas\nemployee\nengagement,\nteam\ncollaboration,\nand\noverall\nproductivity.\nThe\ndecision\nto\ninvestigate\nthis\ntopic\nstems\nfrom\nits\nrelevance\nin\ntoday's\nfast-paced\nbusiness\nlandscape,\nwhere\neffective\ncommunication\nis\nvital\nfor\nnavigating\ncomplex\nenvironments\nand\nachieving\norganizational\ngoals.\nBy\nconducting\nsemi-stru

Now, since the context window of our LLM is limited, the ideal way to handle this is to split the data into smaller chunks. This is done by the text splitter, which takes the list of documents and returns a list of chunks within the size limit we set (500 characters in this case). Because every document is a single page, each page is split on its own, so a chunk never spans two pages. We also introduce an overlap, which is the number of characters repeated at the end of one chunk and the beginning of the next (20 here). This is done to ensure that the context is not lost between the chunks.

In [61]:
# chunk_size and chunk_overlap are measured in characters, not words.
text_split = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
# The resulting chunks keep the source/page metadata of the page they came from
# (see the output below).
text_chunks = text_split.split_documents(data)


# Preview only the first three chunks to keep the cell output short.
text_chunks[:3]

[Document(page_content='Project\nReport\nSPG\nGroup\nProject\nJan\n2024\nCommunication\nand\nListening\nSkills\nTeam\n5\nTeam\nMembers\nHari\nPrapan\n(21f3002087)\nPragya\nSingh\n(21f3001204)\nVisist\nTallam\n(21f2001553)\nUllas\nKumar\n(21f3002619)\nDhruv\nPamneja\n(21f1001719)', metadata={'source': 'data/Team_5_SPG_GP2_Report_Jan_2024.pdf', 'page': 0}),
 Document(page_content="Abstract\nThe\nproject\nfocuses\non\nexploring\nthe\ncritical\nimportance\nof\ncommunication\nand\nlistening\nskills\nwithin\nthe\ncorporate\nworld.\nIt\ndelves\ninto\nthe\nfoundational\nrole\nthese\nskills\nplay\nin\norganizational\nsuccess,\nimpacting\nareas\nsuch\nas\nemployee\nengagement,\nteam\ncollaboration,\nand\noverall\nproductivity.\nThe\ndecision\nto\ninvestigate\nthis\ntopic\nstems\nfrom\nits\nrelevance\nin\ntoday's\nfast-paced\nbusiness\nlandscape,\nwhere\neffective\ncommunication\nis\nvital\nfor\nnavigating\ncomplex\nenvironments\nand\nachieving", metadata={'source': 'data/Team_5_SPG_GP2_Report_Ja

As we can see, each of our chunks is within the limit of 500 characters, with an overlap of up to 20 characters. Let us view the total number of chunks.

In [62]:
# Report how many chunks the splitter produced in total.
print(f"Length of chunks : {len(text_chunks)}")

Length of chunks : 37


# Pinecone Initialization
Now, we will use the Pinecone vector DB to store the embeddings of the chunks. We create a client with the `Pinecone(...)` constructor, which authenticates with our API key. Later on, we will use `pc.Index()` to get a handle on the index created for this project.

In [50]:
# Create the Pinecone client used by every index operation later in the notebook.
# NOTE(review): the index listed below is serverless; the `environment` argument
# looks like a leftover from the older pinecone.init() API and may be ignored by
# this client — confirm against the installed pinecone version.
pc = Pinecone(api_key = PINECONE_API_KEY, environment = PINECONE_API_ENV)

Now, let us view the indexes available in the Pinecone environment.

In [52]:
# Show the indexes visible to this API key (name, dimension, metric, spec, status).
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'documents-f1hj4e6.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'documents',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

So this is the one we will be using for storing the documents of this project.

# Embedding the Chunks using OpenAI text-embedding-3-small
Here, we will be using the OpenAI text-embedding-3-small model to embed the chunks, for which we will need an openAI instance initialised.

In [63]:
# OpenAI client used for the embedding requests made later in the notebook.
openAI_client = OpenAI(api_key=OPENAI_API_KEY)

Let us go ahead and set the embeddings model and a function to get the embeddings of any given text via the text-embedding-3-small model.

In [68]:
# Shortcut to the client's embeddings endpoint; get_embedding() calls .create() on it.
embedding_model = openAI_client.embeddings

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to use. Defaults to
            "text-embedding-3-small". The "documents" index listed earlier in
            this notebook has dimension 1536, so any other model must produce
            vectors of that size to be stored in or queried against it.

    Returns:
        The embedding taken from the first entry of the response
        (``response.data[0].embedding``).
    """
    response = embedding_model.create(input=text, model=model)
    return response.data[0].embedding

Now, each chunk will ideally belong to a document stored in an S3 bucket in AWS. So, for now we will assume that the **KEY** of that document is in the format given below, which we will use in the metadata. To simplify the process, each entry in the vector DB should have:

* **ID** : The unique ID of the document, which will be a combination of the document key and the chunk number.
* **VALUES** : The embedding of the chunk, as generated by the OpenAI text-embedding-3-small model.
* **METADATA** : The metadata of the document, which will include the document key and the chunk number.
    *  **USER_ID** : The ID of the user in our system.
    *  **DOCUMENT_TYPE** : The type of the document from either "pdf" or "text".
    *  **KEY** : The key of the document in the S3 bucket, which defines the location of the document.
    *  **CHUNK** : The text of the chunk.
    *  **PAGE_NUMBER** : The page number of the chunk in the document.
    *  **CHUNK_INDEX** : The number of the chunk in the document.

Since most of the information will be given by the backend server, for now we will use dummy values for the metadata.

In [94]:
# Dummy identifiers; in the real system the backend server supplies these.
USER_ID = "61f100abx"
DOCUMENT_TYPE = "pdf"
# The dummy storage key is assembled as "<user id>/<document type>/<document name>".
KEY = f"{USER_ID}/{DOCUMENT_TYPE}/projectReport"

Let us now create the function to create vectors in our desired format as defined above.

In [99]:
def create_vectors(text_chunks,KEY,USER_ID,DOCUMENT_TYPE):
    """Build one record per chunk in the id/values/metadata layout described above.

    Args:
        text_chunks: Iterable of chunk objects exposing ``page_content`` and a
            ``metadata`` mapping with a ``"page"`` entry.
        KEY: Storage key of the source document; used in every record id and
            stored in the metadata.
        USER_ID: Owner of the document, stored in the metadata.
        DOCUMENT_TYPE: Document type label, stored in the metadata.

    Returns:
        A list of dicts with the keys ``"id"``, ``"values"`` and ``"metadata"``,
        in the same order as ``text_chunks``.
    """
    records = []

    # The chunk counter runs across the whole input; it is not reset per page.
    for position, document in enumerate(text_chunks):
        page = document.metadata["page"]
        records.append({
            "id": f"{KEY}_PAGE_{page}_CHUNK_{position}",
            # get_embedding() is called once per chunk.
            "values": get_embedding(document.page_content),
            "metadata": {
                "userID": USER_ID,
                "type": DOCUMENT_TYPE,
                "key": KEY,
                "chunk": document.page_content,
                "page_number": page,
                "chunk_number": position,
            },
        })

    return records

In [100]:
# Embed every chunk (one get_embedding call per chunk) and build the records to upsert.
vectors = create_vectors(text_chunks,KEY,USER_ID,DOCUMENT_TYPE)

With this, we have our vectors stored in the ideal format to be pushed into the vector DB. Let us now push the vectors into the vectorDB of pinecone.

# Pushing the Vectors into the Pinecone Index

In [101]:
index_name = "documents"

# Poll until the index reports ready, but give up after a bounded time instead
# of spinning forever if it never does.
READY_TIMEOUT_SECONDS = 60
ready_deadline = time.monotonic() + READY_TIMEOUT_SECONDS
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {READY_TIMEOUT_SECONDS} seconds"
        )
    time.sleep(1)

index = pc.Index(index_name)

# Last expression of the cell, so the upsert response is displayed as the output.
index.upsert(
    vectors=vectors
)

{'upserted_count': 37}

# Querying the Vectors
We shall now query the vectors to check that they have been stored correctly in the Pinecone index, and to see exactly how retrieval works. We will fetch the relevant vectors from the Pinecone index. For that, we will create a function which takes a text query, converts it into an embedding and queries the Pinecone index to get the most similar texts from the vectors stored in the index.

In [106]:
def get_relevant_chunks(query, top_k=5):
    """Return the stored chunks most similar to ``query``.

    The query text is embedded with get_embedding() and sent to the Pinecone
    index; only metadata (not the stored vectors) is requested back.

    Args:
        query: The text to search for.
        top_k: Maximum number of matches to request from the index.
            Defaults to 5.

    Returns:
        A list of dicts, one per match in the order returned by the index,
        each with ``"score"`` (the match score) and ``"text"`` (the chunk text
        stored under the ``"chunk"`` metadata key).
    """
    query_vector = get_embedding(query)

    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )

    return [
        {"score": match["score"], "text": match["metadata"]["chunk"]}
        for match in results["matches"]
    ]

Finally, we can create a QA system which will take a query and return the most relevant chunks from the PDF document.

In [None]:
# Minimal interactive loop: type a question to see the best-matching chunks,
# or "exit" to stop.
while True:
    # Surrounding whitespace is dropped so blank-looking input is skipped
    # rather than sent to the embedding API.
    user_input = input("Input Prompt: ").strip()
    if user_input == 'exit':
        print( 'Exiting')
        # Leave the loop normally; sys.exit() would raise SystemExit inside the kernel.
        break
    if user_input == '':
        continue

    docs = get_relevant_chunks(user_input)

    # The value printed after "Rank" is the similarity score of the match.
    for doc in docs:
        print(f"Rank {doc['score']} \n Answer: \n {doc['text']}")
        print("------------------------")

    print("------------------------------------------------------------------------------------------------------------------------")
        

With this, our pipeline is complete and we can now move on to the next step, which is sending these relevant documents to the LLM to answer our query.