# Dependencies

In [2]:
from langchain.document_loaders import PyPDFDirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from pinecone import Pinecone, ServerlessSpec
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate 
from dotenv import load_dotenv
load_dotenv()
import os
import time

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV")

# Loading and chunking the Data
Here, we will use the PyPDFDirectoryLoader for the PDF from the langchain wrapper to load the PDF data and chunk it into paragraphs. First step includes loading the current PDF file into the loader and converting it to a list of documents. Each document is a list of pages, which consists the metadata of the source PDF and the page number.

In [60]:
loader = PyPDFDirectoryLoader("data")
data = loader.load()
data

[Document(page_content='Project\nReport\nSPG\nGroup\nProject\nJan\n2024\nCommunication\nand\nListening\nSkills\nTeam\n5\nTeam\nMembers\nHari\nPrapan\n(21f3002087)\nPragya\nSingh\n(21f3001204)\nVisist\nTallam\n(21f2001553)\nUllas\nKumar\n(21f3002619)\nDhruv\nPamneja\n(21f1001719)\n', metadata={'source': 'data/Team_5_SPG_GP2_Report_Jan_2024.pdf', 'page': 0}),
 Document(page_content="Abstract\nThe\nproject\nfocuses\non\nexploring\nthe\ncritical\nimportance\nof\ncommunication\nand\nlistening\nskills\nwithin\nthe\ncorporate\nworld.\nIt\ndelves\ninto\nthe\nfoundational\nrole\nthese\nskills\nplay\nin\norganizational\nsuccess,\nimpacting\nareas\nsuch\nas\nemployee\nengagement,\nteam\ncollaboration,\nand\noverall\nproductivity.\nThe\ndecision\nto\ninvestigate\nthis\ntopic\nstems\nfrom\nits\nrelevance\nin\ntoday's\nfast-paced\nbusiness\nlandscape,\nwhere\neffective\ncommunication\nis\nvital\nfor\nnavigating\ncomplex\nenvironments\nand\nachieving\norganizational\ngoals.\nBy\nconducting\nsemi-stru

Now, since the context window of our LLM will be limited, the ideal way to handle this is to chunk the data into paragraphs. This is done by the chunker, which takes the list of documents and returns a list of paragraphs within the chunk limit we set (500 words in this case). The chunker also takes care of the page breaks and ensures that the paragraphs are not split across pages. Also, we will be introducing an overlap, which will be the number of words that will be repeated in the end of one chunk and the beginning of the next chunk. This is done to ensure that the context is not lost between the chunks.

Now, we should be able to do the same for text data as well.

In [55]:
text_loader = TextLoader("data/W5_Code.txt")
text_data = text_loader.load()
text_data

[Document(page_content='from pyspark.sql import SparkSession\nfrom pyspark.sql.functions import last\n\nspark = SparkSession.builder.appName("SCD Type II Merge W5 GA").getOrCreate()\n\nmaster_file_path = "gs://week-5-ga/source_data-w5.csv"\nupdate_file_path = "gs://week-5-ga/update_data-w5.csv"\noutput_file_path = "gs://week-5-ga/master_data-w5.csv"\n\nmaster_df = spark.read.csv(master_file_path, header=True, inferSchema=True)\nupdate_df = spark.read.csv(update_file_path, header=True, inferSchema=True)\n\ncombined_df = master_df.union(update_df)\n\nfinal_df = combined_df.groupBy("Customer ID").agg(\n    last("Name").alias("Name"),\n    last("Address").alias("Address"),\n    last("Membership Start Date").alias("Membership Start Date"),\n    last("Membership End Date").alias("Membership End Date")\n)\n\nfinal_df.show()\nfinal_df.write.csv(output_file_path, header=True)', metadata={'source': 'data/W5_Code.txt'})]

Now, we will have to add a page (could be 0) in the metadata on this, as it expects it to be there in the final chain.

In [56]:
text_data[0].metadata["page"] = 0
text_data

[Document(page_content='from pyspark.sql import SparkSession\nfrom pyspark.sql.functions import last\n\nspark = SparkSession.builder.appName("SCD Type II Merge W5 GA").getOrCreate()\n\nmaster_file_path = "gs://week-5-ga/source_data-w5.csv"\nupdate_file_path = "gs://week-5-ga/update_data-w5.csv"\noutput_file_path = "gs://week-5-ga/master_data-w5.csv"\n\nmaster_df = spark.read.csv(master_file_path, header=True, inferSchema=True)\nupdate_df = spark.read.csv(update_file_path, header=True, inferSchema=True)\n\ncombined_df = master_df.union(update_df)\n\nfinal_df = combined_df.groupBy("Customer ID").agg(\n    last("Name").alias("Name"),\n    last("Address").alias("Address"),\n    last("Membership Start Date").alias("Membership Start Date"),\n    last("Membership End Date").alias("Membership End Date")\n)\n\nfinal_df.show()\nfinal_df.write.csv(output_file_path, header=True)', metadata={'source': 'data/W5_Code.txt', 'page': 0})]

After this, this chunk will follow the exact same steps as the PDF data.

In [61]:
text_split = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
text_chunks = text_split.split_documents(data)


text_chunks[:3]

[Document(page_content='Project\nReport\nSPG\nGroup\nProject\nJan\n2024\nCommunication\nand\nListening\nSkills\nTeam\n5\nTeam\nMembers\nHari\nPrapan\n(21f3002087)\nPragya\nSingh\n(21f3001204)\nVisist\nTallam\n(21f2001553)\nUllas\nKumar\n(21f3002619)\nDhruv\nPamneja\n(21f1001719)', metadata={'source': 'data/Team_5_SPG_GP2_Report_Jan_2024.pdf', 'page': 0}),
 Document(page_content="Abstract\nThe\nproject\nfocuses\non\nexploring\nthe\ncritical\nimportance\nof\ncommunication\nand\nlistening\nskills\nwithin\nthe\ncorporate\nworld.\nIt\ndelves\ninto\nthe\nfoundational\nrole\nthese\nskills\nplay\nin\norganizational\nsuccess,\nimpacting\nareas\nsuch\nas\nemployee\nengagement,\nteam\ncollaboration,\nand\noverall\nproductivity.\nThe\ndecision\nto\ninvestigate\nthis\ntopic\nstems\nfrom\nits\nrelevance\nin\ntoday's\nfast-paced\nbusiness\nlandscape,\nwhere\neffective\ncommunication\nis\nvital\nfor\nnavigating\ncomplex\nenvironments\nand\nachieving", metadata={'source': 'data/Team_5_SPG_GP2_Report_Ja

As we can see, each of our chunks is in the limit of 500 characters and the overlap is 20 characters, let us view the total number of chunks. 

In [62]:
print(f"Length of chunks : {len(text_chunks)}")

Length of chunks : 37


# Pinecone Initialization
Now, we will be using the pinecone vectorDB to store the embeddings of the chunks. We will be using the `pinecone.init()` function to initialize the pinecone environment. We will be using the `pinecone.use_index()` function to use the index created for this project and setup the instance for the same.

In [4]:
pc = Pinecone(api_key = PINECONE_API_KEY, environment = PINECONE_API_ENV)

Now, let us view the indexs avaliable in the pinecone environment.

In [5]:
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'documents-f1hj4e6.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'documents',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

So this is the one we will be using for storing the documnets of this project.

# Embedding the Chunks using OpenAI text-embedding-3-small
Here, we will be using the OpenAI text-embedding-3-small model to embed the chunks, for which we will need an openAI instance initialised.

In [8]:
openAI_client = OpenAI(api_key=OPENAI_API_KEY)

Let us go ahead and set the embeddings model and a function to get the embeddings of any given text via the text-embedding-3-small model.

In [69]:
embedding_model = openAI_client.embeddings

def get_embedding(text) :
    response = embedding_model.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

Now, each chunk will ideally below to a documnent stored in S3 bucket in AWS. So, for now we will assume that the **KEY** of that document is in the format as given below which we will be using in the metadata. To simiplify the process, each entry in the vectorDB should have : 

* **ID** : The unique ID of the document, which will be a combination of the document key and the chunk number.
* **VALUES** : The embedding of the chunk, as generated by the OpenAI text-embedding-3-small model.
* **METADATA** : The metadata of the document, which will include the document key and the chunk number.
    *  **USER_ID** : The ID of the user in our system.
    *  **DOCUMENT_TYPE** : The type of the document from either "pdf" or "text".
    *  **KEY** : The key of the document in the S3 bucket, which defines the location of the document.
    *  **CHUNK** : The text of the chunk.
    *  **PAGE_NUMBER** : The page number of the chunk in the document.
    *  **CHUNK_INDEX** : The number of the chunk in the document.

Since most of the information will be given by the backend server, for now we will use dummy values for the metadata.

In [73]:
KEY = "61f100abx/pdf/projectReport"
USER_ID = "61f100abx"
DOCUMENT_TYPE = "pdf"

Let us now create the function to create vectors in our desired format as defined above.

In [61]:
def create_vectors(text_chunks,KEY,USER_ID,DOCUMENT_TYPE):
    v = []
    chunk_num = 0
    
    for chunk in text_chunks: 
        page_num = chunk.metadata["page"]
        
        entry = {}
        entry["id"] = f"{KEY}_PAGE_{page_num}_CHUNK_{chunk_num}"
        entry["values"] = get_embedding(chunk.page_content)
        entry["metadata"] = {
            "userID" : USER_ID,
            "type" : DOCUMENT_TYPE,
            "key" : KEY,
            "chunk" : chunk.page_content,
            "page_number" : chunk.metadata["page"],
            "chunk_number" : chunk_num
        }
        
        chunk_num += 1
        v.append(entry)
        
    return v

In [None]:
vectors = create_vectors(text_chunks,KEY,USER_ID,DOCUMENT_TYPE)

With this, we have our vectors stored in the ideal format to be pushed into the vector DB. Let us now push the vectors into the vectorDB of pinecone.

# Pushing the Vectors into the Pinecone Index

In [101]:
index_name = "documents"
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

index.upsert(
    vectors=vectors
)

{'upserted_count': 37}

# Querying the Vectors
We shall now query the vectors to check if the vectors have been stored correctly in the pinecone index, and how does this exactly work. We will fetch the relevant vectors from the pinecone index. For that, we will create a function which takes a text query, converts into to an embedding and queries the pinecone index to get the most similar texts from the vectors stored in the index.

In [None]:
def get_relevant_chunks(query,userID,KEY):
    query_vector = get_embedding(query)
    
    results = index.query(
        vector = query_vector,
        top_k = 5,
        include_values = False,
        include_metadata = True,
        filter={
            "userID" : userID,
            "key" : KEY
        }
    )
    
    relevant_texts = []
    for record in results['matches']:
        text = {}
        text['score'] = record['score']
        text['text'] = record['metadata']['chunk']
        text["reference"] = int(record["metadata"]["page_number"]) + 1
        relevant_texts.append(text)
    
    return relevant_texts

Finally, we can create a QA system which will take a query and return the most relevant chunks from the PDF document.

In [75]:
import sys 
while True:
    user_input = input(f"Input Prompt: ")
    if user_input=='exit':
        print( 'Exiting')
        sys.exit()
    if user_input == '':
        continue
    
    docs = get_relevant_chunks(user_input,USER_ID,KEY)
        
    for doc in docs:
        print(f"Rank {doc['score']} \n Reference {doc['reference']} \n Answer: \n {doc['text']}")
        print("------------------------")

    print("------------------------------------------------------------------------------------------------------------------------")
        

Rank 0.353432596 
 Reference 9 
 Answer: 
 Project
Review
Name
and
Role
Task
Undertaken
Hari
Prapan
-
TEAM
LEAD
●
Project
Research
and
Approach 
●
Primary
&
Secondary
research 
●
Interview
of
Primary
Resource 
●
Poster
Creation
Pragya
Singh
-
Team
Member
●
Interview
of
Primary
Resource 
●
Primary
&
Secondary
research 
●
Reviewed
all
document
submissions
Visist
Tallam
-
Team
Member
●
Interview
of
Primary
Resource 
●
Primary
&
Secondary
research 
●
Reviewed
all
document
submissions
Ullas
Kumar
-
Team
Member
●
Compilation
of
Resources 
●
------------------------
Rank 0.347688705 
 Reference 9 
 Answer: 
 dedication
and
achievements
in
enhancing
communication
within
our
team.
Signed
&
Attested
Hari
Prapan,
Pragya
Singh,
Visist
Tallam,
Ullas
Kumar,
Dhruv
Pamneja
------------------------
Rank 0.319635719 
 Reference 3 
 Answer: 
 perspectives
from
industry
professionals,
our
team
seeks
to
address
this
gap
by
providing
nuanced
insights
and
practical
implications.
Research
Design
and
Sample
Se

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


With this, our pipeline is complete and we can now move on to the next steps which is sending these relvant documents to the LLM to answer our query.

# Prompt Template for the LLM
Here, we will need to define the prompt for the LLM to answer the query. The LLM will be given the query and the relevant documents, and it will be expected to return the answer to the query. 

In [38]:
query_prompt_template = """
    You are a specialised AI document analyser, and you will be assisting the users to answer their queries. You will be given 
    the top relevant documents and you have to use those to answer the query asked by the user, which will be given to you below. 
    In the relevant documents,you will be given the cosine similarity score, the reference (which is the page number where this 
    text was in the document) and the text itself. You can in you answer integrate the reference to build authenticity of your answer, 
    by precisely writing it like (reference page : page_num)
    
    \n\n User Query : {query}
    \n\n Documents : {documents}
    
    MAKE SURE YOU DO NOT ANSWER FROM ANYTHING APART FROM THE DOCUMENTS GIVEN TO YOU. 
"""

In [39]:
query_prompt = PromptTemplate(
    input_variables=["query","documents"],
    template=query_prompt_template
)

# Initializing the LLM Client and Chain for RAG Model

In [30]:
chat = ChatOpenAI(
    temperature = 0,  
    model = "gpt-4o",
    openai_api_key = OPENAI_API_KEY
)

In [40]:
query_chain = LLMChain(
    llm=chat,
    prompt=query_prompt
)

# Q&A System using the chain

In [None]:
user_query = "who worked on the compliation of the resources and final documents? tell me their details and roll numbers?"
docs = get_relevant_chunks(user_query,USER_ID,KEY)

# Run the chain
response = query_chain.invoke({
    "query": user_query,
    "documents": docs
})

# Print the response
print("Response from LLM:")
print(response['text'])

Response from LLM:
The compilation of resources and final documents was primarily handled by Ullas Kumar and Dhruv Pamneja. Ullas Kumar was responsible for the compilation of resources, while Dhruv Pamneja worked on the creation of the final documents (reference page: 9).

Here are the details and roll numbers of the team members involved:

- Hari Prapan (21f3002087) - Team Lead
- Pragya Singh (21f3001204) - Team Member
- Visist Tallam (21f2001553) - Team Member
- Ullas Kumar (21f3002619) - Team Member
- Dhruv Pamneja (21f1001719) - Team Member (reference page: 1)


Here, we have successfully built the RAG model and the Q&A system using the chain. With this, we get the functionality to query the relevant documents and get the answer to the query.

# Deleting via Pinecone (Listing indexes with prefix and then deleting the index)

In [None]:
index_name = "documents"


index = pc.Index(index_name)
list1 = []
for ids in index.list(prefix=KEY):
  list1.append(ids)

In [16]:
KEY = "674313bc5cbbf2d8da2a4649/pdf/"
res = index.delete([ids for ids in index.list(prefix=KEY)])
res

{}