In [None]:
pip install langchain langchain_groq langchain_pinecone pinecone-client pypdf langchain groq google-generativeai

In [1]:
import os
from langchain_groq import ChatGroq
import google.generativeai as genai
from langchain.document_loaders import UnstructuredPDFLoader, PyPDFLoader
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from pinecone import Pinecone
import pdfplumber
from langchain.schema import Document
from langchain_core.embeddings import Embeddings
from typing import List
import numpy as np

In [2]:
import os
import unstructured
# Runtime configuration for LangSmith tracing, Groq, Pinecone and Gemini.
# NOTE: PINECONE_ENV is read later in this notebook as the Pinecone *index name*.
_API_KEY_PLACEHOLDER = '<YOUR API KEY>'
_settings = {
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    'LANGCHAIN_API_KEY': '<YOUR API KEY>',
    'GROQ_API_KEY': '<YOUR API KEY>',
    'PINECONE_API_KEY': '<YOUR API KEY>',
    'PINECONE_ENV': 'gemini-rag',
    'GEMINI_API_KEY': '<YOUR API KEY>',
}
for _name, _value in _settings.items():
    # A value that is still the template placeholder must not clobber a real
    # key already exported in the shell; every other value is applied exactly
    # as before (including keys pasted into this cell).
    if _value != _API_KEY_PLACEHOLDER or _name not in os.environ:
        os.environ[_name] = _value

In [3]:
def read_data_from_doc(pdf_path):
    """Extract one ``Document`` per non-empty page of a PDF.

    For every page the plain text, any tables (rows joined with newlines,
    cells joined with tabs, missing cells rendered as empty strings) and a
    marker giving the number of detected images are combined into the page
    content. Only sections that actually contain something are joined, so an
    absent table no longer leaves a run of blank lines between the text and
    the image marker.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        list[Document]: Pages in document order, each with
        ``metadata={"page": <1-based page number>}``. Pages that yield no
        text, tables or images are skipped.
    """
    docs = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            table_text = "\n".join(
                "\n".join(
                    "\t".join("" if cell is None else str(cell) for cell in row)
                    for row in table
                )
                for table in (page.extract_tables() or [])
                if table
            )
            images = page.images
            image_text = f"[{len(images)} image(s) detected]" if images else ""
            # Filter on the stripped value but keep each section verbatim, so
            # leading tabs of a table row (empty first cell) are preserved.
            sections = [part for part in (text, table_text, image_text) if part.strip()]
            content = "\n\n".join(sections).strip()
            if content:
                docs.append(Document(page_content=content, metadata={"page": page_number}))
    return docs

In [4]:
# Location of the PDF to index. Set the RAG_PDF_PATH environment variable to
# point at another file instead of editing this machine-specific default.
PDF_PATH = os.environ.get('RAG_PDF_PATH', '/Users/aditya/Desktop/Aditya_s_Resume-4.pdf')
data = read_data_from_doc(PDF_PATH)

In [5]:
def make_chunks(docs, chunk_len=1000, chunk_overlap=200):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_len: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        list[Document]: A fresh ``Document`` per chunk, carrying the chunk's
        ``page_content`` and ``metadata``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_len, chunk_overlap=chunk_overlap)
    rebuilt = []
    for piece in splitter.split_documents(docs):
        rebuilt.append(Document(page_content=piece.page_content, metadata=piece.metadata))
    return rebuilt

In [6]:
# Split the extracted pages into chunks using make_chunks' default size and overlap.
splits = make_chunks(data)

In [7]:
# Configure the google.generativeai client with the key taken from the environment.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [8]:
import google.generativeai as genai
import numpy as np
from langchain.embeddings.base import Embeddings
from typing import List

class GeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by ``genai.embed_content``.

    Args:
        api_key: Key handed to ``genai.configure``. That is a module-level
            call on ``google.generativeai``, not per-instance state.
        model_name: Embedding model to call. Defaults to
            ``"models/embedding-001"``, the value that was previously
            hard-coded, so existing callers are unaffected.
    """

    def __init__(self, api_key, model_name="models/embedding-001"):
        genai.configure(api_key=api_key)
        self.model_name = model_name

    def _embed(self, text, task_type):
        """Embed a single text for ``task_type`` and return a list of floats."""
        response = genai.embed_content(model=self.model_name, content=text, task_type=task_type)
        return self._convert_to_float32(response["embedding"])

    def embed_documents(self, texts):
        """Embed each text with task type ``retrieval_document``.

        One ``embed_content`` call is issued per text, in input order.

        Returns:
            list[list[float]]: One embedding per input text.
        """
        return [self._embed(text, "retrieval_document") for text in texts]

    def embed_query(self, text):
        """Embed a query string with task type ``retrieval_query``.

        Returns:
            list[float]: The query embedding.
        """
        return self._embed(text, "retrieval_query")

    @staticmethod
    def _convert_to_float32(embedding):
        """Round-trip the values through a float32 array and return a plain list."""
        return np.array(embedding, dtype=np.float32).tolist()

In [9]:
# Embedding backend used for indexing the chunks into the vector store.
embeddings = GeminiEmbeddings(api_key=os.environ["GEMINI_API_KEY"])

In [10]:
# Pinecone client built from the API key in the environment.
# NOTE(review): `pc` is not referenced again in the cells shown here — confirm it is still needed.
pc = Pinecone(os.environ['PINECONE_API_KEY'])

In [11]:
# Embed every chunk and store it in the Pinecone index whose name is held in
# PINECONE_ENV. `text_field` names the metadata key that stores the raw chunk
# text; it is now passed explicitly instead of being defined and left unused
# ("text" is also the library default, so stored data is unchanged).
# NOTE(review): presumably re-running this cell upserts the same chunks again — confirm before re-executing.
text_field = "text"
vectorstore = PineconeVectorStore.from_documents(
    splits,
    embeddings,
    index_name=os.environ['PINECONE_ENV'],
    text_key=text_field,
)

In [12]:
# Groq model identifiers available to this notebook. The list is ordered:
# any code that picks an entry by position depends on this exact order.
models = [
    "gemma2-9b-it",
    "llama-3.3-70b-versatile",
    "llama-3.1-8b-instant",
    "llama-guard-3-8b",
    "llama3-70b-8192",
    "llama3-8b-8192",
    "mixtral-8x7b-32768",
    "deepseek-r1-distill-llama-70b",
    "llama-3.3-70b-specdec",
    "llama-3.2-1b-preview",
    "llama-3.2-3b-preview"
]

In [13]:
# Select the model by name rather than by list position, so reordering or
# editing `models` cannot silently switch the LLM. "llama3-8b-8192" is the
# entry that `models[5]` referred to.
GROQ_MODEL_NAME = "llama3-8b-8192"
llm = ChatGroq(model_name=GROQ_MODEL_NAME, temperature=0.75, api_key=os.environ['GROQ_API_KEY'])

In [14]:
# System message template. `{context}` and `{additional_context}` are template
# placeholders. The example sentence under "Highlight conflicting information"
# used to appear twice verbatim; it is now listed once.
system_prompt = """
You are an AI assistant answering questions based on retrieved documents and additional context. 
Use the provided context from both database retrieval and additional sources to answer the question. 

- **Discard irrelevant context:** If one of the contexts (retrieved or additional) does not match the question, ignore it.
- **Highlight conflicting information:** If multiple sources provide conflicting information, explicitly mention it by saying:
  - "According to the retrieved context, ... but as per internet sources, ..."
- **Prioritize accuracy:** If neither context provides a relevant answer, say "I don't know" instead of guessing.

Provide concise yet informative answers, ensuring clarity and completeness.

Retrieved Context: {context}
Additional Context: {additional_context}
"""

# Chat prompt: the system message already carries `{context}` and
# `{additional_context}`, so the human turn contains only the question.
# Repeating both contexts in the human message sent every retrieved chunk to
# the model twice.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [15]:
# Chain that feeds the supplied documents into `prompt` and calls `llm`.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

In [16]:
# Similarity-search retriever over the vector store, configured with k=4 results per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [17]:
# Full RAG pipeline: `retriever` supplies documents to `question_answer_chain`.
chain = create_retrieval_chain(retriever, question_answer_chain)

In [18]:
def chat_with_llm(question, additional_context=""):
    """Run the retrieval chain for a single question.

    Args:
        question: Text sent to the chain under the ``"input"`` key.
        additional_context: Extra text sent under the
            ``"additional_context"`` key; defaults to an empty string.

    Returns:
        Whatever ``chain.invoke`` returns for that payload.
    """
    return chain.invoke({"input": question, "additional_context": additional_context})

In [25]:
# Example query against the indexed document ("collage" corrected to "college",
# since the query text itself is what gets embedded for retrieval).
out = chat_with_llm('Tell me which college did Aditya Attend?')

In [26]:
# Display only the generated answer from the chain's result.
out['answer']

'According to the retrieved context, Aditya Singh attended Rajiv Gandhi Institute of Petroleum Technology (RGIPT) for his B.Tech. in Computer Science and Engineering, starting from October 2022.'