In [None]:
!pip install langchain_community tiktoken langchain-openai langchainhub langchain pinecone pypdf langchain_pinecone --upgrade pinecone-client

In [82]:
import os
import unstructured
from getpass import getpass

# LangSmith tracing configuration (non-secret values).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# NOTE: despite its name, PINECONE_ENV is read later as the Pinecone *index name*.
os.environ['PINECONE_ENV'] = 'rag'

# Secrets are never written into the notebook: a key that is already exported in the
# environment is kept as-is, otherwise it is requested interactively (input is not echoed
# and does not end up in the saved notebook).
for _secret_name in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY', 'PINECONE_API_KEY'):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f'{_secret_name}: ')

In [88]:
import openai
from pinecone import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber
from langchain.schema import Document

In [146]:
def _table_to_text(table):
    """Render one extracted table as text: cells joined by tabs, rows by newlines."""
    return "\n".join(
        "\t".join("" if cell is None else cell for cell in row)
        for row in table
    )


def read_data_from_doc(pdf_path):
    """Build one Document per non-empty page of a PDF.

    For every page the content is assembled from up to three sections, in
    this order: the page's extracted text, its tables rendered as
    tab-separated rows, and a "[N image(s) detected]" marker when the page
    reports images. Only sections that are non-empty are included, separated
    by a single blank line, so a missing section no longer leaves a run of
    empty lines in the middle of the content.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of Document objects whose ``metadata`` is ``{"page": n}`` with
        ``n`` being the 1-based page number. Pages whose assembled content is
        empty after stripping whitespace are skipped.
    """
    docs = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() may return None; treat that as no text.
            text = page.extract_text() or ""

            tables = page.extract_tables() or []
            # Empty tables are skipped, as before.
            table_text = "\n".join(_table_to_text(table) for table in tables if table)

            images = page.images or []
            image_text = f"[{len(images)} image(s) detected]" if images else ""

            # Join only the sections that exist so the separator appears
            # between sections, never in place of a missing one.
            sections = [part for part in (text, table_text, image_text) if part]
            content = "\n\n".join(sections).strip()
            if content:
                docs.append(Document(page_content=content, metadata={"page": page_number}))
    return docs

In [147]:
# Location of the source PDF. The original absolute path remains the default so existing
# runs behave the same; set RAG_CORPUS_PDF to point the notebook at a different file
# without editing this cell.
PDF_PATH = os.environ.get('RAG_CORPUS_PDF', '/Users/aditya/Desktop/RAG/Corpus.pdf')
data = read_data_from_doc(PDF_PATH)

In [149]:
def make_chunks(docs, chunk_len=1000, chunk_overlap=200):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split.
        chunk_len: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunk documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_len, chunk_overlap=chunk_overlap
    )
    # split_documents already returns Document objects that carry the source
    # document's metadata, so they are returned directly rather than being
    # copied into new Document instances one by one.
    return text_splitter.split_documents(docs)

In [150]:
# Chunk the per-page documents using make_chunks' default size and overlap.
splits = make_chunks(docs=data)

In [151]:
# Embedding model handed to the vector store; the API key is read from the environment.
embeddings = OpenAIEmbeddings(
    api_key=os.environ['OPENAI_API_KEY'],
)

In [152]:
# Pinecone client handle, authenticated with the key from the environment.
# NOTE(review): `pc` is not referenced by any later cell visible here.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

In [153]:
# NOTE(review): text_field is defined but not passed to the vector store in this cell.
text_field = "text"

# The index name is read from the (misleadingly named) PINECONE_ENV variable.
pinecone_index_name = os.environ['PINECONE_ENV']

# Embed the chunks and store them in the named Pinecone index.
vectorstore = PineconeVectorStore.from_documents(
    splits,
    embeddings,
    index_name=pinecone_index_name,
)

In [154]:
# Chat model that produces the final answer from the stuffed prompt.
llm = ChatOpenAI(
    temperature=0.75,
    model_name="gpt-3.5-turbo",
)

In [161]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model. `{context}` is filled with the retrieved
# documents by the stuff-documents chain; `{additional_context}` is supplied by the caller.
# The conflicting-information example is listed once (it was previously duplicated verbatim).
system_prompt = """
You are an AI assistant answering questions based on retrieved documents and additional context. 
Use the provided context from both database retrieval and additional sources to answer the question. 

- **Discard irrelevant context:** If one of the contexts (retrieved or additional) does not match the question, ignore it.
- **Highlight conflicting information:** If multiple sources provide conflicting information, explicitly mention it by saying:
  - "According to the retrieved context, ... but as per internet sources, ..."
- **Prioritize accuracy:** If neither context provides a relevant answer, say "I don't know" instead of guessing.

Provide concise yet informative answers, ensuring clarity and completeness.

Retrieved Context: {context}
Additional Context: {additional_context}
"""

# The human turn carries only the question. Both context blocks are already rendered in
# the system message, so repeating them here would send every retrieved chunk to the
# model twice and double the prompt size for no added information.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Combine the retrieved documents into the prompt and pass it to the chat model.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Similarity search returning the 3 closest chunks for each question.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Full pipeline: retrieve with `input`, then answer using the retrieved documents.
chain = create_retrieval_chain(retriever, question_answer_chain)

In [162]:
def chat_with_llm(question, additional_context=""):
    """Run the retrieval chain for one question.

    Args:
        question: Text sent to the chain under the "input" key.
        additional_context: Extra text sent under the "additional_context"
            key; defaults to an empty string.

    Returns:
        Whatever ``chain.invoke`` returns for that payload.
    """
    return chain.invoke(
        dict(input=question, additional_context=additional_context)
    )

In [163]:
# Example query; no additional context is supplied, so the default empty string is used.
out = chat_with_llm(question='What does this company sell?')

In [164]:
# Display the value stored under the 'answer' key of the chain result.
out['answer']

'This company sells wine, specifically Napa Valley Cabernet Sauvignon aged for 12 years in French oak. The wine offers pronounced aromas and a decadent palate. They also utilize the Solera method in the production of fortified wines like Sherry and Madeira.'