In [4]:
pip install -r ./requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment before
# any API client is constructed. override=True asks python-dotenv to let values
# from the file replace variables that are already set in the environment.
# NOTE(review): presumably this is where the OpenAI and Pinecone credentials come
# from, since no key is passed explicitly anywhere in the cells below -- confirm.
load_dotenv(find_dotenv(), override=True)

True

In [3]:
def load_pdf(file):
    """Load a PDF and return its content as LangChain documents.

    Args:
        file: Location of the PDF; it is passed unchanged to ``PyPDFLoader``.

    Returns:
        The value of ``PyPDFLoader(file).load()`` -- a list of ``Document``
        objects whose metadata carries the source path and the page number.
    """
    # Use the langchain_community package, as the vector-store import in this
    # notebook already does; `langchain.document_loaders` is only a deprecated
    # re-export of the same loader.
    from langchain_community.document_loaders import PyPDFLoader

    print(f'Loading {file}')
    return PyPDFLoader(file).load()

In [19]:
# Smoke-test load_pdf on the sample file; `data` is consumed by the chunking test cell.
data = load_pdf('./test.pdf')

# Optional inspection helpers -- uncomment to look at the loaded documents.
# NOTE(review): indices 5 and 10 assume the PDF yields at least 11 documents -- confirm.
#print(data[5].page_content)
#print(data[10].metadata)
#print(f'There are {len(data)} pages in your PDF')

Loading ./test.pdf


Document(page_content='', metadata={'source': './test.pdf', 'page': 0})

In [4]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks ready for embedding.

    Args:
        data: Documents to split (e.g. the list returned by ``load_pdf``).
        chunk_size: Maximum chunk size handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. The default of 0 reproduces the value that used to be
            hard-coded, so existing callers see no change.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [10]:
# Smoke-test chunk_data on the documents loaded from test.pdf (default chunk_size).
chunks = chunk_data(data)
# Optional inspection helpers -- uncomment to look at the resulting chunks.
#print(len(chunks))
#print(chunks[10].page_content)

In [5]:
 # Embedding and Uploading to Vector Database (Pinecone)

def insert_or_fetch_embeddings(index_name, chunks, dimensions=1536):
    """Return a Pinecone-backed vector store for ``index_name``.

    If an index with that name already exists it is reused and ``chunks`` is
    ignored. Otherwise a serverless index (cosine metric, aws / us-east-1) is
    created and ``chunks`` are embedded and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to reuse or create.
        chunks: Documents to embed when the index has to be created.
        dimensions: Embedding size. The same value is used for the OpenAI
            embeddings and for the created index so the two cannot drift
            apart. Defaults to 1536, the value previously hard-coded twice.

    Returns:
        The vector store built by ``Pinecone.from_existing_index`` or
        ``Pinecone.from_documents``.

    Raises:
        ValueError: If the index does not exist yet and ``chunks`` is empty or
            ``None``. Failing before ``create_index`` avoids leaving behind an
            empty index that later calls would silently "fetch".
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimensions)

    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists!')
        print('Loading embeddings')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Embeddings successfully loaded!')
        return vector_store

    # Validate before creating anything on the Pinecone side.
    if not chunks:
        raise ValueError(
            f'Cannot create index {index_name}: no chunks were provided to embed.'
        )

    print(f'Creating index {index_name} and embeddings...')
    pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Embeddings successfully loaded!')
    return vector_store

In [6]:
# Deleting indexes from Pinecone

def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default sentinel ``'all'``
            deletes every index returned by ``list_indexes().names()``.

    Returns:
        None. Progress is reported through ``print``.
    """
    import pinecone

    client = pinecone.Pinecone()

    # Work out what to delete and which banner to show, then run one shared loop.
    if index_name == 'all':
        targets = client.list_indexes().names()
        banner = 'Deleting all indexes'
    else:
        targets = [index_name]
        banner = f'Deleting index {index_name}'

    print(banner)
    for target in targets:
        client.delete_index(target)
    print('Done!')

In [18]:
# Testing the embedding and index-deletion functions.
# WARNING: called without arguments this uses index_name='all', i.e. it deletes
# every index that list_indexes() reports -- not only the one used in this notebook.

delete_pinecone_index()

Deleting all indexes
Done!


In [11]:
# Name of the Pinecone index that backs this notebook's vector store.
index_name = 'talk2pdf'
# Reuses the index when it already exists; otherwise creates it and uploads `chunks`.
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Index talk2pdf already exists!
Loading embeddings
Embeddings successfully loaded!


In [16]:
# Asking and Getting Answers

def ask_and_get_answer(vector_store, q, k=3):
    """Answer a question using documents retrieved from ``vector_store``.

    Args:
        vector_store: Store exposing ``as_retriever`` (e.g. the value returned
            by ``insert_or_fetch_embeddings``).
        q: The question to ask.
        k: Number of similar chunks to retrieve for the prompt. Defaults to 3,
            the value that used to be hard-coded.

    Returns:
        The answer text, i.e. the ``'result'`` entry of the chain output.
    """
    from langchain.chains import RetrievalQA
    # Same import location the rest of this notebook uses for OpenAI models;
    # `langchain.chat_models` is a deprecated alias.
    from langchain_openai import ChatOpenAI

    # NOTE(review): temperature=1 is kept as-is, but the conversational chain in
    # this notebook uses temperature=0 -- confirm the difference is intended.
    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

    # `Chain.run` is deprecated; `invoke` takes the input under the chain's
    # 'query' key and returns a dict whose 'result' entry is the answer.
    return chain.invoke({'query': q})['result']

In [None]:
import time

# Interactive question/answer loop over the PDF stored in `vector_store`.
# Every question is answered independently through ask_and_get_answer.
i = 1
print('Ask me anything about your PDF!')
# The former literal 'Write ''Exit'' to quit' was implicit string concatenation
# and printed "Write Exit to quit" without the intended quotes.
print("Write 'Exit' to quit")
while True:
    # Strip surrounding whitespace so " exit " still quits the loop.
    q = input(f'Question {i}: ').strip()
    if not q:
        # Blank input: ask again instead of sending an empty question to the model.
        continue
    i+=1
    if q.lower() == 'exit':
        print('Exited Successfully!')
        # NOTE(review): the 2 second pause is kept from the original; presumably
        # it lets the message show up before the cell finishes -- confirm.
        time.sleep(2)
        break
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" *50}')

In [24]:
# Conversational retrieval: the same vector store, plus chat history kept in memory.

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI

# Chat model used by the chain (temperature 0).
llm = ChatOpenAI(
    temperature=0,
    model='gpt-3.5-turbo',
)

# Similarity search returning the 5 closest chunks per query.
retriever = vector_store.as_retriever(
    search_kwargs={'k': 5},
    search_type='similarity',
)

# Buffer memory that exposes the conversation under the 'chat_history' key as messages.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
)

# Assemble the chain from the three pieces above; verbose=True is passed through to the chain.
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    memory=memory,
    verbose=True,
)