In [None]:
# Load the libraries that are needed
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain.callbacks import get_openai_callback
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory


import os
import pinecone
import pickle

In [None]:
# Read the source PDF into LangChain document objects.
# The location is an absolute path; adjust it for your environment.
PDF_PATH = "/mnt/ETF_Docs/Select_Global_Value_Fund.pdf"
loader = UnstructuredPDFLoader(PDF_PATH)
data = loader.load()

In [None]:
# Quick sanity check on what was loaded: number of documents,
# then the character count of the first document's text.
doc_count = len(data)
print (f'You have {doc_count} document(s) in the dataset')
first_doc_chars = len(data[0].page_content)
print (f'There are {first_doc_chars} characters in the document')

In [None]:
# Break the loaded document into smaller pieces for embedding:
# chunk size 1000, no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

# Preview the first two chunks as the cell output
texts[:2]

In [None]:
# Number of chunks produced by the splitter
chunk_count = len(texts)
print (f'There are now {chunk_count} documents')

In [None]:
# Set up the embedding client that will vectorise the chunks for semantic search.
# The OpenAI key is taken from the environment (None when the variable is unset),
# so it never has to be written into the notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
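# Optional: store the embeddings in Pinecone instead of FAISS (uncomment to use)
# import pinecone
# from langchain.vectorstores import Pinecone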

# PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

# # initialize pinecone
# pinecone.init(
#     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
#     environment=PINECONE_API_ENV  # next to api key in console
# )
# index_name = "vanguard-etf"

# # Generate and store the embeddings in Pinecone
# docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [None]:
# Embed the text of every chunk, build a FAISS index from the result,
# and persist that index locally as a pickle file.
chunk_texts = [doc.page_content for doc in texts]
store = FAISS.from_texts(chunk_texts, embeddings)
with open("faiss_etf_doc_store.pkl", "wb") as pickle_file:
    pickle.dump(store, pickle_file)


In [None]:
# Prompt text that asks the model to rewrite a follow-up question as a
# standalone question. It has two placeholders: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

# Prompt object built from the template string above; it is passed to the
# chain as `condense_question_prompt` when the chain is created.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [None]:
# Prompt text for the answering step. It has two placeholders:
# {question} (the user's question) and {context} (the extracted document parts).
# NOTE(review): the prompt text contains "Vanguards" (missing apostrophe) and a
# duplicated "or or"; it is left untouched here because it is runtime text sent
# to the model -- confirm whether it should be corrected.
template = """You are an AI assistant for answering questions about information in Vanguards ETF documentation.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about investments, economics, finance or ML or or related to Vanguard, politely inform them that you are tuned to only answer questions about the finance industry.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
# Prompt object with its input variables declared explicitly; it is passed to
# the chain as `qa_prompt` when the chain is created.
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])

In [None]:
# Reuse the index if this session already built one; otherwise fall back to the
# pickled copy on disk (change the location if needed).
# NOTE: pickle.load can execute arbitrary code -- only open pickle files you
# created yourself.
if locals().get('store') is None:
    with open("faiss_etf_doc_store.pkl", "rb") as pickle_file:
        store = pickle.load(pickle_file)
        

In [None]:
def get_chat_history(inputs) -> str:
    """Render the chat history as ``Human:``/``AI:`` lines for the prompt.

    The history can arrive in several shapes depending on how the chain's
    memory is configured, so all of them are accepted:

    * a sequence of ``(human, ai)`` pairs -- each pair is rendered as
      ``"Human:<human>\\nAI:<ai>"`` (the original behaviour);
    * a sequence of message objects exposing ``.type`` and ``.content``
      (what a memory created with ``return_messages=True`` hands over) --
      each message is rendered on its own line with the matching label;
    * an already-rendered string -- returned unchanged;
    * ``None`` or an empty history -- rendered as the empty string.

    Args:
        inputs: The chat history in one of the shapes listed above.

    Returns:
        The history as a single newline-joined string.
    """
    if not inputs:
        return ""
    if isinstance(inputs, str):
        # Already rendered as text; nothing to format.
        return inputs

    role_labels = {"human": "Human", "ai": "AI"}
    res = []
    for turn in inputs:
        if hasattr(turn, "content"):
            # Message object: must be checked before unpacking, because such
            # objects may themselves be iterable and would unpack into garbage.
            msg_type = getattr(turn, "type", "")
            res.append(f"{role_labels.get(msg_type, msg_type)}:{turn.content}")
        else:
            human, ai = turn
            res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)

In [None]:
# If you already have a Pinecone index, you can load it like this
# (requires pinecone to be initialised and `index_name` to be set, plus
# `from langchain.vectorstores import Pinecone`):

# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# store = None
# # check if the index already exists; if it does, connect to it
# if index_name in pinecone.list_indexes():
#     # connect to index
#     store = Pinecone.from_existing_index(index_name, embeddings)

In [None]:
# Conversation memory: keeps the dialogue under the "chat_history" key
# (configured with return_messages=True).
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Conversational retrieval chain wired to the vector store's retriever, the two
# custom prompts, the memory above and the custom chat-history formatter.
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0),
    store.as_retriever(),
    get_chat_history=get_chat_history,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    qa_prompt=QA_PROMPT,
    memory=memory,
)
# Alternative that relies on the chain's default prompts:
# qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), store.as_retriever(), memory=memory, get_chat_history=get_chat_history)

In [None]:
# Interactive question/answer loop.
#   quit()          -> leave the loop
#   clear_history() -> wipe the conversation memory and keep going
# Commands are matched case-insensitively and ignore surrounding whitespace.
# Everything runs inside the OpenAI callback context so the token total can be
# reported once the loop ends.
with get_openai_callback() as cb:
    while True:
        print("Human:")
        try:
            question = input().strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session the same
            # way "quit()" does, so the token total below is still printed.
            question = None
            break

        command = question.lower()
        if command == "quit()":
            question = None
            break
        if command == "clear_history()":
            qa.memory.clear()
            continue
        if not question:
            # Nothing typed: do not spend an API call on an empty query.
            continue

        print("AI:")
        print(qa.run(question))

print(f"Total Tokens: {cb.total_tokens}")