## IPython notebook for conversational retrieval/Q&A over multiple Word documents using LangChain, OpenAI (API key required), and Pinecone (API key required). The vectors obtained from the documents are added to a Pinecone index.

In [None]:
#!pip3 install langchain pinecone-client	# install if needed
#!pip3 install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"

* Load multiple docx files with the given filenames

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import UnstructuredWordDocumentLoader

# Word documents to load; replace these example paths with your own files.
file_paths = ["./R1-51-short.docx","./Rdoc.docx"]

# Accumulates the split-documents of every file, in file order.
all_texts = []

# Chunks of at most 1000 characters, with 200 characters shared between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

for file in file_paths:
    loader = UnstructuredWordDocumentLoader(file)
    data = loader.load()
    texts = text_splitter.split_documents(data)
    # Count characters across every loaded document: indexing data[0] raised
    # IndexError when the loader returned nothing and under-reported the size
    # when it returned more than one document.
    total_chars = sum(len(doc.page_content) for doc in data)
    print(
        f'Loaded {len(data)} document(s) with {total_chars} characters, and split into {len(texts)} split-documents.')
    all_texts.extend(texts)

* OpenAI API key (read from the environment: .bashrc, Windows environment variables, a .env file, etc.) and embeddings

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
import os

# Take the key from the environment so it is never written into the notebook;
# a missing variable raises KeyError right here.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Embedding client used to turn text chunks and queries into vectors.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

* Set up Pinecone env

In [None]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

# Embedding client built from the OpenAI key read earlier in the notebook.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

# Pinecone credentials come from the environment rather than being hard-coded;
# a missing variable raises KeyError.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_API_ENV = os.environ["PINECONE_API_ENV"]

# Initialize the pinecone client with those credentials.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

* Store the vectors to the specified Pinecone index

In [None]:
# The index name, which can be stored in pinecone.io as long-term memory.
index_name = "langchaints"  # example pinecone index; replace by yours

# Create the index on first use only.
# NOTE(review): 1536 is assumed to be the vector size produced by the default
# OpenAIEmbeddings model (text-embedding-ada-002) - confirm if the embedding
# model is changed.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=1536, metric="cosine", shards=1)

# Use from_documents instead of from_texts on the bare page_content: passing
# only the text dropped every chunk's metadata, so the 'source' that the final
# cell tries to print was never stored with the vectors.
docsearch = Pinecone.from_documents(all_texts, embeddings, index_name=index_name)

* Use the OpenAI LLM gpt-3.5-turbo. Set the temperature to 0 if you do not want it to make things up. Then set up the ConversationalRetrievalChain on top of the Pinecone retriever.

In [5]:
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

# Chat model; temperature 0 is used so that it does not make up things.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

# Conversational Q&A chain over the Pinecone-backed retriever. The source
# documents are returned alongside the answer.
CRqa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

* Initialize chat history. Provide a prompt and generate a reply.

In [None]:
# First turn of the conversation: there is no history yet.
chat_history = list()
query = "What is the summary of the documents on RS?"

# The chain is called with the question and the (empty) chat history.
result = CRqa(dict(question=query, chat_history=chat_history))

# Last expression of the cell, so the notebook displays the answer.
result["answer"]

* Update the chat history, and provide another prompt. Generate a reply and also print out the source.

In [None]:
# Add the previous turn to the history (appending keeps any earlier turns
# instead of overwriting them), then ask a follow-up question.
chat_history.append((query, result["answer"]))
query = "How is it defined in the context?"
result = CRqa({"question": query, "chat_history": chat_history})

# Print explicitly: a bare expression is only displayed when it is the last
# statement of a cell, so the answer was silently dropped here before.
print(result['answer'])

# List each distinct source of the retrieved documents once, in retrieval
# order. Documents without metadata or without a 'source' entry are skipped,
# and an empty result no longer raises IndexError.
sources = [
    doc.metadata.get('source')
    for doc in result['source_documents']
    if doc.metadata
]
for source in dict.fromkeys(s for s in sources if s):
    print(source)