## IPython notebook: information retrieval / Q&A over a Word document using LangChain, FAISS, and OpenAI (API key required)

In [None]:
# Uncomment to install if needed; %pip installs into the environment of the running kernel.
# NOTE(review): later cells also import the OpenAI, FAISS and Unstructured integrations,
# so additional packages are presumably required — verify for your environment.
#%pip install langchain

In [1]:
# Read the OpenAI API key from the environment so it is never hardcoded in the notebook.
import os

try:
    OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
except KeyError:
    # Same exception type as before, but with a message that says how to fix it.
    raise KeyError(
        "OPENAI_API_KEY is not set; export it in the environment before starting the notebook kernel."
    ) from None

* Load the Word document and split it into overlapping chunks ("split-documents")

In [3]:
# Load the Word document and split it into overlapping chunks ready for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredWordDocumentLoader

# Inputs a reader may want to tune, gathered in one place.
DOCX_PATH = "./R1-51-short.docx"
CHUNK_SIZE = 1000     # passed to the splitter as chunk_size
CHUNK_OVERLAP = 200   # passed to the splitter as chunk_overlap

loader = UnstructuredWordDocumentLoader(DOCX_PATH)
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
texts = text_splitter.split_documents(data)

# Count characters over every loaded document: indexing data[0] would raise
# IndexError on an empty load and under-report when several documents are returned.
total_chars = sum(len(doc.page_content) for doc in data)
print(f'Loaded {len(data)} document(s) with {total_chars} characters, and split into {len(texts)} split-documents.')

Loaded 1 document(s) with 50455 characters, and split into 74 split-documents.


* Obtain embeddings and vector store

In [10]:
# Create the OpenAI embedding client, authenticated with the key read earlier.
# Nothing is embedded here; this object is handed to the vector store later.
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [11]:
# Build a FAISS vector store from the split-documents using the embedding client.
from langchain.vectorstores import FAISS

db = FAISS.from_documents(
    documents=texts,
    embedding=embeddings,
)

* Use the FAISS retriever, which returns a list of split-documents relevant to the input query

In [12]:
# Wrap the FAISS vector store in a retriever object, using its default settings
# (no arguments are passed). The retrieval examples below query through it.
retriever = db.as_retriever()

In [None]:
# Retrieval example 1: fetch the split-documents for a structural question
# and print their text, separated by blank lines.
docs = retriever.get_relevant_documents("How many sections are included?")
chunk_texts = [doc.page_content for doc in docs]
print("\n\n".join(chunk_texts))

In [None]:
# Retrieval example 2: a broad query; print every retrieved chunk under its own "Text:" header.
docs = retriever.get_relevant_documents("can you summarize the doc?")
# Prefix each chunk with the header rather than using it as the join separator,
# which left the first chunk unlabeled.
print("\n".join(f"Text:\n{doc.page_content}" for doc in docs))

* Use FAISS similarity search

In [None]:
# Query the vector store directly (no retriever wrapper) and print each hit's
# text, separated by a blank line.
docs = db.similarity_search("srs cyclic shift hopping")
print(*(doc.page_content for doc in docs), sep="\n\n")

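* Use FAISS similarity search with scores, and print the top-ranked split-document together with its score. Note: with LangChain's default FAISS index the score is an L2 distance, so a lower value means a closer match.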

In [None]:
# Each result is a (document, score) pair; report the first (top-ranked) one.
docs = db.similarity_search_with_score("srs cyclic shift hopping")
best_doc, best_score = docs[0]
print("MOST RELEVANT: {} \n\nTHE SCORE IS {}".format(best_doc.page_content, best_score))
