In [40]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts.prompt import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import ChatVectorDBChain
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores.base import VectorStoreRetriever


In [None]:
! pip install unstructured

In [None]:
# Location of the documentation tree to index; change this if the docs live elsewhere.
DOCS_DIR = '/mnt/code/documentation-main/content/user_guide/'

# `recursive` is a boolean flag. The previous value, the string 'true', only
# worked because any non-empty string is truthy (the string 'false' would have
# enabled recursion as well), so pass a real boolean.
loader = DirectoryLoader(DOCS_DIR, recursive=True)
docs = loader.load()
# Show how many documents were loaded.
len(docs)

In [None]:
# Chunk your data up into smaller documents.
# The splitter is created with no arguments, so the library's default chunking
# settings apply.
text_splitter = RecursiveCharacterTextSplitter()
texts = text_splitter.split_documents(docs)
# Display the first two chunks as a quick sanity check.
texts[:2]

In [16]:
# Report how many chunks the splitter produced.
print(f"There are now {len(texts)} documents")

There are now 654 documents


In [17]:
# Create embeddings of your documents to get ready for semantic search.

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
import pickle

embeddings = OpenAIEmbeddings()
# Index the Document objects themselves instead of only their page_content, so
# each chunk's metadata is kept in the store alongside its text.
store = FAISS.from_documents(texts, embeddings)
# Persist the index so a later session can reuse it without re-embedding.
# NOTE: this file is read back with pickle.load, which can execute arbitrary
# code; only load a copy that comes from a trusted location.
with open("faiss_doc_store.pkl", "wb") as f:
    pickle.dump(store, f)

In [20]:
# Prompt that asks the model to rewrite a follow-up question as a standalone
# question. It has two placeholders: {chat_history} (the conversation so far)
# and {question} (the follow-up input).
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

# from_template builds the PromptTemplate directly from the template string.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [30]:
# Answering prompt: instructs the model to answer from the supplied document
# extracts, to admit when it does not know, and to decline off-topic questions.
# {question} receives the user's question and {context} the extracted document parts.
template = """You are an AI assistant for answering questions about information in Domino Data Labs product documentation.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about AI or ML or data science or MLOps or related to Domino Data Lab, politely inform them that you are tuned to only answer questions about MLOps, data science and Domino Data Lab.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
# The input variables are declared explicitly and match the two placeholders above.
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])

In [46]:
def get_condense_prompt_qa_chain(store, model_name="gpt-3.5-turbo", temperature=0):
    """Build a conversational retrieval chain over ``store``.

    The chain uses the notebook's CONDENSE_QUESTION_PROMPT to rewrite follow-up
    questions and QA_PROMPT to answer from the retrieved documents. A
    ConversationBufferMemory is attached under the ``chat_history`` key, so
    callers only pass ``{"question": ...}``.

    Args:
        store: Vector store to retrieve documents from.
        model_name: Chat model name passed to ChatOpenAI.
        temperature: Sampling temperature passed to ChatOpenAI.

    Returns:
        The ConversationalRetrievalChain created by ``from_llm``.
    """
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)
    memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True)
    # see: https://github.com/langchain-ai/langchain/issues/5890
    model = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=VectorStoreRetriever(vectorstore=store),
        memory=memory,
        # Pass the notebook's condense prompt explicitly; without this argument
        # the library's built-in default is used and the custom prompt is ignored.
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        combine_docs_chain_kwargs={"prompt": QA_PROMPT})
    return model

In [22]:
# Restore the vector store from the pickle file when this session has not
# already produced one; adjust the path if the file lives elsewhere.
if not ('store' in locals() and store is not None):
    with open("faiss_doc_store.pkl", "rb") as pickled_store:
        store = pickle.load(pickled_store)
        

In [47]:
# Build the question-answering chain on top of the vector store.
qa = get_condense_prompt_qa_chain(store=store)

In [52]:
# Ask the chain a first question; the returned mapping is kept in `result`.
result = qa({"question": "What is a Domino data set"})

In [53]:
# Display just the answer text from the chain's result.
result["answer"]

'A Domino data set is a managed folder within Domino Datasets that allows you to store and manage data within the Domino system. It provides several advantages over storing data in project files, such as the ability to store more files, bigger files, and access them faster. There is no limit to the number of files that can be stored in a Domino data set, and there is no limit to the size of any individual file stored in a data set. Additionally, data sets are attached to executors as networked file systems, eliminating the need to transfer their contents when starting a run or workspace. This makes it easier to organize and share curated subsets of data with your team members.'

In [55]:
# Ask the same question again on the same chain object.
# NOTE(review): the recorded answer differs from the first one; presumably this
# is because `qa` carries conversation memory, so the earlier exchange is part
# of the input the second time. Confirm if identical answers are expected.
qa({"question": "What is a Domino data set"})['answer']

"The advantages of using a Domino data set for storing and managing data within the Domino system are:\n\n1. Support for larger data: Domino data sets can handle much larger data than project artifacts, allowing you to store and manage data sets up to ~1TB per data set and hundreds of TB across multiple data sets.\n\n2. Reproducibility: Domino data sets support snapshots, which means you can version your data sets and easily reproduce previous versions. This is particularly useful for training sets that can't easily be shared or controlled outside of Domino.\n\n3. Flexibility in accessing data: Domino data sets provide a single interface for accessing all of your data, regardless of where it lives. You can connect Domino to popular data services using data source connectors or directly connect to any data service using the same code you use in your local environment.\n\n4. Shareability: Domino data sets are shareable within the Domino system, allowing you to easily collaborate and shar