In [21]:
from langchain.callbacks import get_openai_callback
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

import os
import pickle
import requests

In [3]:
# Load every file under the docs directory, descending into sub-directories.
# `recursive` is a boolean flag: the previous string 'true' only worked because
# any non-empty string is truthy (so 'false' would have recursed as well).
loader = DirectoryLoader('/mnt/data/Docs_QA_AIHub', recursive=True)
docs = loader.load()
len(docs)

721

In [5]:
# Quick sanity check: corpus size and the length of the first document's text
doc_count = len(docs)
print(f'You have {doc_count} document(s) in the dataset')
first_doc_chars = len(docs[0].page_content)
print(f'There are {first_doc_chars} characters in the first document')

You have 721 document(s) in the dataset
There are 603 characters in the first document


In [6]:
# Split the loaded documents into smaller chunks, using the splitter's
# default settings (no arguments are passed to the constructor).
text_splitter = RecursiveCharacterTextSplitter()
texts = text_splitter.split_documents(
    docs
)
# Show the first two chunks as the cell output
texts[0:2]

[Document(page_content='[[tr1]] // View  json definition of a Launcher via Advanced Edit\n\n[[tr2]] // Edit  json definition of a Launcher via Advanced Edit\n\nNOTE: This feature is only available in private deployments of Domino Enterprise.\n\nUse the Advanced Launcher Editor to access the JSON representation of a Launcher. This is useful if you want to copy Launcher definitions between projects.\n\n. Go to the Launcher. ifeval::[{version} < 5.3] . Click the gear icon and then click *Edit (Advanced)*. endif::[] ifeval::[{version} >= 5.3] . Click the gear icon and then click *Edit*. . Click *Switch to JSON Edit Mode*. endif::[]', metadata={'source': '/mnt/data/Docs_QA_AIHub/content-reuse/copy-launcher-definitions.adoc'}),
 Document(page_content='To grant other users access to a project, you can add them as collaborators. To add collaborators, you must be a Contributor to the project, or the project Owner.\n\n[[tr5]]\n\n//add by user name\n\n[[tr6]]\n\n//add by email address\n\n\n\nexis

In [7]:
# Report how many chunks the splitter produced
chunk_total = len(texts)
print(f'There are now {chunk_total} documents')

There are now 1182 documents


In [19]:
# The OpenAI key is taken from the environment (None when the variable is unset),
# never hard-coded in the notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# Embedding client used to vectorise the document chunks
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [3]:
# Embed the raw text of every chunk and build a FAISS index from it
chunk_texts = [chunk.page_content for chunk in texts]
store = FAISS.from_texts(chunk_texts, embeddings)

# Serialize the index to disk
with open("faiss_doc_store.pkl", "wb") as store_file:
    pickle.dump(store, store_file)


In [None]:
# Index and store the embeddings locally in a pickle file.
# Building the index sends every chunk through the embeddings client, so reuse
# an index that already exists in this kernel instead of embedding the whole
# corpus a second time. Restart the kernel (or `del store`) to force a rebuild
# after `texts` changes.
if globals().get("store") is None:
    store = FAISS.from_texts([t.page_content for t in texts], embeddings)
with open("faiss_store.pkl", "wb") as f:
    pickle.dump(store, f)


In [10]:
# Prompt that turns a follow-up question plus the chat history into a
# standalone question. Placeholders: {chat_history} and {question}.
_template = "\n".join(
    [
        "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.",
        "",
        "Chat History:",
        "{chat_history}",
        "Follow Up Input: {question}",
        "Standalone question:",
    ]
)

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [11]:
# Answering prompt: instructs the model to answer from the retrieved context
# ({context}) for the user's question ({question}), to admit when it does not
# know, and to decline off-topic questions.
# The product name is spelled "Domino Data Lab" throughout so the prompt is
# internally consistent (the first line previously said "Domino Data Labs").
template = """You are an AI assistant for answering questions about information in Domino Data Lab product documentation.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about AI or ML or data science or MLOps or related to Domino Data Lab, politely inform them that you are tuned to only answer questions about MLOps, data science and Domino Data Lab.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])


In [22]:
def get_chain(vectorstore):
    """Assemble a conversational retrieval QA chain over ``vectorstore``.

    The chain keeps the running dialogue in a ``ConversationBufferMemory``
    stored under the ``chat_history`` key, is given
    ``CONDENSE_QUESTION_PROMPT`` for condensing follow-up questions and
    ``QA_PROMPT`` for answering, and uses an ``OpenAI`` LLM at temperature 0.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    llm = OpenAI(temperature=0)
    retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm,
        retriever,
        memory=chat_memory,
        qa_prompt=QA_PROMPT,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )

In [17]:
# Load the embeddings from the pickle file; change the location if needed.
# The filename must match the one the indexing cell writes ("faiss_doc_store.pkl");
# the previous name "faiss_ddl_doc_store.pkl" is not produced anywhere in this notebook.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# load index files that you created yourself.
if globals().get("store") is None:
    with open("faiss_doc_store.pkl", "rb") as f:
        store = pickle.load(f)
        

In [28]:
# Build the conversational QA chain on top of the index held in `store`
qa = get_chain(vectorstore=store)

In [29]:
# Interactive chat loop.
#   quit()          -> leave the loop
#   clear_history() -> reset the chain's conversation memory
# Token usage of every call made inside the `with` block is accumulated in `cb`.
with get_openai_callback() as cb:
    while True:
        print("Human:")
        try:
            # Strip surrounding whitespace so commands are recognised reliably
            # and whitespace-only input is not sent to the LLM.
            question = input().strip()
        except (EOFError, KeyboardInterrupt):
            # Treat an interrupted/closed prompt as a request to quit so the
            # token total below is still reported.
            break
        command = question.lower()
        if command == "quit()":
            break
        if command == "clear_history()":
            qa.memory.clear()
            continue
        if question:
            print("AI:")
            print(qa.run(question))

print(f"Total Tokens: {cb.total_tokens}")

Human:


 what is a datasource?


AI:

A data source is a structured mechanism to create and manage connection properties to a supported external data service. Domino data sources offer a way to securely store connection properties and access data from databases, cloud storage services, and other external systems. You can create data sources directly when you need access to a specific data source that the administrator might not have set up on the deployment. To learn more about data sources, see [Domino data sources](https://docs.dominodatalab.com/fbb41f/).
Human:


 clear_history()


Human:


 waht is a dataset?


AI:
 A dataset is a collection of files that are available in user executions as a filesystem directory. These files can be used and shared as a file system directory. A Dataset always reflects the most recent version of the data. You can modify the contents of a Dataset through the Domino application or through workload executions, at any time. You can also create a Snapshot with a read-only copy of the Dataset files at a given time.
Human:


 quit()


Total Tokens: 5574
