In [1]:
import json
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd

In [2]:
#load document
file_path = 'data/doc2dial/doc2dial_qa_train.csv'
loader = CSVLoader(file_path)
document = loader.load()
print(f'documents:{len(document)}')

documents:21998


In [3]:
#load qa pairs
file_path = 'data/doc2dial/doc2dial_qa_train.csv'
df = pd.read_csv(file_path)

In [4]:
doc1= df.loc[0]

In [5]:
doc1['domain']

'dmv'

In [6]:
file_path = 'data/doc2dial/doc2dial_doc.json'
with open(file_path, 'r') as f:
    doc2dial_doc = json.load(f)

In [7]:
doc1_text = doc2dial_doc['doc_data'][doc1['domain']][doc1['doc_id']]['doc_text']
print(doc1_text)

Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them. 

1. Forgetting to Update Address 
By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. It is not sufficient to only: write your new address on the back of your old license; tell the United States Postal Service; or inform the police officer writing you a ticket. If you fail to keep your address current , you will miss a suspension order and may be charged with operating an unregistered vehicle and/or aggravated unlicensed operation, both misdemeanors. This really happens , but the good news is this is a problem tha

# embedding

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.docstore.document import Document

In [46]:
def split(document):
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 0
    )
    # remove empty documents
    split_documents = text_splitter.split_documents(document)
    print(f'documents:{len(split_documents)}')
    return split_documents

def embedding(documents) -> FAISS:
    #load embeddings
    embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/gtr-t5-base')
    db = FAISS.from_documents(documents, embeddings)
    return db

def save_db(db):
    db.save_local("data/faiss_index")

def load_db(embeddings):
    new_db = FAISS.load_local("data/faiss_index", embeddings)
    return new_db

In [47]:
document = Document(page_content=doc1_text, metadata={"source": doc1['doc_id']})

In [48]:
split_documents = split([document])

documents:17


In [49]:
split_documents

[Document(page_content='Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them.', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='1. Forgetting to Update Address', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. It is not sufficient to only: write your new address on the back of your old license; tell the United States Postal Service; or inform the police officer writing you a ticket. If you fail to keep your address current , y

In [50]:
db = embedding(split_documents)

In [20]:
save_db(db)

In [107]:
query = "Can I do my DMV transactions online?"
docs = db.similarity_search(query)
docs

[Document(page_content='About ten percent of customers visiting a DMV office do not bring what they need to complete their transaction, and have to come back a second time to finish their business. This can be as simple as not bringing sufficient funds to pay for a license renewal or not having the proof of auto insurance required to register a car. Better yet , don t visit a DMV office at all, and see if your transaction can be performed online, like an address change, registration renewal, license renewal, replacing', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them.', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(

In [43]:
doc1['references']

"[{'sp_id': '6', 'label': 'solution'}, {'sp_id': '7', 'label': 'solution'}]"

In [112]:
doc2dial_doc['doc_data'][doc1['domain']][doc1['doc_id']]['spans']['56']

{'id_sp': '56',
 'tag': 'u',
 'start_sp': 4496,
 'end_sp': 4527,
 'text_sp': 'Sign up or log into MyDMV [6 ] ',
 'title': '5. Not Bringing Proper Documentation to DMV Office',
 'parent_titles': [],
 'id_sec': '15',
 'start_sec': 4496,
 'text_sec': 'Sign up or log into MyDMV [6 ] ',
 'end_sec': 4527}

# answer agent

In [69]:
from langchain import VectorDBQA
from langchain.chains import qa_with_sources
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import os


In [90]:
#read from txt file
with open('api.txt', 'r') as f:
    lines = f.readlines()
    lines = [line.strip().split(":")[1] for line in lines]
    openai_api = lines[0]
    hf_api = lines[1]

In [97]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api
os.environ["OPENAI_API_KEY"] = openai_api

In [110]:
qa_model = HuggingFaceHub(repo_id='google/flan-t5-large')
query = "Please answer the following question.\n"+"Can I do my DMV transactions online?"
chain = load_qa_chain(llm=qa_model, chain_type="stuff")
chain.run(input_documents=docs, question=query, raw_response=True)

'Yes'

In [111]:
docs

[Document(page_content='About ten percent of customers visiting a DMV office do not bring what they need to complete their transaction, and have to come back a second time to finish their business. This can be as simple as not bringing sufficient funds to pay for a license renewal or not having the proof of auto insurance required to register a car. Better yet , don t visit a DMV office at all, and see if your transaction can be performed online, like an address change, registration renewal, license renewal, replacing', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them.', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(

In [71]:
query

'Please answer the following question.\nHello, I forgot o update my address, can you help me with that?'

In [72]:
docs

[Document(page_content='1. Forgetting to Update Address', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. It is not sufficient to only: write your new address on the back of your old license; tell the United States Postal Service; or inform the police officer writing you a ticket. If you fail to keep your address current , you will miss a suspension order and may be', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='possible mail correspondence can reach you. Also , turning in your plates is important to avoid an insurance lapse.', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='receive their DRA assessment because we do 

In [85]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

chain = load_qa_with_sources_chain(qa_model, chain_type="map_reduce")
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1666 > 1024). Running this sequence through the model will result in indexing errors


{'output_text': 'SOURCES: Top 5 DMV Mistakes and How to Avoid Them#3'}

In [103]:
from langchain import OpenAI

query = "Please answer the following question using the given documents only.\n"+doc1['question']
openai_model = OpenAI(model_name="text-davinci-003", max_tokens=1024)
chain = load_qa_chain(llm=qa_model, chain_type="stuff")
chain.run(input_documents=docs, question=query, raw_response=True)

'Yes'

In [104]:
query

'Please answer the following question using the given documents only.\nHello, I forgot o update my address, can you help me with that?'

In [105]:
docs

[Document(page_content='1. Forgetting to Update Address', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. It is not sufficient to only: write your new address on the back of your old license; tell the United States Postal Service; or inform the police officer writing you a ticket. If you fail to keep your address current , you will miss a suspension order and may be', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='possible mail correspondence can reach you. Also , turning in your plates is important to avoid an insurance lapse.', metadata={'source': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}),
 Document(page_content='receive their DRA assessment because we do 