In [1]:
# One-time environment setup: uncomment the lines below to install the packages
# this notebook relies on (in Jupyter, `%pip` targets the running kernel's environment).
# !pip install langchain
# !pip install pypdf
# !pip install InstructorEmbedding
# !pip install faiss-cpu
# !pip install openai

In [2]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA


# Load PDF Using PyPDFLoader

In [3]:
# PyPDFLoader is just one option here: LangChain ships further loaders
# for PDFs and for other kinds of unstructured data.
loader = PyPDFLoader(file_path="./data/9001408066_B.pdf")
data = loader.load()

In [4]:
# The loader yields one Document per PDF page, so this is the page count
print(len(data), 'document(s) in data')


40 document(s) in data


In [5]:
# Peek at a single page: every Document carries its page_content plus metadata
# (here the source file name and the page number; more fields could be defined).
data[10]

Document(page_content="Laundry    en\n11ZLaundry\nLaundryPreparing the laundry\nCaution!\nDamage to the appliance/fabrics\nForeign objects (e.g. coins, paper clips, \nneedles, nails) may damage the laundry or components in the appliance. \nTherefore, note the following tips when \npreparing your laundry:\n■Empty any pockets.\n■Check for metal items (paper clips, etc.) and remove them.\n■Wash delicates in a laundry bag (tights, underwired bras, etc.).\n■Remove curtain fittings or place curtains in a laundry bag.\n■Close any zips, button up any cover buttons.\n■Brush sand out of pockets and collars.Sorting laundry\nSort your laundry according to the care instructions and manufacturer's information on the care labels:\n■Type of fabric/fibre\n■Colour\nNote:  Laundry can discolour or not \nbe cleaned correctly. Wash white and coloured laundry separately.Wash new bright fabrics separately the first time you wash them.\n■SoilingWash laundry with the same level of soiling together.You can find

# Splitting the documents

In [6]:
# Chunking parameters passed to the splitter below
chunk_size, chunk_overlap = 400, 30

# RecursiveCharacterTextSplitter is used here; LangChain offers other splitters too
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
docs = text_splitter.split_documents(data)

In [7]:
# Number of chunks produced by the splitter
print('Total documents after split:', len(docs))


Total documents after split: 184


In [8]:
# A chunk is still a Document with the same structure as before;
# only its page_content has become shorter because of the splitting.
docs[10]

Document(page_content='Starting the programme  . . . . . . . . . .18Childproof lock . . . . . . . . . . . . . . . . .19Adding/removing laundry . . . . . . . . .19Changing the programme . . . . . . . . .19Cancel the programme . . . . . . . . . . .20Programme end during rinse hold. . .20Programme end  . . . . . . . . . . . . . . . .20Removing laundry/switching off the', metadata={'source': './data/9001408066_B.pdf', 'page': 2})

# Loading embeddings

We use the HuggingFace Instructor embeddings here, but other embedding models could be used instead.

In [9]:
# Instructor embedding model, configured to run on the CPU
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs=dict(device="cpu"),
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [10]:
# Display the embeddings object to inspect how it is configured
embeddings

HuggingFaceInstructEmbeddings(client=INSTRUCTOR(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: T5EncoderModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Normalize()
), model_name='hkunlp/instructor-large', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, embed_instruction='Represent the document for retrieval: ', query_instruction='Represent the question for retrieving supporting documents: ')

# Loading the embeddings into a vector store

For now we use an in-memory FAISS index as the vector store, but this may be swapped out in the future.

In [11]:
# Embed every chunk and build the FAISS vector store from the result
db = FAISS.from_documents(documents=docs, embedding=embeddings)

In [12]:
# Display the vector store object itself
db

<langchain.vectorstores.faiss.FAISS at 0x2a0407e90>

In [13]:
# List everything the vector store exposes: besides several built-in
# similarity-search functions there are methods for adding documents
# and for saving the db locally.
dir(db)

['_FAISS__add',
 '_FAISS__from',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_aembed_documents',
 '_aembed_query',
 '_asimilarity_search_with_relevance_scores',
 '_cosine_relevance_score_fn',
 '_embed_documents',
 '_embed_query',
 '_euclidean_relevance_score_fn',
 '_get_retriever_tags',
 '_max_inner_product_relevance_score_fn',
 '_normalize_L2',
 '_select_relevance_score_fn',
 '_similarity_search_with_relevance_scores',
 'aadd_documents',
 'aadd_texts',
 'add_documents',
 'add_embeddings',
 'add_texts',
 'adelete',
 'afrom_documents',
 'afrom_embeddings',
 'afrom_texts',
 'amax_marginal_relevance

In [14]:
# Optional: uncomment to save the db locally under output/vectordb
# db.save_local('output/vectordb')

In [15]:
# The FAISS db has a built-in similarity search function that we can use.
# The hits are stored in `ss_docs` rather than `docs`, so the split chunks in
# `docs` are not overwritten and the earlier cells stay safe to re-run.
query = "What do I do if water is leaking?"
ss_docs = db.similarity_search(query, k=3)
print(len(ss_docs))
# Print every hit separated by a divider; iterating (instead of indexing
# [0], [1], [2]) also works when fewer than k results come back.
print("\n----\n".join(doc.page_content for doc in ss_docs))

3
Faults and what to do about them    en
27Faults and what to do about them
Faults Cause/Remedy
Water is leaking from the 
machine.■Attach the drain hose correctly or replace it.
■Tighten the screw connection of the drain hose.
The machine is not filling with water.
Detergent is not being dis-
pensed.■Have you pressed the A button?
■Is the tap turned on?
----
immersed in the drained water, water may be sucked back into the appliance and may damage the appliance/fabrics.Make sure that: – The plug does not block drainage 
from the sink.
– The end of the drain hose is not 
immersed in the drained water.
– The water drains away quickly 
enough.Secure the outlet hose so that it
----
damage. Do not open the washing machine door if water can be seen through the glass.If the laundry has to be removed, the 
washing machine door can be opened as follows:1.Switch off the appliance.
2.Disconnect the mains plug.
3.Drain the water. ~ Page 23
4.Pull the emergency release 
downwards with a tool and re

# Create the LLM Interface

Now that we have the results from our similarity search, we use these and add them to our prompt to send to the LLM

For now we use the OpenAI API to get started quickly, but this will be replaced by a different LLM (API or local) in the future.

In [31]:
# For now we mainly use the default settings.
# No key is passed here on purpose: the wrapper picks it up from the
# OPENAI_API_KEY environment variable, so no credential (or empty placeholder)
# lives in the notebook. Before starting Jupyter run:
#     export OPENAI_API_KEY='<your key>'
# TODO add env file for OpenAI Key
# NOTE(review): text-davinci-003 is a legacy completion model - confirm it is still served before re-running.
model_llm = OpenAI(model="text-davinci-003")

# Setting up our chain

A Chain in LangChain is something that glues together operations.

There is a simple sequential chain, but there are also targeted chains, for instance for QA.

In [32]:
# Build the question-answering chain.
# LangChain has several chain types; load_qa_chain and RetrievalQA are quite
# similar. With load_qa_chain the documents are handed over when the chain is run.
qa = load_qa_chain(model_llm, chain_type="stuff")

# Alternative: RetrievalQA is given a retriever instead
# (db.as_retriever() takes some default settings for the retriever)
# qa = RetrievalQA.from_chain_type(llm=model_llm,
#                                  chain_type="stuff",
#                                  retriever=db.as_retriever())

In [33]:
# Retrieve the three chunks most similar to the question
query = "What do I do if water is leaking?"
ss_docs = db.similarity_search(query=query, k=3)
print(len(ss_docs))

3


In [34]:
# Ask the LLM the question, passing the retrieved chunks as input documents
qa.run(question=query, input_documents=ss_docs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


' Attach the drain hose correctly or replace it, and tighten the screw connection of the drain hose. Make sure that the plug does not block drainage from the sink, and that the end of the drain hose is not immersed in the drained water.'