In [1]:
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [2]:
base_dir = "pdfs"
db_dir = "./chroma_db"
st_embedding_model = "all-MiniLM-L6-v2"
openai_api_key = "YOUR KEY HERE"
open_ai_llm_model_name = "gpt-3.5-turbo"

In [3]:
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

In [5]:
def split_docs(docs, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(docs)
    return docs

In [6]:
documents = []

In [7]:
for file in files:
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    pages = split_docs(pages)
    documents.extend(pages)

invalid pdf header: b'\x00\x05\x16\x07\x00'
EOF marker not found


PdfStreamError: Stream has ended unexpectedly

In [12]:
embeddings = SentenceTransformerEmbeddings(model_name=st_embedding_model)

In [12]:
Chroma.from_documents(documents, embeddings, persist_directory=db_dir)

<langchain.vectorstores.chroma.Chroma at 0x28996cbd0>

In [13]:
db = Chroma(persist_directory=db_dir, embedding_function=embeddings)

In [14]:
query = "what happen leadsulphide treated with oxygen"

In [15]:
llm = ChatOpenAI(model_name=open_ai_llm_model_name)

In [16]:
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)

matching_docs = db.similarity_search(query)
answer =  chain.run(input_documents=matching_docs, question=query)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
→
o ourless
However, the bleached product (colourless) is allowed to stand in air, it is reoxidised 
by atmospheric oxygen to its original colour. Hence bleaching action of sulphur dioxide is temporary. 
Uses: 
1. Sulphur dioxide is used in bleaching hair, silk, wool etc...
2. It can be used for disinfecting crops and plants in agriculture.
Structure of sulphur dioxide:
In sulphur dioxide, sulphur 
atom undergoes sp2 hybridisation. 
A double bond arises between S 
and O due to pπ- dπ overlapping.
3.2.3 Sulphuric acid: (H2SO4) 
Preparation:
Sulphuric acid can be manufactured by lead chamber process, cascade process or contact 
process. Here we discuss the contact process

In [17]:
answer

'When lead sulfide (PbS) is treated with oxygen (O2), it undergoes oxidation and forms lead oxide (PbO) and sulfur dioxide (SO2) as products. The balanced chemical equation for this reaction is:\n\n2PbS + 3O2 -> 2PbO + 2SO2\n\nSo, lead sulfide reacts with oxygen to produce lead oxide and sulfur dioxide.'

In [18]:
for doc in matching_docs:
    print(doc.metadata)

{'page': 86, 'source': 'pdfs/12th_Chemistry_Vol-1_EM - www.tntextbooks.in.pdf'}
{'page': 86, 'source': 'pdfs\\12th_Chemistry_Vol-1_EM - www.tntextbooks.in.pdf'}
{'page': 86, 'source': 'pdfs\\12th_Chemistry_Vol-1_EM - www.tntextbooks.in.pdf'}
{'page': 104, 'source': 'pdfs/12th_Chemistry_Vol-1_EM - www.tntextbooks.in.pdf'}
