-
Notifications
You must be signed in to change notification settings - Fork 5
/
doc_finder.py
65 lines (52 loc) · 1.95 KB
/
doc_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
from dotenv import load_dotenv
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb
# Read variables from a local .env file into the process environment before
# any of the clients below are constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()
def load_chunk_persist_pdf(
    pdf_folder_path: str = "D:\\diptiman\\dataset\\consent_forms_cleaned",
    persist_directory: str = "D:\\testing_space\\chroma_store\\",
) -> Chroma:
    """Load every PDF in a folder, chunk the text, and persist its embeddings.

    Args:
        pdf_folder_path: Directory scanned (non-recursively) for files whose
            name ends in ``.pdf``. Defaults to the path the script
            originally hard-coded.
        persist_directory: Directory passed to Chroma as its on-disk store.
            Defaults to the path the script originally hard-coded.

    Returns:
        The Chroma vector store built from the chunked documents, after
        ``persist()`` has been called on it.
    """
    documents = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            loader = PyPDFLoader(os.path.join(pdf_folder_path, file))
            documents.extend(loader.load())

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)

    # The previous check was inverted: it tried to create the collection only
    # when collections already existed and reported "already exists" when
    # there were none. get_or_create_collection is idempotent either way.
    # NOTE(review): this collection is not passed to Chroma.from_documents
    # below, so the vector store does not appear to use it — confirm whether
    # it is still needed.
    client = chromadb.Client()
    client.get_or_create_collection("consent_collection")

    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
    vectordb.persist()
    return vectordb
def create_agent_chain():
    """Build the question-answering chain used to answer queries.

    Returns:
        The chain produced by ``load_qa_chain`` with ``chain_type="stuff"``,
        driven by a ``ChatOpenAI`` model named ``gpt-3.5-turbo``.
    """
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo")
    return load_qa_chain(chat_model, chain_type="stuff")
def get_llm_response(query):
    """Answer *query* from the indexed PDFs.

    Runs a similarity search against the vector store and passes the
    matching documents, together with the query, to the QA chain.

    Args:
        query: The question text to search for and answer.

    Returns:
        The value returned by ``chain.run`` for the matching documents.
    """
    # Streamlit re-executes the whole script on each interaction, so building
    # the store here unconditionally re-read every PDF and re-embedded every
    # chunk for every single query. Keep it in session state so it is built
    # once per browser session.
    # NOTE(review): if the installed Streamlit provides st.cache_resource,
    # that would share the store across sessions as well — confirm version.
    cache_key = "_doc_finder_vectordb"
    if cache_key not in st.session_state:
        st.session_state[cache_key] = load_chunk_persist_pdf()
    vectordb = st.session_state[cache_key]

    chain = create_agent_chain()
    matching_docs = vectordb.similarity_search(query)
    return chain.run(input_documents=matching_docs, question=query)
# Streamlit UI
# ===============
# Renders a single text box and a "Generate" button; on click, the query is
# answered via get_llm_response and written to the page.
st.set_page_config(page_title="Doc Searcher", page_icon=":robot:")
st.header("Query PDF Source")
form_input = st.text_input('Enter Query')
submit = st.button("Generate")
if submit:
    # Skip the retrieval/LLM round trip when nothing was typed; the query is
    # otherwise passed through unchanged.
    if form_input.strip():
        st.write(get_llm_response(form_input))
    else:
        st.warning("Please enter a query first.")