# chatbot_for_files.py
# Import required libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
    UnstructuredWordDocumentLoader,
    PyMuPDFLoader,
    UnstructuredFileLoader,
)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone, Chroma
from langchain.chains import ConversationalRetrievalChain
from colorama import init, Fore, Style
import os
import pinecone
# Set up the OpenAI API key (from .bashrc, Windows environment variables, or .env)
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
# Set up the Pinecone environment. These variables are optional: main() falls
# back to Chroma when no Pinecone API key is available, so a missing variable
# must not crash the script at import time.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', '')
if PINECONE_API_KEY:
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
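# Optional sketch: if the keys live in a .env file (as the comment above
# suggests), the python-dotenv package can load them into the environment.
# This is an assumption, not a requirement: it only applies when python-dotenv
# is installed, and the two lines would need to run before the os.environ
# lookups above.
#   from dotenv import load_dotenv
#   load_dotenv()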

def load_files():
    file_path = "./docs/"
    all_texts = []
    n_files = 0
    n_char = 0
    n_texts = 0
    # chunk_overlap=50 means consecutive chunks share ~50 characters, so an
    # answer that straddles a chunk boundary is still retrievable.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, chunk_overlap=50
    )
    for filename in os.listdir(file_path):
        file = os.path.join(file_path, filename)
        if os.path.isfile(file):
            if file.endswith(".docx"):
                loader = UnstructuredWordDocumentLoader(file)
            elif file.endswith(".pdf"):
                loader = PyMuPDFLoader(file)
            else:  # assume a plain-text format and attempt to load it
                loader = UnstructuredFileLoader(file)
            data = loader.load()
            texts = text_splitter.split_documents(data)
            n_files += 1
            # Count characters across every loaded document; PyMuPDFLoader
            # returns one document per PDF page, so data[0] alone undercounts.
            n_char += sum(len(doc.page_content) for doc in data)
            n_texts += len(texts)
            all_texts.extend(texts)
    print(
        f"Loaded {n_files} file(s) with {n_char} characters, and split into {n_texts} split-documents."
    )
    return all_texts, n_texts

def ingest(all_texts, use_pinecone, embeddings, pinecone_index_name, chroma_collection_name, persist_directory):
    if use_pinecone:
        docsearch = Pinecone.from_texts(
            [t.page_content for t in all_texts], embeddings,
            index_name=pinecone_index_name)  # add namespace=pinecone_namespace if provided
    else:
        docsearch = Chroma.from_documents(
            all_texts, embeddings, collection_name=chroma_collection_name,
            persist_directory=persist_directory)
    return docsearch
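
# Note on ingest() above: Pinecone.from_texts indexes only the raw
# page_content strings, so each chunk's source metadata is dropped, while
# Chroma.from_documents keeps it. chatbot_loop() below only prints a source
# path when that metadata survives, so Pinecone users will see content-only
# sources unless the call is extended to pass metadatas as well.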

def setup_retriever(docsearch, k):
    # k controls how many of the most similar chunks are handed to the LLM.
    retriever = docsearch.as_retriever(
        search_type="similarity", search_kwargs={"k": k}, include_metadata=True)
    return retriever

def setup_docsearch(use_pinecone, pinecone_index_name, embeddings, chroma_collection_name, persist_directory):
    if use_pinecone:
        # Load the pre-created Pinecone index, which has already been stored
        # in pinecone.io as long-term memory.
        if pinecone_index_name in pinecone.list_indexes():
            docsearch = Pinecone.from_existing_index(
                pinecone_index_name, embeddings)  # add namespace=pinecone_namespace if provided
            index_client = pinecone.Index(pinecone_index_name)
            # Get the index information
            index_info = index_client.describe_index_stats()
            namespace_name = ''
            n_texts = index_info['namespaces'][namespace_name]['vector_count']
        else:
            raise ValueError('''Cannot find the specified Pinecone index.
                Create one at pinecone.io or with
                pinecone.create_index(
                    name=index_name, dimension=1536, metric="cosine", shards=1)''')
    else:
        docsearch = Chroma(persist_directory=persist_directory, embedding_function=embeddings,
                           collection_name=chroma_collection_name)
        n_texts = docsearch._client._count(
            collection_name=chroma_collection_name)
    return docsearch, n_texts
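
# Note on setup_docsearch() above: _client._count is a private chromadb API
# and may change between versions. A sketch of an alternative, assuming the
# LangChain Chroma wrapper exposes its chromadb collection as _collection
# (also private):
#   n_texts = docsearch._collection.count()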

def chatbot_loop(CRqa):
    # Get a response from the retriever based on the user input and chat history
    def get_response(query, chat_history):
        result = CRqa({"question": query, "chat_history": chat_history})
        return result['answer'], result['source_documents']
    # Initialize chat history
    chat_history = []
    # Initialize colorama so colored output also works on Windows
    init()
    # Start the chat loop
    while True:
        # Get user input
        query = input(
            Fore.GREEN + "Enter your question; enter 'exit' to exit: " + Style.RESET_ALL)
        if query.lower() == 'exit':
            break
        # Generate a reply based on the user input and chat history
        reply, source = get_response(query, chat_history)
        print(Fore.RED + "Bot: " + Style.RESET_ALL, reply)
        # Print the two most relevant sources
        # (only the first 400 characters of each, for brevity)
        for i, source_i in enumerate(source[:2]):
            page_content = source_i.page_content[:400]
            if source_i.metadata:
                metadata_source = source_i.metadata['source']
                print(Fore.GREEN + "Source " + str(i + 1) + " info and content: " +
                      Style.RESET_ALL, metadata_source, ": ", page_content)
            else:
                print(Fore.GREEN + "Source " + str(i + 1) + " content: " +
                      Style.RESET_ALL, page_content)
        # Update the chat history with the user input and system response
        chat_history.append((query, reply))

def setup_em_llm(OPENAI_API_KEY):
    # Set up OpenAI embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    # Use the OpenAI chat model gpt-3.5-turbo.
    # Keep the temperature at 0 if you do not want it to make things up.
    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True,
                     openai_api_key=OPENAI_API_KEY)
    return embeddings, llm
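
# Sketch: a different chat model can be substituted in setup_em_llm(), e.g.
# model_name="gpt-4" (assuming the account has access to it). Note that
# streaming=True only streams tokens when a streaming callback handler is
# attached to the model; chatbot_loop() prints each reply in full either way.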

def setup_pinecone():
    pinecone_index_name = input(
        Fore.GREEN + "Enter your Pinecone index: " + Style.RESET_ALL)
    return pinecone_index_name

def setup_chroma():
    chroma_collection_name = input(
        Fore.GREEN +
        'Not using Pinecone or empty Pinecone API key provided. Using Chroma. Enter Chroma collection name: ' + Style.RESET_ALL)
    persist_directory = "./vectorstore"
    return chroma_collection_name, persist_directory

def main():
    pinecone_index_name = ''
    chroma_collection_name = ''
    persist_directory = ''
    # Set up the OpenAI API key (from .bashrc, Windows environment variables, or .env)
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
    embeddings, llm = setup_em_llm(OPENAI_API_KEY)
    # Ask the user whether to use Pinecone or not
    r = input(
        Fore.GREEN + 'Do you want to use Pinecone index? (y/n): ' + Style.RESET_ALL)
    if r.lower() == 'y' and PINECONE_API_KEY != '':
        use_pinecone = True
        pinecone_index_name = setup_pinecone()
    else:
        use_pinecone = False
        chroma_collection_name, persist_directory = setup_chroma()
    # Ask the user whether to ingest files or use an existing vector store
    r = input(Fore.GREEN +
              'Do you want to ingest the file(s) in ./docs/? (y/n): ' + Style.RESET_ALL)
    if r.lower() == 'y':
        all_texts, n_texts = load_files()
        docsearch = ingest(all_texts, use_pinecone, embeddings, pinecone_index_name,
                           chroma_collection_name, persist_directory)
    else:
        print('No data will be ingested. Make sure the Pinecone index or Chroma collection name you provided contains data.')
        docsearch, n_texts = setup_docsearch(use_pinecone, pinecone_index_name,
                                             embeddings, chroma_collection_name, persist_directory)
    # Number of sources (split-documents) to retrieve per question,
    # capped by how many chunks the store actually holds.
    k = min(20, n_texts)
    retriever = setup_retriever(docsearch, k)
    CRqa = ConversationalRetrievalChain.from_llm(
        llm, retriever=retriever, return_source_documents=True)
    chatbot_loop(CRqa)

if __name__ == "__main__":
    main()
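
# Illustrative session (values are examples only):
#   Do you want to use Pinecone index? (y/n): n
#   Not using Pinecone or empty Pinecone API key provided. Using Chroma. Enter Chroma collection name: mydocs
#   Do you want to ingest the file(s) in ./docs/? (y/n): y
#   Loaded 3 file(s) with 12345 characters, and split into 42 split-documents.
#   Enter your question; enter 'exit' to exit: What are these documents about?
#   Bot: ...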