# 1. Installing Reqiured Packages and Setting Up OpenAI Environment

In [56]:
!pip install langchain
!pip install faiss-cpu
!pip install openai
!pip install unstructured
!pip install tiktoken



In [57]:
# Setting up the OpenAI Environment using Private Key
import os
os.environ["OPENAI_API_KEY"] = "YOUR_PRIVATE_KEY"

# 2. Data Collection and Processing for LLM

In [58]:
# provide list of URLS to grab data from
urls = [
    'https://www.ecfmg.org/certification/',
    'https://www.ecfmg.org/psv/',
    'https://www.ecfmg.org/cvs/',
    'https://www.ecfmg.org/eras/',
    'https://www.ecfmg.org/evsp/about.html'
]

In [59]:
# Use langchain to load the data from URLS
from langchain.document_loaders import UnstructuredURLLoader
loaders = UnstructuredURLLoader(urls=urls)
data = loaders.load()

In [60]:
# split the data in smaller documents to be processed by LLM
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(separator='\n',
                                      chunk_size=1000,
                                      chunk_overlap=300)


docs = text_splitter.split_documents(data)



In [61]:
# checking the number of smaller document
print(len(docs))

43


In [62]:
# setting up the embeddings environment
import pickle
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={})

In [63]:
# creating a vector store of embeddings from the documents and saving the vector store to drive
import pickle

# Define the path where you want to save the file
file_path = "/content/drive/MyDrive/Data Science/Data Science Projects/5. Final Project/4. Website ChatBot/faiss_store_openai.pkl"

vectorStore_openAI = FAISS.from_documents(docs, embeddings)

with open(file_path, "wb") as f:
    pickle.dump(vectorStore_openAI, f)

In [64]:
# Loading the vector store
file_path = "/content/drive/MyDrive/Data Science/Data Science Projects/5. Final Project/4. Website ChatBot/faiss_store_openai.pkl"

with open(file_path, "rb") as f:
    VectorStore = pickle.load(f)

VectorStore

<langchain.vectorstores.faiss.FAISS at 0x7e54866b42b0>

# 3. Setting up the LLM

In [65]:
# initializing the LLM
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

llm=OpenAI(temperature=0)

llm

OpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.completion.Completion'>, model_name='text-davinci-003', temperature=0.0, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0, n=1, best_of=1, model_kwargs={}, openai_api_key='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', openai_api_base='', openai_organization='', openai_proxy='', batch_size=20, request_timeout=None, logit_bias={}, max_retries=6, streaming=False, allowed_special=set(), disallowed_special='all', tiktoken_model_name=None)

In [66]:
# Create a chain for Retrieval-based Question Answering with Sources
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=VectorStore.as_retriever())

# 4. Interacting with the LLM

In [68]:
# chatbot function
def chatbot_interface():
    while True:
        print("   ")
        user_question = input("You: ")  # Get the user's question
        if user_question.lower() == 'exit':
            print("WebChatMate: Goodbye!")
            break

        # Perform question-answering
        response = chain({"question": user_question}, return_only_outputs=True)

        # Get the answer from the response and remove the newline character

        answer = response['answer'].replace('\n', '')
        # Provide the answer to the user
        print("   ")
        print("WebChatMate:",answer)


welcome_text = "Welcome to <b>WebChatMate</b>, your personalized web chatbot"

print("Welcome to WebChatMate, your Conversational URL Companion")
print("   ")
print("Ask me a question or type 'exit' to quit.")

chatbot_interface()

Welcome to WebChatMate, your Conversational URL Companion
   
Ask me a question or type 'exit' to quit.
   
You: Who is an IMG?
   
WebChatMate:  An IMG is an individual who received their basic medical degree from a medical school located outside the United States and Canada.
   
You: What is the ECFMG Certification process?
   
WebChatMate:  The ECFMG Certification process requires applicants to submit an Application for ECFMG Certification, pass Step 1 and Step 2 Clinical Knowledge (CK) of the USMLE, and confirm that their medical school meets ECFMG requirements. Individuals should begin the ECFMG certification process by requesting a USMLE/ECFMG Identification Number through ECFMG’s Interactive Web Applications (IWA).
   
You: what is psv
   
WebChatMate:  PSV stands for the Physicians Skills Verification program.
   
You: exit
WebChatMate: Goodbye!
