Website Summarizer: crawls a site with LangChain's RecursiveUrlLoader, indexes the pages in a Chroma vector store, and answers questions about them with OpenAI GPT-4o.

In [None]:
%pip install -qU langchain-community beautifulsoup4 lxml

In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

# imports for langchain

from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from langchain_community.document_loaders import RecursiveUrlLoader
import re

from bs4 import BeautifulSoup


In [2]:
# Chat model name and the directory used as Chroma's persist_directory.
MODEL = "gpt-4o"
db_name = "vector_db"

# Read settings from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Guarantee that OPENAI_API_KEY is defined: keep the existing value if there is
# one, otherwise fall back to the placeholder string.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [3]:
def bs4_extractor(html: str) -> str:
    """Extract the text of an HTML document with blank-line runs collapsed.

    The markup is parsed with the ``lxml`` parser, its text is taken as one
    string, every run of two or more newlines is reduced to exactly two, and
    leading/trailing whitespace is removed.

    Args:
        html: Raw HTML markup.

    Returns:
        The cleaned-up text of the document.
    """
    page_text = BeautifulSoup(html, "lxml").get_text()
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()


In [4]:
def prepareLLM(website_url):
    """Crawl a website and build a conversational retrieval chain over it.

    Pages are loaded with ``RecursiveUrlLoader`` starting at ``website_url``
    (text extracted by ``bs4_extractor``), split into overlapping chunks,
    embedded with ``OpenAIEmbeddings`` and written to a Chroma store persisted
    under ``db_name``. If that directory already exists, its collection is
    deleted before the new documents are written.

    Args:
        website_url: Root URL to start crawling from.

    Returns:
        A ``ConversationalRetrievalChain`` combining the ``MODEL`` chat model,
        a retriever over the new vector store and a fresh conversation memory.

    Raises:
        ValueError: If the crawl yields nothing to index.
    """
    loader = RecursiveUrlLoader(website_url, extractor=bs4_extractor)
    docs = loader.load()
    print(f"Loaded {len(docs)} documents")

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(docs)
    print(f"Loaded {len(chunks)} chunks")

    # Fail fast, and before the existing collection is deleted, so a bad URL
    # produces a clear error instead of wiping the store and failing later.
    if not chunks:
        raise ValueError(
            f"No text content could be loaded from {website_url!r}; "
            "check the URL and try again."
        )

    embeddings = OpenAIEmbeddings()

    # Delete the previous collection, if any, before writing the new documents.
    if os.path.exists(db_name):
        Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

    # Create the vector store from the freshly crawled chunks.
    vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
    print(f"Vectorstore created with {vectorstore._collection.count()} documents")

    # Chat model used to generate answers (model name comes from MODEL).
    llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

    # Conversation memory, stored under the 'chat_history' key as message objects.
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

    # The retriever is the abstraction over the vector store used during RAG.
    retriever = vectorstore.as_retriever()

    # Put it together: chat model + retriever + memory.
    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

    return conversation_chain

In [5]:
# Module-level cache shared across chat() calls: the URL that was last indexed
# and the conversation chain built for it. Both start out unset.
website_global = conversational_chain_global = None

In [6]:
def chat(website, question):
    """Answer ``question`` about ``website``, reusing the cached chain when possible.

    A new conversation chain is built with ``prepareLLM`` only when a
    non-blank URL is given that differs from the cached one (or no chain
    exists yet). A blank URL means "keep using the current website", which is
    what the UI label promises for every submit after the first.

    Args:
        website: URL of the site to ask about; may be blank or ``None`` once a
            site has been indexed.
        question: The user's question.

    Returns:
        The chain's answer text, or a short prompt asking for a URL when no
        website has been indexed yet.
    """
    global website_global
    global conversational_chain_global

    # Normalise the URL so stray whitespace or an empty textbox is not
    # mistaken for a different website.
    requested_site = (website or "").strip()

    needs_rebuild = requested_site and (
        requested_site != website_global or conversational_chain_global is None
    )
    if needs_rebuild:
        # Build first and update the cache afterwards, so a failed crawl
        # leaves the previously indexed site usable.
        conversation_chain = prepareLLM(requested_site)
        website_global = requested_site
        conversational_chain_global = conversation_chain
    elif conversational_chain_global is None:
        # Blank URL and nothing indexed yet: there is nothing to query.
        return "Please enter a website URL before asking a question."

    result = conversational_chain_global.invoke({"question": question})
    return result['answer']

In [7]:
# Gradio page: a URL box and a question box feed chat() when the button is
# clicked, and the returned text is shown in the response box.
with gr.Blocks() as ui:
    website = gr.Textbox(label="Website URL (Only required for the first submit)")
    question = gr.Textbox(label="Your Question")
    submit = gr.Button(value="Submit")
    answer = gr.Textbox(label="Response")

    # Component values are passed to chat() in the order listed in `inputs`.
    submit.click(
        chat,
        inputs=[website, question],
        outputs=[answer],
    )

In [None]:
# Start the Gradio app for the `ui` Blocks interface.
ui.launch()