In [68]:
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF

# URL of the arXiv listing page that links to the PDFs.
# `show=1` limits the listing to a single paper; raise it to scrape more.
webpage_url = 'https://arxiv.org/list/cs.CV/recent?skip=1&show=1'

# Fetch the listing page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed below as if it were a real listing.
response = requests.get(webpage_url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# The first <h3> on the page is used as the publication date for every paper.
date_element = soup.find('h3')
publication_date = date_element.get_text(strip=True) if date_element else 'Unknown date'

# Collect title, authors and PDF URL of every listed paper
base_url = 'https://arxiv.org'
papers = []

# Each <dl> holds a group of listing entries
entries = soup.find_all('dl')

title_label = 'Title:'

for entry in entries:
    titles = entry.find_all('div', class_='list-title mathjax')
    authors = entry.find_all('div', class_='list-authors')
    pdf_links = entry.find_all('a', title='Download PDF')

    # NOTE(review): zip() pairs the three lists purely by position, so an entry
    # without a PDF link would shift every later title/author/link pairing.
    for title, author, pdf_link in zip(titles, authors, pdf_links):
        # Drop only the leading "Title:" label; a title that itself contains
        # the text "Title:" is left intact.
        paper_title = title.get_text(strip=True)
        if paper_title.startswith(title_label):
            paper_title = paper_title[len(title_label):]
        paper_title = paper_title.strip()

        paper_authors = [a.get_text(strip=True) for a in author.find_all('a')]
        paper_pdf_url = base_url + pdf_link['href']
        papers.append({
            'title': paper_title,
            'authors': paper_authors,
            'pdf_url': paper_pdf_url,
            'date': publication_date
        })

def extract_text_from_pdf(paper, timeout=30):
    """Download a paper's PDF and return its text behind a metadata header.

    Args:
        paper: dict with the keys 'title', 'date', 'authors' (a list of
            strings) and 'pdf_url'.
        timeout: seconds to wait on the PDF request before giving up.

    Returns:
        str: a header naming the publication date, title and authors, followed
        by the text of every page. This is best-effort: if the download or the
        parsing fails, the text gathered so far is kept and an
        "Error processing PDF at ..." message is appended instead of raising.

    Raises:
        KeyError: if `paper` lacks one of the keys needed for the header.
    """
    parts = [
        f"publication date of {paper['title']} is {paper['date']}\nTitle {paper['title']}\nAuthors of {paper['title']} {', '.join(paper['authors'])}\n\n"
    ]
    try:
        # Fetch the whole PDF into memory; the timeout prevents an indefinite
        # hang and raise_for_status() keeps an HTTP error page from being
        # handed to the PDF parser.
        response = requests.get(paper['pdf_url'], timeout=timeout)
        response.raise_for_status()

        # The context manager closes the document even if a page fails.
        with fitz.open(stream=response.content, filetype="pdf") as document:
            for page in document:
                parts.append(page.get_text())

    except Exception as e:
        parts.append(f"Error processing PDF at {paper['pdf_url']}: {e}")

    return "".join(parts)

# Run the extraction for every scraped paper and write each result to the cell
# output, followed by an 80-character rule. `extracted_text` stays bound to the
# text of the last paper processed once the loop ends.
rule = "\n" + "=" * 80 + "\n"
for paper in papers:
    extracted_text = extract_text_from_pdf(paper)
    print(extracted_text, rule, sep="\n")


publication date of BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate Hallucinations is Thu, 4 Jul 2024 (continued, showing 1 of 115 entries )
Title BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate Hallucinations
Authors of BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate Hallucinations Zhantao Yang, Ruili Feng, Keyu Yan, Huangji Wang, Zhicai Wang, Shangwen Zhu, Han Zhang, Jie Xiao, Pingyu Wu, Kai Zhu, Jixuan Chen, Chen-Wei Xie, Chaojie Mao, Yue Yang, Hongyang Zhang, Yu Liu, Fan Cheng

BACON: Supercharge Your VLM with Bag-of-Concept Graph
to Mitigate Hallucinations
Zhantao Yang1,2⋆, Ruili Feng2⋆⋄, Keyu Yan2, Huangji Wang1, Zhicai Wang2
Shangwen Zhu1, Han Zhang1,2, Jie Xiao2, Pingyu Wu2, Kai Zhu2, Jixuan Chen2
Chen-Wei Xie2, Chaojie Mao2, Yue Yang3, Hongyang Zhang4, Yu Liu2, Fan Cheng1†
1Shanghai Jiao Tong University, 2Alibaba group
3University of Pennsylvania, 4University of Waterloo
https://ztyang23.github.io/bacon-page
Abstract
This p

In [69]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE(review): `extracted_text` is whatever the extraction loop in the previous
# cell assigned last, so only the final paper's text is indexed here. Collect
# all texts if more than one paper is scraped.
extracted_texts = [extracted_text]

# Splitter producing chunks of up to 350 characters with a 35-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=35)

# Flatten the chunks of every text into a single list of documents
documents = [
    chunk
    for source_text in extracted_texts
    for chunk in text_splitter.split_text(source_text)
]


In [70]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# Embedding model (sentence-transformers/all-mpnet-base-v2, loaded through the
# HuggingFace wrapper). This object is passed to FAISS.from_texts to embed the
# text chunks.
embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



In [71]:
from langchain_community.vectorstores import FAISS
# Build a FAISS vector store from the text chunks in `documents`, embedded with
# `embeddings`. The retriever used by the RAG chain is created from this store.
vector = FAISS.from_texts(documents, embeddings)



In [72]:
from langchain_community.llms import Ollama
# Ollama-backed "llama3" model; it is used both to reformulate questions and to
# generate the final answer.
# NOTE(review): presumably requires a running Ollama server with the llama3
# model available - confirm for the target environment.
llm = Ollama(model="llama3")

In [73]:
# Expose the FAISS store as a retriever. No arguments are passed, so the
# store's default search settings apply.
retriever = vector.as_retriever()

In [74]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# System instruction for the question-rewriting step: the model is asked to
# turn a follow-up question into a standalone one and explicitly NOT to answer it.

instruction_to_system = """
Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is.
"""
# Prompt layout: system instruction, then the prior turns injected through the
# "chat_history" placeholder, then the latest user question.
question_maker_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", instruction_to_system),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


# Chain that rewrites the question: the prompt is piped straight into `llm`.
# NOTE(review): no output parser is attached; the answer printed by the RAG cell
# is plain text, which suggests `llm` already returns a string - confirm.
question_chain = question_maker_prompt | llm  # optionally: | StrOutputParser()

In [75]:
# System prompt for the question-answering step.
# The instructions are kept in one paragraph and the retrieved context is put in
# its own paragraph, so the last instruction is no longer fused directly onto
# the context text. The fallback instruction now says what to do when the
# answer is unknown instead of the contradictory "Do not generate your answer."
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "There might be multiple research papers, so check every document for a better answer. "
    "If you don't know the answer, say that you don't know; do not hallucinate. "
    "Keep the answer concise.\n\n"
    "{context}"
)

# Prompt layout: system prompt (with the retrieved context), then the prior
# turns injected through the "chat_history" placeholder, then the user question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

In [76]:
# Choose what is sent to the retriever: the raw question, or the rewriting chain
def contextualized_question(input: dict):
    """Select the retriever query source for one chain invocation.

    Args:
        input: mapping with a "question" entry and, optionally, a
            "chat_history" entry.

    Returns:
        The value of input["question"] when there is no chat history (key
        missing, None or empty). Otherwise `question_chain` itself is returned,
        not its output.
        NOTE(review): this relies on the surrounding LangChain pipeline running
        a returned runnable with the same input - confirm.
    """
    if not input.get("chat_history"):
        return input["question"]
    return question_chain

In [77]:
from langchain_core.runnables import RunnablePassthrough
# Retrieval step: pass the incoming dict through unchanged and add a "context"
# key. Its value comes from piping the result of `contextualized_question`
# (raw or reformulated question) into `retriever`.
retriever_chain = RunnablePassthrough.assign(
        context=contextualized_question | retriever 
    )

In [78]:
# Retrieval-Augmented Generation (RAG) chain, in three stages:
#   1. retriever_chain - adds the retrieved "context" to the input dict
#   2. qa_prompt       - fills the prompt with context, chat history and question
#   3. llm             - generates the answer
# It is invoked with a dict holding "question" and "chat_history".
rag_chain = (
    retriever_chain
    | qa_prompt
    | llm
)

In [79]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []
question = "Method of enhancing SDXL by BACON "

# Invoke the RAG chain with the question and the (still empty) chat history
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})

# Record both turns. A plain-string answer is wrapped in AIMessage so the
# history marks it as the assistant's turn; appended as a bare string it would
# not carry that role. A value that is not a string is stored unchanged.
ai_turn = AIMessage(content=ai_msg) if isinstance(ai_msg, str) else ai_msg
chat_history.extend([HumanMessage(content=question), ai_turn])
print(ai_msg)

According to the provided context, BACON can significantly assist SDXL in simplifying complex tasks by breaking down complex texts into basic elements.


In [80]:
### Code to automate the scraping process every day (kept commented out)
#import schedule
#import time
#import subprocess

#def job():
#    subprocess.run(["python", "C:\\Path\\To\\YourScript\\scrape.py"])

#schedule.every().day.at("12:00").do(job)

#while True:
#    schedule.run_pending()
#    time.sleep(1)

In [81]:
import gradio as gr
from langchain_core.messages import HumanMessage

# Gradio callback: answer one user message and refresh the chat display
def chat_complete(message, state):
    """Run the RAG chain on `message` and return the updated conversation.

    Args:
        message: text typed by the user.
        state: list of {"role", "content"} dicts holding the earlier turns, or
            None. It is passed to the chain as the chat history and is
            extended in place with the new user and assistant turns.

    Returns:
        tuple: (list of (user text, assistant text) pairs, updated state list).
    """
    history = [] if state is None else state

    # The chain sees the history as it was before this turn is recorded
    answer = rag_chain.invoke({"question": message, "chat_history": history})
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": answer})

    # Pair every user turn with the entry that directly follows it
    pairs = []
    for position, turn in enumerate(history):
        if turn["role"] == "user":
            pairs.append((turn["content"], history[position + 1]["content"]))
    return pairs, history

# Gradio UI: a title, the chat display, a text box for the user's message and a
# SEND button. The conversation is kept in a gr.State that starts as an empty list.
with gr.Blocks() as block:
    gr.Markdown("""<h1><center> EduVisionBot </center></h1>""")
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type your Message.........")
    state = gr.State([])
    submit = gr.Button("SEND")
    
    # Clicking SEND calls chat_complete(message, state); its two return values
    # update the chat display and the stored state.
    submit.click(chat_complete, inputs=[message, state], outputs=[chatbot, state])
# Start the local server; in the captured output it ran until a keyboard interrupt.
# NOTE(review): debug=True presumably keeps the cell blocked while serving - confirm.
block.launch(debug=True)



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


