In [4]:
from urllib.parse import urljoin

import fitz  # PyMuPDF
import requests
from bs4 import BeautifulSoup

# Listing page to scrape for PDF links. Per the original author's note, the
# `show` query parameter controls how many PDFs get extracted; it is kept at 1.
webpage_url = 'https://arxiv.org/list/cs.CV/recent?show=1'

# Fetch the listing page. The timeout keeps a stalled connection from hanging
# the cell indefinitely, and raise_for_status() prevents an HTTP error page
# from being parsed as if it were the listing.
response = requests.get(webpage_url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Site root that relative PDF links are resolved against
base_url = 'https://arxiv.org'

# Collect the target of every <a title="Download PDF"> anchor.
# - href=True skips anchors that carry no href (indexing them would KeyError).
# - urljoin resolves relative paths against base_url and leaves absolute URLs
#   untouched, where plain string concatenation would corrupt them.
pdf_urls = [
    urljoin(base_url, a_tag['href'])
    for a_tag in soup.find_all('a', title='Download PDF', href=True)
]

def extract_text_from_pdf(pdf_url, timeout=30):
    """Download a PDF and return the concatenated text of all its pages.

    This is a best-effort helper: any failure (network error, HTTP error
    status, unparsable PDF) is reported with ``print`` and does not raise.

    Args:
        pdf_url: URL of the PDF to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the caller forever.

    Returns:
        The text of the pages read so far, joined in page order. This is the
        empty string when nothing could be read.
    """
    page_texts = []
    try:
        response = requests.get(pdf_url, timeout=timeout)
        # Fail early on 4xx/5xx instead of feeding an HTML error page to the
        # PDF parser.
        response.raise_for_status()

        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=response.content, filetype="pdf") as document:
            for page in document:
                page_texts.append(page.get_text())

    except Exception as e:
        print(f"Error processing PDF at {pdf_url}: {e}")

    # Joined outside the try block so pages read before a failure are kept.
    return "".join(page_texts)

# Extract the text of every linked PDF. extract_text_from_pdf returns "" when a
# download or parse fails, so empty results are dropped here.
extracted_texts = [
    pdf_text
    for pdf_text in (extract_text_from_pdf(pdf_url) for pdf_url in pdf_urls)
    if pdf_text
]

# `extracted_text` is the name the following cells consume. It now holds the
# text of ALL PDFs (newline-separated) rather than only the last one, and it is
# always defined -- as "" -- even when no PDF link was found.
extracted_text = "\n".join(extracted_texts)


In [5]:
from langchain.text_splitter import CharacterTextSplitter
# Wrap the extracted text in a list so the chunking step iterates over a
# collection of texts.
extracted_texts = [extracted_text]

# Splitter that breaks text on newlines into overlapping chunks; tune
# chunk_size and chunk_overlap to the requirements.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n",
)

# Flatten the chunks of every text into a single list of documents.
documents = [
    chunk
    for source_text in extracted_texts
    for chunk in text_splitter.split_text(source_text)
]


In [6]:
# Imported from langchain_community, like the vector store and LLM elsewhere in
# this notebook, instead of the legacy `langchain.embeddings.huggingface` path.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize the HuggingFace embedding model used to vectorise the text chunks
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



In [7]:
from langchain_community.vectorstores import FAISS
# Nothing can be indexed when PDF extraction produced no text. Fail here with
# a clear message rather than with an obscure error from inside the vector
# store.
if not documents:
    raise ValueError(
        "No text chunks to index: PDF extraction produced no text. "
        "Check the listing URL and the PDF download step."
    )

# Create a FAISS vector store from the text chunks and the embedding model
vector = FAISS.from_texts(documents, embeddings)
# Create a retriever from the vector store
retriever = vector.as_retriever()

In [8]:
from langchain_community.llms import Ollama
# Ollama model tag used for both question reformulation and answering
llm_model_name = "llama3"

# Language model handle shared by the chains defined in later cells
llm = Ollama(model=llm_model_name)

In [9]:
from langchain_core.output_parsers import StrOutputParser
# Module-level StrOutputParser instance (string output parser), available for
# chains to append as their final step.
output_parser = StrOutputParser()

In [10]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# System instruction for the reformulation step: turn a follow-up question that
# leans on the chat history into a standalone question, without answering it.
instruction_to_system = (
    "\n"
    "Given a chat history and the latest user question \n"
    "which might reference context in the chat history, formulate a standalone question \n"
    "which can be understood without the chat history. Do NOT answer the question, \n"
    "just reformulate it if needed and otherwise return it as is.\n"
)

# Message layout of the reformulation prompt: instruction, prior turns, then
# the latest user question.
question_maker_messages = [
    ("system", instruction_to_system),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
]
question_maker_prompt = ChatPromptTemplate.from_messages(question_maker_messages)

# Chain that reformulates the question: prompt -> LLM -> plain string
question_chain = (
    question_maker_prompt
    | llm
    | StrOutputParser()
)

In [11]:
# System prompt for the question-answering assistant. The retrieved context is
# placed in its own paragraph: previously a trailing line-continuation
# backslash glued "{context}" directly onto the last instruction sentence.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, provide a summary of the context. Do not generate your answer.

{context}"""

# Prompt template for the answering step: system prompt (with context), prior
# turns, then the user's question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

In [12]:
def contextualized_question(input: dict):
    """Choose what is used as the retrieval query.

    Args:
        input: Chain input holding ``"question"`` and, optionally,
            ``"chat_history"``.

    Returns:
        ``question_chain`` when there is a non-empty chat history, so the
        question is reformulated first; otherwise the raw question string
        unchanged.
    """
    has_history = bool(input.get("chat_history"))
    if not has_history:
        # No prior turns: the question is already standalone.
        return input["question"]
    return question_chain

In [13]:
from langchain_core.runnables import RunnablePassthrough
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the raw list of Document objects is interpolated into
    the prompt, so the model sees their repr rather than just the text.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content``.

    Returns:
        The documents' text, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Retriever chain: passes the input dict through and adds a `context` key with
# the text of the documents retrieved for the (possibly reformulated) question.
retriever_chain = RunnablePassthrough.assign(
    context=contextualized_question | retriever | format_docs
)

In [14]:
# Define the Retrieval-Augmented Generation (RAG) chain:
# retrieve context -> fill the QA prompt -> generate -> parse to plain text.
# The final parser mirrors question_chain and guarantees a str result, which
# is what the cells that print or store the answer expect.
rag_chain = (
    retriever_chain
    | qa_prompt
    | llm
    | output_parser
)

In [15]:
from langchain_core.messages import AIMessage, HumanMessage

question = "there are some authors for this particular paper please analyze and find who are they?"
chat_history = []

# Invoke the RAG chain with the question and the (still empty) chat history.
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})

# Record both sides of the turn. The chain returns plain text, so the reply is
# wrapped in AIMessage: a bare string in a message list is treated as a human
# message, which would mislabel the assistant's answers in later turns.
chat_history.extend([HumanMessage(content=question), AIMessage(content=ai_msg)])
print(ai_msg)

Based on the provided context, it appears that the authors of this paper have not been explicitly mentioned. The text does not include a list of authors or their affiliations.


In [16]:
# Follow-up question; the next cell sends it together with the chat history
# accumulated so far.
question = "not specific authors there are few persons please find them"

In [17]:
# Ask the follow-up with the accumulated history so it can be reformulated
# into a standalone question before retrieval.
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})

# Wrap the text reply in AIMessage so it is stored as an assistant turn; a
# bare string in a message list is treated as a human message.
chat_history.extend([HumanMessage(content=question), AIMessage(content=ai_msg)])
print(ai_msg)

Summary of the context:

The provided context is from a research paper discussing various topics such as ward modeling, preference data collection, and DPO alignment for high-quality article generation. The paper also mentions instruction-aware webpage generation, chatbots, and other AI-related concepts.

There are no specific authors mentioned in the provided text.


In [18]:
question = "yes you are correct ok lemme know what is supervised fine tuning??"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})

# Wrap the text reply in AIMessage so it is stored as an assistant turn; a
# bare string in a message list is treated as a human message.
chat_history.extend([HumanMessage(content=question), AIMessage(content=ai_msg)])
ai_msg

'According to the context, Supervised Fine-tuning (SFT) is a model that begins with an initial model π and uses instruction tuning data from IXC2 [33] focused on article writing. The SFT model rewrites original prompts using the Chain-of-Thought (CoT) technique [152], generating step-by-step prompts to supplement the instruction tuning data as augmented data D∗.'

In [19]:
import gradio as gr
from langchain_core.messages import HumanMessage

def chat_complete(message, state):
    """Handle one chat interaction for the Gradio UI.

    Args:
        message: Text the user just submitted.
        state: Conversation so far as a list of
            ``{"role": ..., "content": ...}`` dicts, or ``None`` on first use.
            The list is extended in place with the new turn.

    Returns:
        A ``(pairs, state)`` tuple. ``pairs`` is a list of
        ``(user_text, assistant_text)`` tuples for the chatbot widget, and
        ``state`` is the updated conversation list.
    """
    history = [] if state is None else state

    # The chain sees the history as it was BEFORE this turn is appended.
    answer = rag_chain.invoke({"question": message, "chat_history": history})
    history.extend(
        [
            {"role": "user", "content": message},
            {"role": "assistant", "content": answer},
        ]
    )

    # Pair every user entry with the entry that immediately follows it.
    pairs = []
    for idx, turn in enumerate(history):
        if turn["role"] != "user":
            continue
        pairs.append((turn["content"], history[idx + 1]["content"]))
    return pairs, history

# Gradio interface: a title, the chat window, a text box, per-session
# conversation state, and a SEND button wired to chat_complete.
with gr.Blocks() as block:
    gr.Markdown(value="""<h1><center> EduVisionBot </center></h1>""")
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type your Message.........")
    state = gr.State(value=[])
    submit = gr.Button(value="SEND")

    # On click: feed the typed message and the stored conversation to
    # chat_complete, then refresh the chat window and the stored conversation.
    submit.click(
        fn=chat_complete,
        inputs=[message, state],
        outputs=[chatbot, state],
    )

# Start the local web server for the interface
block.launch(debug=True)



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


