In [18]:
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF

# URL of the arXiv listing page containing the PDF links.
# The `show=` query parameter controls how many entries are listed (3 here);
# change it to process more or fewer papers.
webpage_url = 'https://arxiv.org/list/cs.CV/recent?skip=2&show=3'

# Fetch webpage content. A timeout keeps the cell from hanging forever, and
# raise_for_status() fails fast instead of silently parsing an HTTP error page
# (which would just produce an empty `papers` list).
response = requests.get(webpage_url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the publication date.
# NOTE(review): only the first <h3> on the page is used, so every paper below
# is tagged with that same heading text - confirm this is intended when the
# listing spans several days.
date_element = soup.find('h3')
publication_date = date_element.get_text(strip=True) if date_element else 'Unknown date'

# Extract all titles, authors, and PDF URLs
base_url = 'https://arxiv.org'
title_label = 'Title:'
papers = []

# Find all entries
entries = soup.find_all('dl')

for entry in entries:
    titles = entry.find_all('div', class_='list-title mathjax')
    authors = entry.find_all('div', class_='list-authors')
    pdf_links = entry.find_all('a', title='Download PDF')

    # NOTE(review): zip() pairs the three lists by position and stops at the
    # shortest one; an entry without a PDF link would shift the pairing.
    for title, author, pdf_link in zip(titles, authors, pdf_links):
        paper_title = title.get_text(strip=True)
        # Drop only the leading "Title:" label; a blanket replace() would also
        # remove the same text from inside the real title.
        if paper_title.startswith(title_label):
            paper_title = paper_title[len(title_label):]
        paper_title = paper_title.strip()
        paper_authors = [a.get_text(strip=True) for a in author.find_all('a')]
        paper_pdf_url = base_url + pdf_link['href']
        papers.append({
            'title': paper_title,
            'authors': paper_authors,
            'pdf_url': paper_pdf_url,
            'date': publication_date
        })

def extract_text_from_pdf(paper):
    """Download a paper's PDF and return its text prefixed with a metadata header.

    Args:
        paper: dict with the keys 'title' (str), 'authors' (list of str),
            'pdf_url' (str) and 'date' (str).

    Returns:
        A string that starts with the publication date, title and authors,
        followed by the text of every page of the PDF. If downloading or
        parsing fails, the text gathered so far is returned with an
        "Error processing PDF at ..." line appended instead of raising.
    """
    header = (
        f"publication date of {paper['title']}: {paper['date']}\n"
        f"Title: {paper['title']}\n"
        f"Authors of {paper['title']}: {', '.join(paper['authors'])}\n\n"
    )
    parts = [header]
    try:
        # The whole body is needed in memory for PyMuPDF, so no streaming.
        # The timeout prevents a stalled download from hanging the notebook.
        response = requests.get(paper['pdf_url'], timeout=60)
        # Surface HTTP errors explicitly rather than feeding an error page to the PDF parser.
        response.raise_for_status()

        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=response.content, filetype="pdf") as document:
            for page in document:
                parts.append(page.get_text())

    except Exception as e:
        # Deliberate best-effort: one bad PDF must not abort the whole run.
        parts.append(f"Error processing PDF at {paper['pdf_url']}: {e}")

    return "".join(parts)

# Walk through the scraped papers, pull the text out of each PDF and show it
# (metadata header included), with a ruler line between consecutive papers.
separator = "\n" + "=" * 80 + "\n"
for paper in papers:
    extracted_text = extract_text_from_pdf(paper)
    print(extracted_text, separator, sep="\n")


publication date of Smart City Surveillance Unveiling Indian Person Attributes in Real Time: Thu, 4 Jul 2024 (continued, showing 3 of 115 entries )
Title: Smart City Surveillance Unveiling Indian Person Attributes in Real Time
Authors of Smart City Surveillance Unveiling Indian Person Attributes in Real Time: Shubham Kale, Shashank Sharma, Abhilash Khuntia

Smart City Surveillance Unveiling Indian Person
Attributes in Real Time
Shubham Kale
M.Tech CSE
Dept. of CSE
IIIT Delhi
shubham23094@iiitd.ac.in
Shashank Sharma
M.Tech CSE
Dept. of CSE
IIIT Delhi
shashank23088@iiitd.ac.in
Abhilash Khuntia
M.Tech CSE
Dept. of CSE
IIIT Delhi
abhilash23007@iiitd.ac.in
Abstract—This project focuses on creating a smart surveillance
system for Indian cities that can identify and analyze people’s
attributes in real time. Using advanced technologies like artificial
intelligence and machine learning, the system can recognize
attributes such as upper body color what the person is wearing,
accessories that he 

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Build the corpus from every scraped paper. Using the leftover loop variable
# `extracted_text` would index only the last paper processed by the printing
# loop, so questions about the other papers could never be answered.
# NOTE: this calls extract_text_from_pdf() again for each paper, so every PDF
# is downloaded once more here.
extracted_texts = [extract_text_from_pdf(paper) for paper in papers]

# Split text into manageable chunks/documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=200,  # Adjust overlap size based on requirements
)

# Flat list of text chunks across all papers
documents = [
    chunk
    for text in extracted_texts
    for chunk in text_splitter.split_text(text)
]


In [20]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# HuggingFace sentence-embedding model used to vectorize the text chunks
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)



In [21]:
from langchain_community.vectorstores import FAISS
# Embed every text chunk and store the vectors in a FAISS index
vector = FAISS.from_texts(texts=documents, embedding=embeddings)
# Wrap the index in a retriever so it can be composed into the chains below
retriever = vector.as_retriever()

In [22]:
from langchain_community.llms import Ollama
# Name of the Ollama model that backs both the question rewriter and the QA chain
OLLAMA_MODEL_NAME = "llama3"
llm = Ollama(model=OLLAMA_MODEL_NAME)

In [23]:
from langchain_core.output_parsers import StrOutputParser
# Initialize the output parser for string outputs.
# NOTE(review): no later cell shown here references `output_parser`; the
# question-reformulation chain constructs its own StrOutputParser() inline.
output_parser = StrOutputParser()

In [24]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# System instruction: rewrite the latest user question so that it can be
# understood on its own, without answering it.
instruction_to_system = """
Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is.
"""

# Message layout of the reformulation prompt:
# system instruction -> prior conversation -> latest user question.
question_maker_messages = [
    ("system", instruction_to_system),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
]
question_maker_prompt = ChatPromptTemplate.from_messages(question_maker_messages)

# Pipeline: prompt -> LLM -> string output parser
question_chain = question_maker_prompt | llm | StrOutputParser()

In [25]:
# Pick between the raw question and the question-reformulation chain
def contextualized_question(input: dict):
    """Decide how the question used for retrieval is obtained.

    Returns the value stored under "question" unchanged when "chat_history"
    is missing or empty; otherwise returns `question_chain`, the chain that
    rewrites the question using the chat history.
    """
    has_history = bool(input.get("chat_history"))
    if not has_history:
        return input["question"]
    return question_chain

In [26]:
from langchain_core.runnables import RunnablePassthrough
def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: iterable of retrieved documents exposing a `page_content` attribute.

    Returns:
        The documents' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Create a retriever chain to fetch relevant context for the question.
# The retrieved documents are flattened to plain text; without this step the
# prompt's {context} slot is filled with the repr of a list of Document
# objects (metadata, escaped newlines and all) instead of the passages.
retriever_chain = RunnablePassthrough.assign(
    context=contextualized_question | retriever | format_docs
)

In [27]:
# Define the system prompt for the question-answering assistant.
# The instructions are separated from the retrieved context by a blank line
# and a "Context:" label so the passages are not glued onto the last sentence,
# and the model is told plainly to admit when the context lacks the answer.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If the context does not contain the answer, say that you don't know; "
    "do not make up an answer. Keep the answer concise.\n\n"
    "Context:\n{context}"
)

# Create a prompt template for the question-answering task:
# system prompt (with context) -> prior conversation -> latest user question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

In [28]:
# Retrieval-Augmented Generation (RAG) pipeline:
# attach retrieved context -> fill the QA prompt -> generate with the LLM
rag_chain = retriever_chain | qa_prompt | llm

In [29]:
chat_history = []
question = "breif the paper what it is all about ??  "
from langchain_core.messages import AIMessage, HumanMessage

# Invoke the RAG chain with the question and chat history
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})

# Record both turns with explicit roles. The chain ends in a plain-text LLM,
# so the reply is a string; a bare string placed in the history would be read
# back as a human message, making the assistant's answer look like user input
# on the next turn. Wrap it in AIMessage (and pass through anything that is
# already a message object).
ai_turn = AIMessage(content=ai_msg) if isinstance(ai_msg, str) else ai_msg
chat_history.extend([HumanMessage(content=question), ai_turn])
print(ai_msg)

Based on the provided documents, I can summarize the paper as follows:

The paper discusses the application of Physics-Informed Neural Networks (PINNs) to solve the problem of identifying material properties in soft-tissue deformation modeling and image registration. The authors demonstrate that PINNs can be used to estimate material properties, such as Young's modulus, from incomplete and noisy data. They also highlight the challenges of image registration, where exact values of stress and strain are unknown, and boundary conditions need to be estimated.

The paper appears to explore the use of PINNs in various biomedical applications, including soft-tissue deformation modeling and image registration, with a focus on identifying material properties and their importance in these fields.


In [33]:
### Code to automate the scraping process every day (kept commented out)
#import schedule
#import time
#import subprocess

#def job():
#    subprocess.run(["python", "C:\\Path\\To\\YourScript\\scrape.py"])

#schedule.every().day.at("12:00").do(job)

#while True:
#    schedule.run_pending()
#    time.sleep(1)

In [None]:
import gradio as gr
from langchain_core.messages import HumanMessage

# Gradio callback: answer one user message and refresh the chat display
def chat_complete(message, state):
    """Run the RAG chain on `message` and return the updated conversation.

    Args:
        message: the text typed by the user.
        state: list of {"role", "content"} dicts holding the conversation so
            far, or None for a fresh conversation. The list is extended in place.

    Returns:
        A tuple (response, state) where `response` is a list of
        (user_text, assistant_text) pairs and `state` is the updated history.
    """
    if state is None:
        state = []

    ai_msg = rag_chain.invoke({"question": message, "chat_history": state})

    user_turn = {"role": "user", "content": message}
    assistant_turn = {"role": "assistant", "content": ai_msg}
    state.extend([user_turn, assistant_turn])

    # Pair each user turn with the entry stored right after it
    response = []
    for index, turn in enumerate(state):
        if turn["role"] == "user":
            response.append((turn["content"], state[index + 1]["content"]))
    return response, state

# Define the Gradio interface: a title, a chat display, a text box for the
# user's message, a hidden state holding the conversation, and a send button.
with gr.Blocks() as block:
    gr.Markdown("""<h1><center> EduVisionBot </center></h1>""")
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type your Message.........")
    # Conversation history passed to and returned from chat_complete
    state = gr.State([])
    submit = gr.Button("SEND")
    
    # Clicking SEND calls chat_complete(message, state); its two return values
    # update the chat display and the stored conversation state respectively.
    submit.click(chat_complete, inputs=[message, state], outputs=[chatbot, state])
# Start the app; the recorded output shows it serving on a local URL until interrupted.
block.launch(debug=True)



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


