In [26]:
import requests
from bs4 import BeautifulSoup

def extract_paper_details(html_url):
    # Step 1: Fetch HTML content
    response = requests.get(html_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Step 2: Extract title
    title = soup.find('h1', class_='ltx_title ltx_title_document').text.strip()
    
   # Step 3: Extract authors
    authors = []
    authors_tag = soup.find_all('span', class_='ltx_creator ltx_role_author')
    for tag in authors_tag:
        author_names = tag.find_all('span', class_='ltx_text ltx_font_bold')
        for author_name in author_names:
            authors.append(author_name.text.strip())
    
    authors = ", ".join(authors) if authors else "Authors not found"
    
    # Step 4: Find and extract the main content (assuming it's within an article tag)
    article_content = soup.find('article')
    if article_content:
        # Extract all paragraphs within the article, excluding side headings
        paragraphs = []
        for p in article_content.find_all('h2'):
            if 'class' in p.attrs and 'ltx_title ltx_title_section' in p['class']:
                continue  # Skip side headings or other labeled sections
            paragraphs.append(p.text.strip())
        
        paper_text = "\n".join(paragraphs)
        
    else:
        paper_text = "Article content section not found."

    # Combine title, authors, and full text into a single string
    output = f"Title:\n{title}\n\nAuthors:\n{authors}\n\nFull Text:\n{paper_text}"
    
    return output

def main():
    # URL of the HTML page
    html_url = 'https://arxiv.org/html/2407.01519v1'
    
    # Extract paper details and full text
    paper_output = extract_paper_details(html_url)
    
    # Print the output
    print(paper_output)

if __name__ == "__main__":
    main()


Title:
DiffIR2VR-Zero: Zero-Shot Video Restoration with Diffusion-based Image Restoration Models

Authors:
Chi-Wei Hsiao3, Ting-Hsuan Chen1, Yu-Lun Liu1
1

Full Text:
1 Introduction
2 Related Work
3 Method
4 Experiments
5 Conclusion
References
Appendix A Appendix / supplemental material


In [29]:
import fitz  # PyMuPDF
import requests
from io import BytesIO

def extract_text_from_online_pdf(pdf_url):
    text = ""
    try:
        # Fetch the PDF file content
        response = requests.get(pdf_url)
        pdf_bytes = BytesIO(response.content)

        # Open the PDF document
        pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")

        # Iterate through each page and extract text
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()

        # Close the PDF document
        pdf_document.close()

    except Exception as e:
        print(f"Error extracting text from PDF: {e}")

    return text

# URL of the online PDF
pdf_url = 'https://arxiv.org/pdf/2407.01519.pdf'

# Extract text from the online PDF
extracted_text = extract_text_from_online_pdf(pdf_url)

# Print the extracted text
print(extracted_text)




DiffIR2VR-Zero: Zero-Shot Video Restoration with
Diffusion-based Image Restoration Models
Chang-Han Yeh1
Chin-Yang Lin1
Zhixiang Wang2
Chi-Wei Hsiao3
Ting-Hsuan Chen1
Yu-Lun Liu1
1National Yang Ming Chiao Tung University
2University of Tokyo
3MediaTek Inc.
Abstract
This paper introduces a method for zero-shot video restoration using pre-trained im-
age restoration diffusion models. Traditional video restoration methods often need
retraining for different settings and struggle with limited generalization across vari-
ous degradation types and datasets. Our approach uses a hierarchical token merging
strategy for keyframes and local frames, combined with a hybrid correspondence
mechanism that blends optical flow and feature-based nearest neighbor matching
(latent merging). We show that our method not only achieves top performance in
zero-shot video restoration but also significantly surpasses trained models in gener-
alization across diverse datasets and extreme degradations (8× super-res

AttributeError: 'str' object has no attribute 'page_content'

In [37]:
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup

In [39]:
import bs4

In [11]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("https://arxiv.org/pdf/2407.01519.pdf")
pages = loader.load_and_split()

In [12]:
pages

[Document(page_content='DiffIR2VR-Zero: Zero-Shot Video Restoration with\nDiffusion-based Image Restoration Models\nChang-Han Yeh1Chin-Yang Lin1Zhixiang Wang2\nChi-Wei Hsiao3Ting-Hsuan Chen1Yu-Lun Liu1\n1National Yang Ming Chiao Tung University2University of Tokyo3MediaTek Inc.\nAbstract\nThis paper introduces a method for zero-shot video restoration using pre-trained im-\nage restoration diffusion models. Traditional video restoration methods often need\nretraining for different settings and struggle with limited generalization across vari-\nous degradation types and datasets. Our approach uses a hierarchical token merging\nstrategy for keyframes and local frames, combined with a hybrid correspondence\nmechanism that blends optical flow and feature-based nearest neighbor matching\n(latent merging). We show that our method not only achieves top performance in\nzero-shot video restoration but also significantly surpasses trained models in gener-\nalization across diverse datasets and ex

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [14]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

documents = text_splitter.split_documents(pages)

In [15]:
documents[:3]

[Document(page_content='DiffIR2VR-Zero: Zero-Shot Video Restoration with\nDiffusion-based Image Restoration Models\nChang-Han Yeh1Chin-Yang Lin1Zhixiang Wang2\nChi-Wei Hsiao3Ting-Hsuan Chen1Yu-Lun Liu1\n1National Yang Ming Chiao Tung University2University of Tokyo3MediaTek Inc.\nAbstract\nThis paper introduces a method for zero-shot video restoration using pre-trained im-\nage restoration diffusion models. Traditional video restoration methods often need\nretraining for different settings and struggle with limited generalization across vari-\nous degradation types and datasets. Our approach uses a hierarchical token merging\nstrategy for keyframes and local frames, combined with a hybrid correspondence\nmechanism that blends optical flow and feature-based nearest neighbor matching\n(latent merging). We show that our method not only achieves top performance in\nzero-shot video restoration but also significantly surpasses trained models in gener-', metadata={'source': 'https://arxiv.org/

In [16]:
print(f"{len(pages)} vs {len(documents)}")

19 vs 66


In [17]:
import os
from dotenv import load_dotenv

load_dotenv(".env")
hf_key = os.getenv("HF_TOKEN")

In [18]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



In [19]:
from langchain_community.vectorstores import FAISS
vector = FAISS.from_documents(documents, embeddings)

In [20]:
from langchain_core.output_parsers import StrOutputParser
output_parser = StrOutputParser()

In [22]:
retriever = vector.as_retriever()

In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the model name
model_name = "gpt2"  # You can replace this with any other open-source model name

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(model_name)

In [24]:
from langchain_core.output_parsers import StrOutputParser
output_parser = StrOutputParser()

In [25]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

instruction_to_system = """
Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is.
"""

question_maker_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", instruction_to_system),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


question_chain = question_maker_prompt | llm | StrOutputParser()

In [26]:
# Use three sentences maximum and keep the answer concise.\
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, provide a summary of the context. Do not generate your answer.\


{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

In [27]:
def contextualized_question(input: dict):
    if input.get("chat_history"):
        return question_chain
    else:
        return input["question"]

In [28]:
from langchain_core.runnables import RunnablePassthrough
retriever_chain = RunnablePassthrough.assign(
        context=contextualized_question | retriever #| format_docs
    )

In [31]:
rag_chain = (
    retriever_chain
    | qa_prompt
    | llm
)

In [32]:
question = "what percentage of patients have pathogenic germline gene variants?"

In [33]:
chat_history = []

ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])
ai_msg

AttributeError: 'ChatPromptValue' object has no attribute 'size'