In [None]:
import os
import requests
from bs4 import BeautifulSoup
import time

def download_pdf(pdf_url, save_path, timeout=30):
    """Download the PDF at ``pdf_url`` and store it at ``save_path``.

    Args:
        pdf_url: Full URL of the PDF to fetch.
        save_path: Destination file path; replaced if it already exists.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        None. A non-200 status or a network error is reported with ``print``
        and nothing is written to ``save_path``.
    """
    # Write to a sibling temp file first so an interrupted transfer never
    # leaves a truncated PDF under the final name.
    tmp_path = f"{save_path}.part"
    try:
        # stream=True avoids holding a large PDF in memory; the timeout keeps
        # one stalled connection from hanging the whole scrape.
        with requests.get(pdf_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Error downloading PDF from {pdf_url}: {response.status_code}")
                return

            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, save_path)
    except requests.RequestException as exc:
        print(f"Error downloading PDF from {pdf_url}: {exc}")
        return
    finally:
        # After a successful os.replace the temp file is gone, so this only
        # cleans up partial downloads.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"Downloaded PDF to {save_path}")

def get_article_links(url, timeout=30):
    """Collect the abstract-page URLs found on a listing page.

    Args:
        url: URL of the listing page to fetch.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A list of ``https://arxiv.org/abs/<id>`` URLs, one per
        ``<a title="Abstract">`` anchor on the page. An empty list is returned
        when the page cannot be fetched (network error or non-200 status).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching page: {exc}")
        return []
    if response.status_code != 200:
        print(f"Error fetching page: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    # Article anchors are the <a title="Abstract"> elements; href=True skips
    # any such anchor lacking an href, which would otherwise raise KeyError.
    for link in soup.find_all('a', title="Abstract", href=True):
        # The article ID is the last path segment of the href.
        article_id = link['href'].rstrip('/').split('/')[-1]
        if not article_id:
            continue
        links.append(f"https://arxiv.org/abs/{article_id}")
    return links

def scrape_multiple_pages(start_page=0, max_pages=5,
                          output_dir='quant_physics_articles', delay=3):
    """Download the PDFs listed on consecutive quant-ph "recent" listing pages.

    Args:
        start_page: Zero-based index of the first listing page to scrape.
        max_pages: Page index to stop before (exclusive). With the default
            ``start_page=0`` this equals the number of pages scraped.
        output_dir: Directory the PDFs are written to; created if missing.
        delay: Seconds to sleep after each PDF download so requests to the
            server are spaced out.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Listing URL; ``skip`` is the offset of the first entry, 50 entries per page.
    base_url = 'https://arxiv.org/list/quant-ph/recent?skip={}&show=50'

    os.makedirs(output_dir, exist_ok=True)

    for page_num in range(start_page, max_pages):
        print(f"Scraping page {page_num + 1}...")
        url = base_url.format(page_num * 50)
        article_links = get_article_links(url)

        if not article_links:
            print("No articles found. Exiting.")
            break

        for article_link in article_links:
            print(f"Processing: {article_link}")

            # The PDF URL is derived from the ID alone, so the abstract page
            # does not need to be fetched.
            article_id = article_link.split('/')[-1]
            pdf_url = f"https://arxiv.org/pdf/{article_id}.pdf"
            save_path = os.path.join(output_dir, f"{article_id}.pdf")

            download_pdf(pdf_url, save_path)

            # Throttle per download, not per page: a page holds up to 50 PDFs.
            time.sleep(delay)

# Entry point: scrape listing pages 0 through 4 when this code is run directly.
if __name__ == "__main__":
    scrape_multiple_pages(0, 5)


In [None]:
import os
import fitz
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document

# Name of the embedding model handed to HuggingFaceEmbedding further down
model_name = "all-MiniLM-L6-v2"

# Folder that is scanned for the PDF files to index
pdf_dir = r"D:/LLM/Quant_phys/quant_physics_files"

# Extract the plain text of every page in a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of all pages in ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string holding the text of each page, in page order.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to parse part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of growing a string
        # page by page.
        return "".join(page.get_text("text") for page in doc)

# Build a Document for one file's extracted text
def create_document_from_text(text, filename):
    """Wrap ``text`` in a ``Document`` whose metadata records ``filename``."""
    metadata = {"file_name": filename}
    return Document(text=text, metadata=metadata)

# Extract text from every PDF in the directory and wrap each one in a Document.
# sorted() makes the document order reproducible across runs and platforms.
documents = []
for filename in sorted(os.listdir(pdf_dir)):
    # Compare the extension case-insensitively so "paper.PDF" is not skipped.
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, filename)
    try:
        pdf_text = extract_text_from_pdf(pdf_path)
    except Exception as exc:
        # Deliberately broad: one unreadable file should be reported and
        # skipped rather than abort the whole indexing run.
        print(f"Skipping {filename}: {exc}")
        continue

    # A PDF with no extractable text would only add an empty entry to the index.
    if not pdf_text.strip():
        print(f"Skipping {filename}: no extractable text")
        continue

    documents.append(create_document_from_text(pdf_text, filename))

# Check how many documents were loaded
print(f"Loaded {len(documents)} documents.")

# Stop before building and persisting an empty index, which would replace
# whatever index is already stored on disk.
if not documents:
    raise RuntimeError(f"No readable PDF documents found in {pdf_dir}")

# Create the HuggingFace embedding model
embed_model = HuggingFaceEmbedding(model_name=model_name)

# Build the vector index from the loaded documents
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Directory where the index will be saved
index_directory = r"D:/LLM/Quant_phys/"

# Save the index to the specified directory
index.storage_context.persist(persist_dir=index_directory)

# Print success message
print(f"Index has been saved to {index_directory}")

MuPDF error: syntax error: could not parse color space (156 0 R)

MuPDF error: syntax error: could not parse color space (311 0 R)

MuPDF error: syntax error: could not parse color space (443 0 R)

MuPDF error: syntax error: could not parse color space (504 0 R)

MuPDF error: syntax error: could not parse color space (689 0 R)

MuPDF error: syntax error: could not parse color space (5667 0 R)

MuPDF error: syntax error: could not parse color space (5749 0 R)

Loaded 71 documents.
Index has been saved to D:/LLM/LangChain/


In [None]:
from llama_index.core import load_index_from_storage
from llama_index.core.storage.storage_context import StorageContext


# Directory the index was persisted to
persist_dir = r"D:/LLM/Quant_phys/"

# Rebuild the storage context from that directory, then load the index from it.
# embed_model must already be defined by an earlier cell.
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
loaded_index = load_index_from_storage(storage_context, embed_model=embed_model)

# Report success and show the loaded index object
print("Index loaded successfully!", f"Loaded index: {loaded_index}", sep="\n")


Index loaded successfully!
Loaded index: <llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x0000024FD41ED9D0>


In [None]:
import os

import openai

# Read the key from the environment instead of hard-coding it in the notebook,
# where it would be saved and shared along with the file.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai.api_key = api_key

# Create the query engine. The previous `Open_API_key=` keyword is not a
# recognised as_query_engine argument, so it is dropped; the key is supplied
# through openai.api_key above.
# NOTE(review): this queries the in-memory `index` built earlier, not the
# `loaded_index` restored from disk — confirm which one is intended.
query_engine = index.as_query_engine()

# Example query
query = "What is decoherence?"
response = query_engine.query(query)

# Print the response
print(response)

Decoherence is the process by which a quantum system loses its coherence due to interactions with its environment, leading to the emergence of classical behavior.
