In [None]:
import logging
import os
import time
import requests
from pathlib import Path
from urllib.parse import urlparse
import hashlib
import re
import xml.etree.ElementTree as ET
import sqlite3
from typing import Optional, List, Tuple

# Configuration Class
class Config:
    """Runtime settings for the arXiv fetcher.

    Every attribute starts at a built-in default; ``load_from_env`` can
    replace any of them with the value of an environment variable.
    """

    def __init__(self):
        # Built-in defaults, used when no environment override is present.
        self.search_query = "agent"
        self.max_results = 50
        self.output_folder = "data"
        self.base_url = "http://export.arxiv.org/api/query?"
        self.log_db_path = "app_logs.db"

    def load_from_env(self):
        """Override settings from environment variables and return ``self``.

        Variables read: SEARCH_QUERY, MAX_RESULTS, OUTPUT_FOLDER, BASE_URL and
        LOG_DB_PATH. A variable that is not set leaves the current value
        untouched.

        Raises:
            ValueError: if MAX_RESULTS is set but is not a valid integer.
        """
        # (attribute, environment variable, converter) in the order applied.
        overrides = (
            ("search_query", "SEARCH_QUERY", None),
            ("max_results", "MAX_RESULTS", int),
            ("output_folder", "OUTPUT_FOLDER", None),
            ("base_url", "BASE_URL", None),
            ("log_db_path", "LOG_DB_PATH", None),
        )
        for attr_name, env_name, convert in overrides:
            value = os.environ.get(env_name, getattr(self, attr_name))
            if convert is not None:
                value = convert(value)
            setattr(self, attr_name, value)
        return self

    # Optional: Load from a file (e.g. JSON, YAML) - Implement as needed
    # def load_from_file(self, config_file: str):
    #     pass # Implement file loading logic


# Module-wide settings object: start from the defaults, then apply any
# environment-variable overrides.
config = Config()
config.load_from_env()

# DatabaseHandler for logging
class DatabaseHandler(logging.Handler):
    """Logging handler that appends each record to a ``logs`` table in SQLite.

    A short-lived connection is opened for every record and closed again
    before ``emit`` returns. Only the timestamp, level name and rendered
    message are stored; the handler's formatter is not applied.

    Args:
        db_path: Path of the SQLite database file that receives the log rows.
    """

    def __init__(self, db_path):
        super().__init__()
        self.db_path = db_path
        # Holds the connection only while emit() is running; None otherwise.
        self.conn = None

    def emit(self, record):
        """Insert ``record`` into the ``logs`` table, creating it if needed.

        Failures are reported with ``print`` and never propagate, so a broken
        log database cannot take down the code that is logging.
        """
        import datetime  # local import: not part of the module's import block

        try:
            # sqlite3.Connection has no "closed" attribute, so a cached
            # connection cannot be probed; open a fresh one per record.
            self.conn = sqlite3.connect(self.db_path)
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS logs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT,
                    level TEXT,
                    message TEXT
                )
            """)

            timestamp = datetime.datetime.fromtimestamp(record.created).isoformat()
            log_entry = (timestamp, record.levelname, record.getMessage())
            self.conn.execute(
                "INSERT INTO logs (timestamp, level, message) VALUES (?, ?, ?)",
                log_entry,
            )
            self.conn.commit()

        except sqlite3.Error as e:
            print(f"Error logging to database: {e}")
            self._rollback_quietly()
        except Exception as e:
            print(f"Unexpected error in DatabaseHandler: {e}")
            self._rollback_quietly()
        finally:
            if self.conn is not None:
                self.conn.close()
                # Reset so the next emit() never touches a closed connection.
                self.conn = None

    def _rollback_quietly(self):
        """Roll back the open transaction, ignoring a failing rollback."""
        if self.conn is None:
            return
        try:
            self.conn.rollback()
        except sqlite3.Error:
            # The original failure was already reported; a rollback error
            # must not escape from the logging path.
            pass

# Configure logging: records from this module's logger are written to a log
# file and to the SQLite log database.
# NOTE(review): no console StreamHandler is attached in this section; confirm
# whether console output is expected.
log_file_path = "app.log"
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')

# File sink.
file_handler = logging.FileHandler(log_file_path)
file_handler.setFormatter(formatter)

# Database sink, stored at the path taken from the configuration.
db_handler = DatabaseHandler(config.log_db_path)
db_handler.setFormatter(formatter)

# Module logger: INFO and above go to both sinks.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(db_handler)



@tool
def fetch_arxiv_papers(search_query: str = None, max_results: Optional[int] = None) -> List[Tuple[str, str]]:
    """Searches arXiv, downloads papers, and adds metadata to a database."""
    # NOTE(review): the docstring above is kept as a single unchanged line
    # because `tool` is defined outside this section and may expose it at
    # runtime -- confirm before extending it.
    #
    # Parameters:
    #   search_query: text to search for; a falsy value falls back to
    #                 config.search_query.
    #   max_results:  number of results requested; a falsy value (including 0)
    #                 falls back to config.max_results.
    # Returns:
    #   The (title, pdf_link) pairs parsed from the search response, whether
    #   or not each download succeeded; [] when the search, the setup, or a
    #   database operation fails.
    # Side effects:
    #   Creates config.output_folder, creates/updates "<search_query>.db" in
    #   the working directory, and stores every downloaded file as a BLOB.

    search_query = search_query or config.search_query
    max_results = max_results or config.max_results

    # NOTE(review): the query text is used verbatim as the database file name;
    # verify that callers cannot pass path separators here.
    db_name = f"{search_query}.db"
    conn = None

    try:
        Path(config.output_folder).mkdir(parents=True, exist_ok=True)

        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS papers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                search_query TEXT,
                title TEXT UNIQUE,
                pdf_link TEXT,
                file_path TEXT,
                file_hash TEXT,
                file_content BLOB
            )
        """)
        conn.commit()

        logger.info(f"Searching for papers on '{search_query}'...")
        response_text = _fetch_arxiv_metadata(search_query, max_results)
        papers = _parse_paper_links(response_text)

        logger.info(f"Found {len(papers)} papers. Starting download...")
        downloaded_count = 0
        for title, pdf_link in papers:
            try:
                file_path = _download_paper(title, pdf_link, config.output_folder)
                if not file_path:
                    # A falsy result is treated as a failed download.
                    logger.warning(f"Skipping database entry for {title} due to download failure.")
                    continue

                file_hash = compute_file_hash(file_path)

                with open(file_path, "rb") as f:
                    file_content = f.read()

                # title is UNIQUE, so a paper stored earlier is left as is.
                cursor.execute("""
                    INSERT OR IGNORE INTO papers (search_query, title, pdf_link, file_path, file_hash, file_content)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (search_query, title, pdf_link, file_path, file_hash, file_content))
                conn.commit()

                downloaded_count += 1
                # Pause between successful downloads.
                time.sleep(2)

            except sqlite3.Error as e:
                # A database failure is likely to hit every later insert as
                # well, so the run stops here.
                logger.error(f"Database error: {e}")
                conn.rollback()
                return []
            except Exception:
                # A failure tied to one paper must not discard the papers
                # already stored: log it and move on, exactly as a failed
                # download is skipped above.
                logger.exception(f"An unexpected error occurred during processing of {title}: ")
                conn.rollback()
                continue

        logger.info(f"Download and database update complete! {downloaded_count} papers processed.")
        return papers

    except sqlite3.Error as e:
        logger.error(f"Database connection error: {e}")
        return []
    except Exception:
        logger.exception("A general error occurred:")
        return []
    finally:
        if conn:
            conn.close()


def sanitize_filename(title: str) -> str:
    """Turn a paper title into a string that is safe to use as a file name.

    Characters other than word characters, whitespace and hyphens are dropped,
    surrounding whitespace is trimmed, and every remaining whitespace character
    becomes an underscore. Newlines and tabs are converted as well, so a title
    that spans several lines cannot leak control characters into the name.

    Args:
        title: Raw title text.

    Returns:
        The sanitized name; may be empty if nothing usable remains.
    """
    kept = re.sub(r"[^\w\s-]", "", title).strip()
    # One underscore per whitespace character, so plain-space titles produce
    # the same names as before.
    return re.sub(r"\s", "_", kept)


def _get_filename_from_url(url: str) -> str:
    """Return the last path segment of ``url`` without its file extension.

    Only the final extension is removed, so identifiers that contain dots
    survive intact: ``.../pdf/2301.12345v1.pdf`` yields ``2301.12345v1``.
    The query string and fragment are ignored.

    Args:
        url: URL whose path names the file.

    Returns:
        The extension-less file name, or ``""`` when the path ends in ``/``.
    """
    last_segment = os.path.basename(urlparse(url).path)
    # splitext cuts at the last dot only; splitting on the first dot would
    # truncate dotted arXiv ids to their year-month prefix.
    stem, _extension = os.path.splitext(last_segment)
    return stem


def compute_file_hash(file_path: str, algorithm: str = "sha256") -> str:
    """Return the hex digest of a file's contents.

    The file is read in 8192-byte pieces, so it is never loaded into memory
    as a whole.

    Args:
        file_path: Path of the file to hash.
        algorithm: Any algorithm name accepted by ``hashlib.new``.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.new(algorithm)
    with open(file_path, "rb") as stream:
        while True:
            piece = stream.read(8192)
            if not piece:
                # An empty read marks the end of the file.
                break
            digest.update(piece)
    return digest.hexdigest()


def _fetch_arxiv_metadata(search_query: str, max_results: int, timeout: float = 30.0) -> str:
    """Query the arXiv API and return the raw response body.

    Args:
        search_query: Text searched across all fields (``all:<query>``).
        max_results: Number of results requested, starting at offset 0.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block forever on an unresponsive server.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: when the server does not answer in time.
    """
    # NOTE(review): search_query is interpolated into the URL as is; a query
    # containing "&" or "#" would alter the request -- confirm how callers
    # are expected to encode it.
    url = f"{config.base_url}search_query=all:{search_query}&start=0&max_results={max_results}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def _parse_paper_links(response_text: str) -> List[Tuple[str, str]]:
    """Extract ``(title, pdf_link)`` pairs from an Atom feed.

    For each ``<entry>`` the first ``<link>`` whose ``title`` attribute is
    ``"pdf"`` is used. Entries lacking a title, a pdf link, or an ``href`` on
    that link are skipped instead of raising. ``.pdf`` is appended to the link
    unless it already ends with it.

    Args:
        response_text: Atom XML document.

    Returns:
        The pairs in document order.

    Raises:
        xml.etree.ElementTree.ParseError: if ``response_text`` is not
            well-formed XML.
    """
    atom = "{http://www.w3.org/2005/Atom}"
    root = ET.fromstring(response_text)
    papers = []
    for entry in root.findall(f"{atom}entry"):
        # findtext gives None for a missing <title> and "" for an empty one;
        # both mean there is nothing to record for this entry.
        title = entry.findtext(f"{atom}title")
        if not title:
            continue

        for link in entry.findall(f"{atom}link"):
            if link.attrib.get("title") != "pdf":
                continue
            href = link.attrib.get("href")
            if href:
                # Avoid producing "....pdf.pdf" when the feed already
                # includes the suffix.
                pdf_link = href if href.lower().endswith(".pdf") else href + ".pdf"
                papers.append((title, pdf_link))
            break  # only the first pdf link of an entry is considered

    return papers


