In [1]:
import os
# Configuration Class
class Config:
    def __init__(self):
        self.search_query = "agent"
        self.max_results = 5
        self.output_folder = "data"
        self.base_url = "http://export.arxiv.org/api/query?"
        self.log_db_path = "app_logs.db"
        self.log_file_path = "app.log"

    def load_from_env(self):
        self.search_query = os.environ.get("SEARCH_QUERY", self.search_query)
        self.max_results = int(os.environ.get("MAX_RESULTS", self.max_results))
        self.output_folder = os.environ.get("OUTPUT_FOLDER", self.output_folder)
        self.base_url = os.environ.get("BASE_URL", self.base_url)
        self.log_db_path = os.environ.get("LOG_DB_PATH", self.log_db_path)
        self.log_file_path = os.environ.get("LOG_FILE_PATH", self.log_file_path)
        return self



# Initialize Configuration
config = Config().load_from_env()  # Load from env variables first



In [2]:

import sqlite3
import datetime
import logging



# DatabaseHandler for logging
class DatabaseHandler(logging.Handler):
    def __init__(self, db_path):
        super().__init__()
        self.db_path = db_path
        self.conn = sqlite3.connect(self.db_path)

    def emit(self, record):
        try:
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS logs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT,
                    level TEXT,
                    message TEXT
                )
            """)
            self.conn.commit()
            cursor = self.conn.cursor()
            timestamp = datetime.datetime.fromtimestamp(record.created).isoformat()
            log_entry = (timestamp, record.levelname, record.getMessage())
            cursor.execute("INSERT INTO logs (timestamp, level, message) VALUES (?, ?, ?)", log_entry)
            self.conn.commit()
        except sqlite3.Error as e:
            print(f"Error logging to database: {e}")
        except Exception as e:
            print(f"Unexpected error in DatabaseHandler: {e}")

# Configure logging to console, file, and database
file_handler = logging.FileHandler(config.log_file_path)
print(config.log_db_path)
db_handler = DatabaseHandler(config.log_db_path)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
file_handler.setFormatter(formatter)
db_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(db_handler)


logger.info("test")
logger.info("test 2")
logger.info("test 3")
logger.info("test 4")

app_logs.db


In [3]:
import sqlite3
import datetime
import logging
import os

class DatabaseHandler(logging.Handler):
    def __init__(self, db_path):
        super().__init__()
        self.db_path = db_path
        self._initialize_db()

    def _initialize_db(self):
        """Ensure the logs table exists."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS logs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        timestamp TEXT,
                        level TEXT,
                        message TEXT
                    )
                """)
                conn.commit()
        except sqlite3.Error as e:
            print(f"Error initializing database: {e}")

    def emit(self, record):
        """Insert log record into the database."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                timestamp = datetime.datetime.fromtimestamp(record.created).isoformat()
                log_entry = (timestamp, record.levelname, record.getMessage())
                cursor.execute("INSERT INTO logs (timestamp, level, message) VALUES (?, ?, ?)", log_entry)
                conn.commit()
        except sqlite3.Error as e:
            print(f"Database error: {e}")
        except Exception as e:
            print(f"Unexpected error in DatabaseHandler: {e}")

# Ensure Config class is instantiated before using it
config = Config().load_from_env()

# Ensure log directory exists
if not os.path.exists(config.output_folder):
    os.makedirs(config.output_folder)

# Set up logging
log_file_path = os.path.join(config.output_folder, "app.log")

file_handler = logging.FileHandler(log_file_path)
db_handler = DatabaseHandler(config.log_db_path)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
file_handler.setFormatter(formatter)
db_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(db_handler)

# Test logging
logger.info("test")
logger.info("test 2")
logger.info("test 3")
logger.info("test 4")


In [4]:
import sqlite3
import datetime
import logging
import os
import threading
from concurrent.futures import ThreadPoolExecutor

# Config class (ensure it's properly loaded before use)
class Config:
    def __init__(self):
        self.search_query = "agent"
        self.max_results = 50
        self.output_folder = "data"
        self.base_url = "http://export.arxiv.org/api/query?"
        self.log_db_path = "app_logs.db"

    def load_from_env(self):
        self.search_query = os.environ.get("SEARCH_QUERY", self.search_query)
        self.max_results = int(os.environ.get("MAX_RESULTS", self.max_results))
        self.output_folder = os.environ.get("OUTPUT_FOLDER", self.output_folder)
        self.base_url = os.environ.get("BASE_URL", self.base_url)
        self.log_db_path = os.environ.get("LOG_DB_PATH", self.log_db_path)
        return self


# DatabaseHandler for SQLite logging
import sqlite3
import datetime
import logging
import os

class DatabaseHandler(logging.Handler):
    def __init__(self, db_path):
        super().__init__()
        self.db_path = db_path
        self._initialize_db()

    def _initialize_db(self):
        """Ensures the logs table exists in SQLite."""
        try:
            with sqlite3.connect(self.db_path, check_same_thread=False) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS logs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        timestamp TEXT NOT NULL,
                        level TEXT NOT NULL,
                        message TEXT NOT NULL
                    )
                """)  # Executes the statement separately
                conn.commit()
        except sqlite3.Error as e:
            print(f"Error initializing database: {e}")

    def emit(self, record):
        """Inserts a log record into the database safely."""
        try:
            with sqlite3.connect(self.db_path, check_same_thread=False) as conn:
                cursor = conn.cursor()
                timestamp = datetime.datetime.fromtimestamp(record.created).isoformat()
                log_entry = (timestamp, record.levelname, record.getMessage())
                cursor.execute(
                    "INSERT INTO logs (timestamp, level, message) VALUES (?, ?, ?)",
                    log_entry
                )  # Ensuring a single statement is executed at a time
                conn.commit()
        except sqlite3.Error as e:
            print(f"Database error: {e}")
        except Exception as e:
            print(f"Unexpected error in DatabaseHandler: {e}")


# Ensure Config is initialized before logging
config = Config().load_from_env()

# Ensure log directory exists
if not os.path.exists(config.output_folder):
    os.makedirs(config.output_folder)

# Set up logging
log_file_path = os.path.join(config.output_folder, "app.log")

file_handler = logging.FileHandler(log_file_path)
db_handler = DatabaseHandler(config.log_db_path)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
file_handler.setFormatter(formatter)
db_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(db_handler)




In [5]:
from smolagents import tool
import xml.etree.ElementTree as ET


from typing import Optional, List, Tuple
import os
import time
import requests
from pathlib import Path
from urllib.parse import urlparse
import hashlib
import re
import sqlite3

@tool
def fetch_arxiv_papers(search_query: str = None, max_results: Optional[int] = None) -> List[Tuple[str, str]]:
    """
    Searches arXiv for research papers on a topic and saves the papers to a folder
    Args:
        search_query: the topic to search for
        max_results: max results to return
    """

    search_query = search_query or config.search_query
    max_results = max_results or config.max_results

    db_name = f"{search_query}.db"
    conn = None

    try:
        Path(config.output_folder).mkdir(parents=True, exist_ok=True)

        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS papers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                search_query TEXT,
                title TEXT UNIQUE,
                pdf_link TEXT,
                file_path TEXT,
                file_hash TEXT,
                file_content BLOB
            )
        """)
        conn.commit()

        logger.info(f"Searching for papers on '{search_query}'...")
        response_text = _fetch_arxiv_metadata(search_query, max_results)
        papers = _parse_paper_links(response_text)

        logger.info(f"Found {len(papers)} papers. Starting download...")
        downloaded_count = 0
        for title, pdf_link, file_name in papers:
            try:
                file_path = _download_paper(title, pdf_link, file_name, config.output_folder)
                if file_path:
                    file_hash = compute_file_hash(file_path)

                    with open(file_path, "rb") as f:
                        file_content = f.read()

                    cursor.execute("""
                        INSERT OR IGNORE INTO papers (search_query, title, pdf_link, file_path, file_hash, file_content)
                        VALUES (?, ?, ?, ?, ?, ?)
                    """, (search_query, title, pdf_link, file_path, file_hash, file_content))
                    conn.commit()

                    downloaded_count += 1
                    time.sleep(2)
                else:
                    logger.warning(f"Skipping database entry for {title} due to download failure.")

            except sqlite3.Error as e:
                logger.error(f"Database error: {e}")
                if conn:
                    conn.rollback()
                return []
            except Exception as e:
                logger.exception(f"An unexpected error occurred during processing of {title}: ")
                if conn:
                    conn.rollback()
                return []

        logger.info(f"Download and database update complete! {downloaded_count} papers processed.")
        return papers

    except sqlite3.Error as e:
        logger.error(f"Database connection error: {e}")
        return []
    except Exception as e:
        logger.exception("A general error occurred:")
        return []
    finally:
        if conn:
            conn.close()


def sanitize_filename(title: str) -> str:
    return re.sub(r"[^\w\s-]", "", title).strip().replace(" ", "_")


def _get_filename_from_url(url: str) -> str:
    parsed_url = urlparse(url)
    return os.path.basename(parsed_url.path).split(".")[0]


def compute_file_hash(file_path: str, algorithm: str = "sha256") -> str:
    hash_func = hashlib.new(algorithm)
    with open(file_path, "rb") as file:
        for chunk in iter(lambda: file.read(8192), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()


def _fetch_arxiv_metadata(search_query: str, max_results: int) -> str:
    url = f"{config.base_url}search_query=all:{search_query}&start=0&max_results={max_results}"
    logger.info(f"Fetching metadata from: {url}")
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def _parse_paper_links(response_text: str) -> List[Tuple[str, str]]:
    root = ET.fromstring(response_text)
    papers = []
    for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
        pdf_link = None
        title = None
        for link in entry.findall("{http://www.w3.org/2005/Atom}link"):
            if link.attrib.get("title") == "pdf":
                pdf_link = link.attrib["href"] + ".pdf"
                file_name = os.path.basename(urlparse(pdf_link).path)
                title = entry.find("{http://www.w3.org/2005/Atom}title").text
                break

        if pdf_link and title:
            papers.append((title, pdf_link, file_name))
            logger.info(f"Found paper: {title}, pdf: {pdf_link}")

    return papers

def _download_paper(title, pdf_link, file_name, output_folder):
    """Downloads a single paper PDF."""
    # Create a safe filename
    safe_title = sanitize_filename(title)
    filename = os.path.join(output_folder, f"{file_name}")
    response = requests.get(pdf_link, stream=True)
    response.raise_for_status()

    # Write the PDF to the specified folder
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            file.write(chunk)
    logger.info(f"Downloaded: {title}: {file_name}")





In [6]:
result = fetch_arxiv_papers(search_query="Cellular Automata")
print(result)


--- Logging error ---
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\logging\__init__.py", line 1113, in emit
    stream.write(msg + self.terminator)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u03bb' in position 151: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\projects\deepresearch\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\projects\deepresearch\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in

Downloaded: Cellular automata on a $G$-set
Downloaded: Cellular Automata, PDEs, and Pattern Formation
Downloaded: Randomized Cellular Automata
Downloaded: About the embedding of one dimensional cellular automata into hyperbolic
  cellular automata
Downloaded: Two Cellular Automata for the 3x+1 Map
Downloaded: Cellular Automata: Wolfram's Metaphors for Complex Systems
Downloaded: New Cellular Automata associated with the Schroedinger Discrete Spectral
  Problem
Downloaded: On Reversible Cellular Automata with Triplet Local Rules
Downloaded: Non-Uniform Cellular Automata: classes, dynamics, and decidability
Downloaded: Unraveling simplicity in elementary cellular automata
Downloaded: A Linear Acceleration Theorem for 2D Cellular Automata on all Complete
  Neighborhoods
Downloaded: Phase Space Invertible Asynchronous Cellular Automata
Downloaded: On the decomposition of stochastic cellular automata
Downloaded: Eventually Number-Conserving Cellular Automata
Downloaded: CAX: Cellular Automa