In [None]:
%load_ext lab_black
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2
import sqlite3
from typing import Optional

import pandas as pd
from dotenv import load_dotenv
from paper_cat import PaperClassifier, PaperClassifierOOP
from pydantic import BaseModel
from tqdm import tqdm


class Article(BaseModel):
    """Record describing one paper, as persisted by ``insert_article``.

    Every field is an optional string defaulting to ``None``, so an instance
    can be built from a partially populated paper-info mapping.
    """

    # Descriptive metadata about the paper.
    title: Optional[str] = None
    abstract: Optional[str] = None
    pdf_link: Optional[str] = None
    # Looked up in the ``id`` column of ``paper_infos`` to detect papers that
    # are already stored.
    id: Optional[str] = None
    # Extracted PDF text; ``insert_article`` deliberately leaves this field
    # out of the row it writes.
    text: Optional[str] = None
    # Classification labels — presumably ordered from the broadest category to
    # the most specific one; TODO confirm against the classifier's output.
    field: Optional[str] = None
    subfield: Optional[str] = None
    branch: Optional[str] = None
    sub_branch: Optional[str] = None
    area: Optional[str] = None
    sub_area: Optional[str] = None


def insert_article(
    article: Article,
    db_name: str = "paper_infos.sqlite",
    table_name: str = "paper_infos",
) -> None:
    """Insert or replace one article row in the SQLite database.

    Every field of ``article`` except ``text`` is written; the column names
    are taken from the model's field names, so the target table must have
    matching columns.

    Args:
        article: The article to persist.
        db_name: Path of the SQLite database file.
        table_name: Name of the target table. It is interpolated into the
            SQL statement as an identifier, so it must come from trusted
            code, never from user input.

    Raises:
        sqlite3.Error: If the statement fails (for example the table or a
            column does not exist). The transaction is rolled back first.
    """
    # The full text is intentionally not stored in the table.
    row = {k: v for k, v in article.dict().items() if k != "text"}
    columns = ", ".join(row)
    placeholders = ", ".join("?" for _ in row)
    query = f"INSERT OR REPLACE INTO {table_name} ({columns}) VALUES ({placeholders})"

    conn = sqlite3.connect(db_name)
    try:
        # Using the connection as a context manager commits on success and
        # rolls back on error, but it does NOT close the connection.
        with conn:
            conn.execute(query, tuple(row.values()))
    finally:
        # Close explicitly so each call does not leak an open connection.
        conn.close()


# Load variables from a local .env file into the process environment. This
# runs before the classifiers are constructed — presumably they read API
# credentials from the environment; TODO confirm.
load_dotenv()

# arXiv listing page whose paper links are processed below.
# NOTE(review): `show=389` looks like the number of entries requested — confirm.
url = "https://arxiv.org/list/cs.AI/pastweek?show=389"
# `classifier` is used for link discovery, metadata fetching, PDF download and
# text extraction; `classifier2` is used only for `classify_paper`.
classifier = PaperClassifier()
classifier2 = PaperClassifierOOP()
# Executed at cell run time: collects the paper links found at `url`.
arxiv_links = classifier.get_arxiv_links(url)


def create_paper_info(link, conn):
    """Fetch metadata and full text for one paper link, skipping known papers.

    Args:
        link: Paper link, as returned by ``classifier.get_arxiv_links``.
        conn: Open database connection used to check whether the paper is
            already present in the ``paper_infos`` table.

    Returns:
        The paper-info mapping returned by ``classifier.fetch_paper_info``
        with an added ``"text"`` entry holding the extracted PDF text, or
        ``None`` when the metadata cannot be fetched, the paper is already
        stored, or the PDF cannot be downloaded or parsed.
    """
    try:
        paper_info = classifier.fetch_paper_info(link)
    except Exception as e:
        print(f"Failed to fetch paper info from {link}: {e}")
        return None
    if paper_info is None:
        return None

    # The id comes from scraped data, so bind it as a parameter instead of
    # formatting it into the SQL string (avoids injection and quoting errors).
    existing = pd.read_sql(
        "SELECT id FROM paper_infos WHERE id = ?",
        conn,
        params=(paper_info["id"],),
    )
    if not existing.empty:
        # Already stored: skip the expensive download and extraction.
        return None

    try:
        classifier.download_pdf(paper_info["pdf_link"])
        # NOTE(review): assumes download_pdf writes to "temp.pdf" — confirm.
        paper_info["text"] = classifier.extract_pdf_text("temp.pdf")
    except Exception as e:
        print(f"Failed to extract text from {link}: {e}")
        return None
    return paper_info


# Fetch, classify and persist every paper that is not already in the database.
# Successfully processed papers are also collected in `paper_infos`.
paper_infos = []
conn = sqlite3.connect("paper_infos.sqlite")
try:
    for link in tqdm(arxiv_links):
        paper_info = create_paper_info(link, conn)
        if paper_info is None:
            continue
        try:
            classification = classifier2.classify_paper(paper_info["text"])
            if classification is None:
                # f-string so the message names the link that failed.
                print(f"Failed to classify {link}")
                continue
            paper_info.update(classification)
        except Exception as e:
            print(f"Failed to classify {link}: {e}")
            continue
        paper_infos.append(paper_info)
        article = Article(**paper_info)
        insert_article(article)
finally:
    # This connection is only used for the "already stored?" lookups. A
    # sqlite3 connection used as a context manager is not closed on exit, so
    # close it explicitly.
    conn.close()