In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project's .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
from pathlib import Path

# from edison_client import EdisonClient, JobNames
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (python-dotenv). The disabled code below reads EDISON_API_KEY from there.
load_dotenv()
# Disabled: one-off fetch of an Edison task by trajectory id.
# Re-enabling it also requires `import os` and the commented-out
# `edison_client` import near the top of this cell.
# trajectory_id = "2e8d7bbb-8cb9-4b7d-b94a-2e34c8254c59"
# api_key = os.environ.get("EDISON_API_KEY")

# client = EdisonClient(api_key=api_key)
# task_response = client.get_task(trajectory_id)
# task_response
# Workspace layout: each project gets its own folder under ./projects.
project_name = "cpas"
projects_dir = Path("projects")
projects_dir.mkdir(parents=True, exist_ok=True)

# Folder for this project; later cells hand it to the BFS queue, the PDF
# fetcher and the extractor.
project_dir = projects_dir.joinpath(project_name)
project_dir.mkdir(parents=True, exist_ok=True)
# Disabled: dump of the Edison task response (depends on the disabled client
# code earlier in this cell).
# with open(project_dir / f"task_response_{trajectory_id}.json", "w") as f:
#     f.write(task_response.model_dump_json(indent=2))

In [None]:
from papers2dataset.bfs_queue import BFSQueue
from papers2dataset.openalex_client import fetch_work

# Crawl frontier for this project, built from <project_dir>/bfs_queue.json
# (presumably the file the queue persists its state to; confirm in BFSQueue).
q = BFSQueue(project_dir.joinpath("bfs_queue.json"))
# Disabled: one-off seeding of the queue from a title search. `search_works`
# is not imported in this notebook and must be imported before re-enabling.
# search_result = search_works("High-Throughput Evaluation of Cryoprotective Agents for")
# q.add_many([x.get("id", "").split('/')[-1] for x in search_result['results']])

In [None]:

from papers2dataset.openalex_client import fetch_pdf
from papers2dataset.openalex_client import (
    fetch_cited_works,
    fetch_citing_works,
    fetch_related_works,
)
from papers2dataset.extractor import extract_cpa_from_pdf
from papers2dataset.extractor import check_paper_relevance
from tqdm import tqdm_notebook as tqdm

for i in tqdm(range(10)):
    pid = q.pop()
    paper = fetch_work(pid)
    pdf_path = await fetch_pdf(paper, project_dir)
    if pdf_path is None:
        print(f"Failed to download PDF for {paper['id']}")
        q.mark_failed(pid, "no_pdf")
        continue

    res = await check_paper_relevance(paper)
    if not res['has_cpa_compositions']:
        print(f"Paper {paper['id']} is not relevant because {res['reason']}")
        q.mark_skipped(pid, res['reason'])
        continue

    resp = await extract_cpa_from_pdf(pdf_path, project_dir)


    related_works = fetch_related_works(pid)
    cited_works = fetch_cited_works(pid)
    citing_works = fetch_citing_works(pid)


    q.add_many([x.get("id", "").split('/')[-1] for x in related_works+cited_works+citing_works])
    q.mark_processed(pid)
    print(f"Processed {paper['id']}")

In [None]:
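# Known problem (repro kept commented out): presumably fetch_pdf misbehaves for this work; TODO record the actual error.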
# pid = "W4405309370"
# paper = fetch_work(pid)
# pdf_path = await fetch_pdf(paper, project_dir)

In [None]:
import asyncio
from tqdm.asyncio import tqdm
from papers2dataset.extractor import process_one_paper
# Configuration
# Concurrency limit for the async pipeline: run_async_pipeline uses this as
# the initial value of the asyncio.Semaphore it passes to every worker.
MAX_CONCURRENT = 5  # Number of papers to process in parallel

            

async def run_async_pipeline(q, project_dir, num_items=30, max_concurrent=None):
    """Process up to ``num_items`` queued papers concurrently.

    IDs are popped from ``q`` up front, one ``process_one_paper`` coroutine is
    created per ID, and all of them are awaited together behind a tqdm
    progress bar. Every coroutine receives the same semaphore, so the number
    of papers in flight is bounded (assuming ``process_one_paper`` acquires
    the semaphore around its work; verify in the extractor module).

    Args:
        q: Work queue. Must provide ``pop()``; a falsy return value is treated
            as "queue exhausted". The queue is also passed to every worker.
        project_dir: Project directory, passed through unchanged to
            ``process_one_paper``.
        num_items: Maximum number of IDs to pop in this run.
        max_concurrent: Initial value of the shared semaphore. ``None`` (the
            default) falls back to the module-level ``MAX_CONCURRENT``.

    Returns:
        None. Prints "Queue is empty." and returns early when no ID was popped.

    Raises:
        ValueError: If the effective concurrency limit is smaller than 1.
    """
    limit = MAX_CONCURRENT if max_concurrent is None else max_concurrent
    if limit < 1:
        # A semaphore initialised to 0 could never be acquired, so every
        # worker would wait forever; fail loudly instead.
        raise ValueError(f"max_concurrent must be >= 1, got {limit}")
    semaphore = asyncio.Semaphore(limit)

    # 1. Pop IDs from the queue first, stopping early once it runs dry.
    tasks = []
    for _ in range(num_items):
        pid = q.pop()
        if not pid:
            break
        # Calling the coroutine function only creates the coroutine object;
        # nothing executes until it is awaited below.
        tasks.append(process_one_paper(pid, q, project_dir, semaphore))

    if not tasks:
        print("Queue is empty.")
        return

    # 2. Run all tasks with a progress bar.
    await tqdm.gather(*tasks)

# Usage in Jupyter (top-level await)
await run_async_pipeline(q, project_dir)