In [None]:
# Imports for arXiv API
from datetime import datetime, timedelta, timezone
# Imports for Llama parse
from dotenv import load_dotenv
load_dotenv()
from llama_parse import LlamaParse
import requests
# Import for both
import dr_util.file_utils as fu
import os
import time

In [6]:
# This should be the only needed util
import logging

import bytom.arxiv_utils as xu
import bytom.author_profiles as ap

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Configure the root logger so INFO-level (and more severe) records are emitted
logging.basicConfig(level=logging.INFO)

In [12]:
# Fetch papers for a single author through bytom.author_profiles.
# NOTE(review): presumably "max_results" caps the number of arXiv entries requested —
# confirm against get_author_papers.
gab_all = ap.get_author_papers('Gabriel Synnaeve', kwargs={"max_results": 100})

In [None]:
# Path constants. The data root can be overridden with the BYTOM_DATA_DIR
# environment variable; when it is unset (or empty) the local default is used.
# Every directory constant keeps its trailing slash because the rest of the
# notebook builds file paths by plain string concatenation.
DATA_ROOT = (
    os.environ.get("BYTOM_DATA_DIR") or "/Users/daniellerothermel/drotherm/data"
).rstrip("/")
RAW_PDF_DIR = f"{DATA_ROOT}/raw_pdfs/"
PARSED_PDF_DIR = f"{DATA_ROOT}/parsed_pdfs/"
METADATA_DIR = f"{DATA_ROOT}/pdf_metadata/"

## Parsing Metrics

## Util Fxns

In [None]:
def filter_dicts_by_years(dict_list, years, date_key):
    """Keep the dicts whose timestamp falls within roughly the last ``years`` years.

    Args:
        dict_list: Iterable of dicts; each stores a timestamp string formatted
            as ``%Y-%m-%dT%H:%M:%SZ`` under ``date_key``. The timestamp is
            interpreted as UTC.
        years: Size of the look-back window. A year is counted as 365 days,
            so leap days are not accounted for.
        date_key: Key under which each dict stores its timestamp.

    Returns:
        list: The matching dicts, in their original order.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=365 * years)

    def _is_recent(record):
        # The parsed value is naive, so attach UTC before comparing with the cutoff.
        stamp = datetime.strptime(record[date_key], '%Y-%m-%dT%H:%M:%SZ')
        return stamp.replace(tzinfo=timezone.utc) >= cutoff

    return list(filter(_is_recent, dict_list))

In [None]:
def download_pdf(url, save_path, timeout=30):
    """
    Downloads a PDF from the given URL and saves it to the specified location.

    The download is best-effort: failures are reported on stdout rather than
    raised, and nothing is written unless the server answers with HTTP 200.

    Args:
    url (str): The URL of the PDF to download.
    save_path (str): The path (including filename) where the PDF will be saved.
    timeout (float | tuple): Passed to ``requests.get`` as its ``timeout``
        argument so a stalled connection cannot block the notebook forever.

    Returns:
    None
    """
    try:
        # Without an explicit timeout, requests waits indefinitely for the server.
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to download PDF. Status code: {response.status_code}")
            return

        # Write the content to a file in binary mode
        with open(save_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"PDF downloaded and saved to {save_path}")
    except Exception as e:
        # Deliberately broad: callers loop over many papers and rely on a
        # single failed download not aborting the whole run.
        print(f"An error occurred: {e}")

## Combine Util Fxns

In [None]:
def query_dump_download_by_author(author, max_results, download_pdfs=False):
    """Query arXiv for an author's papers; optionally dump metadata and fetch PDFs.

    Args:
        author (str): Author name handed to ``xu.build_author_query``.
        max_results (int): Forwarded to the query builder as ``max_results``.
        download_pdfs (bool): When False (the default) only the parsed entries
            are returned and nothing is written to disk. When True, each
            entry's metadata is dumped under ``METADATA_DIR``, its PDF is
            downloaded into ``RAW_PDF_DIR`` unless a file is already there,
            and a summary dict is returned instead.

    Returns:
        list | dict: With ``download_pdfs=False``, the list produced by
        applying ``xu.parse_paper_entry`` to every query result. With
        ``download_pdfs=True``, a dict with the keys ``author``, ``query``,
        ``max_results``, ``num_results`` and ``pdfs_metadata`` (per-PDF entry
        data plus ``metadata_path`` and ``raw_path``).
    """
    print(f">> Getting papers from author: {author}")
    # First get all the recent papers by this author, ordered by date
    query = xu.build_author_query(author, kwargs={'max_results': max_results})
    entries = xu.query_api(query)

    print(f">> Total number papers: {len(entries)}")
    structured_responses = [xu.parse_paper_entry(pent) for pent in entries]
    if not download_pdfs:
        return structured_responses

    pdfs_metadata = {}

    # Save all structured responses individually by pdf_url info
    # then download the pdf to the right location
    print(">> Dumping Metadata and Downloading PDFs")
    for sr in structured_responses:
        pdf_url = sr['pdf_link']
        pdf_name = pdf_url.split('/')[-1]
        pdfs_metadata[pdf_name] = {**sr}

        # Dump metadata
        metadata_path = os.path.join(METADATA_DIR, f'{pdf_name}.json')
        fu.dump_file(sr, metadata_path, verbose=False)
        pdfs_metadata[pdf_name]['metadata_path'] = metadata_path

        # Download the raw PDF, skipping files that already exist locally
        raw_path = os.path.join(RAW_PDF_DIR, f'{pdf_name}.pdf')
        if not os.path.exists(raw_path):
            # Route the download through export.arxiv.org, but do not rewrite
            # a link that already points there (would yield export.export.*).
            if "export.arxiv.org" in pdf_url:
                mirror_url = pdf_url
            else:
                mirror_url = pdf_url.replace("arxiv.org", "export.arxiv.org")
            download_pdf(mirror_url, raw_path)
            # Pause between downloads — presumably to stay polite to the
            # arXiv servers; TODO confirm the required delay.
            time.sleep(4)
        pdfs_metadata[pdf_name]['raw_path'] = raw_path

        print(f" - {pdf_name}: {sr['title']}\n")

    # TODO(review): this summary is not persisted here, although
    # load_author_select_recent_llama_parse reads it from
    # f'{METADATA_DIR}{author.replace(" ", "_")}_query_metadata.json';
    # dump the returned dict to that path (e.g. with fu.dump_file) if needed.
    return {
        'author': author,
        'query': query,
        'max_results': max_results,
        'num_results': len(structured_responses),
        'pdfs_metadata': pdfs_metadata,
    }

In [None]:
# Requires the following two lines to run first
# import nest_asyncio
# nest_asyncio.apply()
def load_author_select_recent_llama_parse(author, num_years=2, max_parse=1):
    """Run LlamaParse over an author's recent PDFs that have no parsed pickle yet.

    Loads ``{METADATA_DIR}<author with spaces replaced by underscores>_query_metadata.json``,
    keeps the papers whose ``published`` date passes ``filter_dicts_by_years``,
    then parses them in order until ``max_parse`` new papers have been
    processed. Papers whose ``{PARSED_PDF_DIR}<name>.pkl`` already exists are
    skipped and do not count towards ``max_parse``.

    Args:
        author (str): Author name; selects which query-metadata file is loaded.
        num_years (int): Look-back window handed to ``filter_dicts_by_years``.
        max_parse (int): Maximum number of papers to parse in this call.

    Returns:
        None. Parsed documents are written with ``fu.dump_file``.
    """
    author_slug = author.replace(" ", "_")
    author_metadata = fu.load_file(f'{METADATA_DIR}{author_slug}_query_metadata.json')
    papers = [
        {**info, 'name': pdf_key}
        for pdf_key, info in author_metadata['pdfs_metadata'].items()
    ]

    recent = filter_dicts_by_years(papers, num_years, date_key="published")
    print(f">> {author} has {len(recent)} / {len(papers)} in the last {num_years} years")

    parser = LlamaParse(result_type="markdown")

    parsed_count = 0
    for paper in recent:
        if parsed_count >= max_parse:
            print(f">> Reached parse: {parsed_count}, break")
            break

        pdf_name, raw_pdf = paper['name'], paper['raw_path']
        output_path = f'{PARSED_PDF_DIR}{pdf_name}.pkl'
        if os.path.exists(output_path):
            # Already parsed on an earlier run; move on without counting it.
            continue

        print(f">> Begin parsing number: {parsed_count}")
        documents = parser.load_data(raw_pdf)
        fu.dump_file(documents, output_path, verbose=True)
        print(f" - {len(documents)} blocks for paper: {paper['title']}")
        parsed_count += 1
        # Fixed pause between consecutive parse requests.
        time.sleep(3)
    

## Test Util Fxns

In [None]:
# Reference paper used as the fixture for the manual test cells in this section
title = "Gradient Matching for Domain Generalization"
first_author = "Yuge Shi"
last_author = "Gabriel Synnaeve"
arxiv_id = "2104.09937"
pdf_url = "https://arxiv.org/pdf/2104.09937"

In [None]:
# Query arXiv for the fixture's last author, requesting max_results=22
gab_metadata = query_dump_download_by_author(last_author, 22)

In [None]:
# Inspect the last element of the result
gab_metadata[-1]

### Query Single Paper, Extract Data

In [None]:
# NOTE(review): make_arxiv_api_query_by_ids is not defined or imported anywhere in
# this notebook, so this cell raises NameError on a fresh kernel. Presumably the
# helper moved into bytom.arxiv_utils — confirm and update the call.
q1 = make_arxiv_api_query_by_ids([arxiv_id])
q1

In [None]:
# NOTE(review): neither feedparser nor paper_feed_to_structured_info is defined or
# imported in this notebook, so this cell raises NameError on a fresh kernel — confirm
# the replacement in bytom.arxiv_utils.
paper_feed_to_structured_info(feedparser.parse(q1))

### Query Papers by Author

In [None]:
# NOTE(review): make_arxiv_api_query_by_author is not defined or imported in this
# notebook (NameError on a fresh kernel). query_dump_download_by_author builds its
# query with xu.build_author_query instead — presumably the replacement; confirm.
q2 = make_arxiv_api_query_by_author(last_author, max_results=1000)
q2

In [None]:
# NOTE(review): feedparser is never imported in this notebook (NameError on a fresh
# kernel). query_dump_download_by_author fetches results with xu.query_api instead —
# presumably the replacement; confirm.
p2s = feedparser.parse(q2)
len(p2s['entries'])

In [None]:
# NOTE(review): paper_feed_to_structured_info is not defined or imported in this
# notebook (NameError on a fresh kernel). query_dump_download_by_author uses
# xu.parse_paper_entry per entry instead — presumably the replacement; confirm.
parsed_p2s = [paper_feed_to_structured_info(pent) for pent in p2s['entries']]

In [None]:
# Inspect the last parsed entry
parsed_p2s[-1]

In [None]:
# Keep only the entries whose "published" timestamp is within the last 2 (365-day) years
within_2_yrs = filter_dicts_by_years(parsed_p2s, 2, date_key="published")

In [None]:
# Inspect the last entry that survived the date filter
within_2_yrs[-1]

### Download PDF and Parse with LlamaParse

In [None]:
# Download the fixture paper into the raw-PDF directory.
# RAW_PDF_DIR is the notebook's path constant; the previously used name
# `raw_pdf_path` is not defined anywhere in this notebook.
pdf_name = pdf_url.split('/')[-1]
full_pdf_path = f'{RAW_PDF_DIR}{pdf_name}.pdf'
download_pdf(pdf_url, full_pdf_path)

In [None]:
# nest_asyncio.apply() has to run before LlamaParse is used in this notebook.
# NOTE(review): presumably because parser.load_data drives an asyncio loop while the
# notebook's own loop is already running — confirm.
import nest_asyncio

nest_asyncio.apply()

In [None]:
# LlamaParse client configured with result_type="markdown"
parser = LlamaParse(result_type="markdown")

In [None]:
# Parse the downloaded fixture PDF
docs = parser.load_data(full_pdf_path)

In [None]:
# Show the first 1000 characters of the parsed document at index 1
print(docs[1].text[:1000])

In [None]:
# Persist the parsed documents next to the other parsed PDFs.
# PARSED_PDF_DIR is the notebook's path constant; the previously used name
# `parsed_pdf_path` is not defined at module level in this notebook.
full_parsed_pdf_path = f'{PARSED_PDF_DIR}{pdf_name}.pkl'
fu.dump_file(docs, full_parsed_pdf_path)

In [None]:
# Read the dumped documents back to check the round trip
docs_load = fu.load_file(full_parsed_pdf_path)

In [None]:
# Same preview as before the dump, now taken from the reloaded documents
print(docs_load[1].text[:1000])

## Query Arxiv API and Download PDF

#### Pavel Izmailov: 25 (2, 5, 9, 13, 17)

In [None]:
# Query arXiv for this author with max_results=100; the parsed entries are the cell output
query_dump_download_by_author('Pavel Izmailov', 100)

#### Mengye Ren: 47 (10, 12, 14, 26, 36)

In [None]:
# Query arXiv for this author with max_results=50; the parsed entries are the cell output
query_dump_download_by_author('Mengye Ren', 50)

#### Eunsol Choi: 64 (13, 32, 39, 50, 54)

In [None]:
# Query arXiv for this author with max_results=70; the parsed entries are the cell output
query_dump_download_by_author('Eunsol Choi', 70)

#### Tal Linzen: 64 Papers (11, 19, 26, 35, 44)

In [None]:
# Query arXiv for this author with max_results=1000; the parsed entries are the cell output
query_dump_download_by_author('Tal Linzen', 1000)

#### He He: 67 (13, 27, 36, 40, 44)

In [None]:
# Query arXiv for this author with max_results=70; the parsed entries are the cell output
query_dump_download_by_author('He He', 70)

#### Lerrel Pinto: 68 (14, 23, 34, 43, 53)

In [None]:
# Query arXiv for this author with max_results=70; the parsed entries are the cell output
query_dump_download_by_author('Lerrel Pinto', 70)

#### Rajesh Ranganath: 70 (8, 18, 28, 35, 40)

In [None]:
# Query arXiv for this author with max_results=80; the parsed entries are the cell output
query_dump_download_by_author('Rajesh Ranganath', 80)

#### Kyunghyun Cho: 272 (30, 64, 85, 106, 134)

In [None]:
# Query arXiv for this author with max_results=1000; the parsed entries are the cell output
query_dump_download_by_author('Kyunghyun Cho', 1000)

## Test Llama Parse Per-Author on Most Recent Papers

In [None]:
# load_author_select_recent_llama_parse requires nest_asyncio.apply() to have run first
# (this repeats an earlier cell so the section can be run on its own).
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Upper bound on newly parsed papers per load_author_select_recent_llama_parse call
MAX_PARSE=1

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Tal Linzen", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Pavel Izmailov", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Lerrel Pinto", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Kyunghyun Cho", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Mengye Ren", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Rajesh Ranganath", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("Eunsol Choi", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Parse at most MAX_PARSE not-yet-parsed papers from the last 5 years for this author
load_author_select_recent_llama_parse("He He", num_years=5, max_parse=MAX_PARSE)

In [None]:
# Spot-check one parsed paper by loading its dump and printing the document at index 1.
# NOTE(review): fu.load_file on a .pkl presumably unpickles the file — only load
# dumps this notebook produced itself; confirm.
pdfn = '2409.04556v1'
read_doc = fu.load_file(f'{PARSED_PDF_DIR}{pdfn}.pkl')
print(read_doc[1].text)

## Run Llama Parse on All Authors top 50 papers in last 5 years

In [None]:
# Round-robin over the authors: every pass asks for at most MAX_PARSE newly
# parsed papers per author, and 50 passes are run in total, so the work is
# interleaved across authors instead of finishing one author at a time.
MAX_PARSE = 1
AUTHORS_TO_PARSE = (
    "Tal Linzen",
    "Pavel Izmailov",
    "Lerrel Pinto",
    "Kyunghyun Cho",
    "Mengye Ren",
    "Rajesh Ranganath",
    "Eunsol Choi",
    "He He",
)
for r in range(50):
    for author in AUTHORS_TO_PARSE:
        load_author_select_recent_llama_parse(author, max_parse=MAX_PARSE, num_years=5)
    
    