In [24]:
import os


id_file = "cache/arxiv/arxiv_ids.txt"


def _strip_version(raw_id):
    """Return ``raw_id`` without a trailing ``v<digits>`` version suffix.

    Only a suffix made of "v" followed by digits is removed, so an identifier
    that contains a "v" elsewhere (for example in an archive prefix such as
    ``solv-int/9901001``) is returned intact. Identifiers without a version
    suffix are returned unchanged.
    """
    base, sep, version = raw_id.rpartition("v")
    return base if sep and version.isdigit() else raw_id


# Read one identifier per line, drop the version suffix and de-duplicate.
unique_ids = set()
with open(id_file, "r") as f:
    for line in f:
        arxiv_id = _strip_version(line.strip())
        # Blank lines yield an empty string, which must not become a query.
        if arxiv_id:
            unique_ids.add(arxiv_id)

# Sorted list so the download order is stable across runs.
ids = sorted(unique_ids)
ids[:5]

['0803.0476', '0912.3995', '1103.0398', '1112.6209', '1206.5538']

In [2]:
from langchain_community.retrievers import ArxivRetriever

# Retriever for arXiv queries, capped at two documents per query.
# `get_full_documents=True` requests the full paper text instead of only the
# summary; the keyword must be spelled exactly like this to take effect.
retriever = ArxivRetriever(
    load_max_docs=2,
    get_full_documents=True,
)

In [3]:
# Query the retriever with a single versioned arXiv identifier and keep the
# result in `book1` for inspection in later cells.
book1 = retriever.invoke("0912.3995v4")

In [7]:
from langchain_community.document_loaders import ArxivLoader

# Build a loader for the free-text query "reasoning", limited to two documents.
# Supports all arguments of `ArxivAPIWrapper`
# Only the loader object is created here; this cell does not call `load()`.
loader = ArxivLoader(
    query="reasoning",
    load_max_docs=2,
)

'Many applications require optimizing an unknown, noisy function that is\nexpensive to evaluate. We formalize this task as a multi-armed bandit problem,\nwhere the payoff function is either sampled from a Gaussian process (GP) or has\nlow RKHS norm. We resolve the important open problem of deriving regret bounds\nfor this setting, which imply novel convergence rates for GP optimization. We\nanalyze GP-UCB, an intuitive upper-confidence based algorithm, and bound its\ncumulative regret in terms of maximal information gain, establishing a novel\nconnection between GP optimization and experimental design. Moreover, by\nbounding the latter in terms of operator spectra, we obtain explicit sublinear\nregret bounds for many commonly used covariance functions. In some important\ncases, our bounds have surprisingly weak dependence on the dimensionality. In\nour experiments on real sensor data, GP-UCB compares favorably with other\nheuristical GP optimization approaches.'

In [42]:
folder_to_save = "cache/arxiv/papers"
# create folder if not exists
os.makedirs(folder_to_save, exist_ok=True)

# obtain content
from langchain_community.document_loaders import ArxivLoader
from langchain_core.load import dumpd, dumps, load, loads
from json import dump
import time

# Pauses (in seconds) that throttle the requests made by this loop.
REQUEST_DELAY_SECONDS = 8  # after every request that returned
ERROR_DELAY_SECONDS = 5    # after a connection error

# Download each paper in `ids` that is not cached yet and store it as
# "<id>.json" inside `folder_to_save`. Re-running the cell resumes where the
# previous run stopped because existing files are skipped.
for arxiv_id in ids:
    filename = f"{arxiv_id}.json"
    target_path = os.path.join(folder_to_save, filename)

    # skip if file already exists
    if os.path.exists(target_path):
        print(f"Skipping {arxiv_id} because it already exists")
        continue

    try:
        loader = ArxivLoader(
            query=arxiv_id,
            load_max_docs=1,
        )

        print()
        print("loading", arxiv_id)
        paper = loader.load()
    except ConnectionError as e:
        # NOTE(review): only the built-in ConnectionError is handled here;
        # confirm which exception types the loader raises on network failures.
        print("ConnectionError, skipping", arxiv_id, e)
        time.sleep(ERROR_DELAY_SECONDS)
        continue
    print("loaded", arxiv_id)
    print()

    if isinstance(paper, list):
        if len(paper) > 0:
            paper = paper[0]
        else:
            print("no paper for id", arxiv_id)
            # A request was still made, so wait before issuing the next one.
            time.sleep(REQUEST_DELAY_SECONDS)
            continue

    # Serialize before opening the file: if serialization fails, no empty
    # file is left behind to be mistaken for a cached paper on the next run.
    serialized = dumps(paper)

    # `serialized` is already a string; `dump` writes it as one JSON string
    # value. This layout is kept so files saved by earlier runs stay readable
    # by the same code.
    with open(target_path, "w") as f:
        dump(serialized, f)
    print(f"Saved {arxiv_id}")

    # Wait between requests to avoid hammering the service.
    time.sleep(REQUEST_DELAY_SECONDS)

Skipping 0803.0476 because it already exists
Skipping 0912.3995 because it already exists
Skipping 1103.0398 because it already exists
Skipping 1112.6209 because it already exists
Skipping 1206.5538 because it already exists
Skipping 1207.0580 because it already exists
Skipping 1301.3781 because it already exists
Skipping 1302.4389 because it already exists
Skipping 1303.5778 because it already exists
Skipping 1308.0850 because it already exists
Skipping 1310.1531 because it already exists
Skipping 1311.2901 because it already exists
Skipping 1312.4400 because it already exists
Skipping 1312.5602 because it already exists
Skipping 1312.6114 because it already exists
Skipping 1312.6199 because it already exists
Skipping 1312.6229 because it already exists
Skipping 1404.2188 because it already exists
Skipping 1404.7828 because it already exists
Skipping 1405.3531 because it already exists
Skipping 1405.4053 because it already exists
Skipping 1406.1078 because it already exists
Skipping 1

{'Published': '2008-07-25',
 'Title': 'Fast unfolding of communities in large networks',
 'Authors': 'Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Etienne Lefebvre',
 'Summary': 'We propose a simple method to extract the community structure of large\nnetworks. Our method is a heuristic method that is based on modularity\noptimization. It is shown to outperform all other known community detection\nmethod in terms of computation time. Moreover, the quality of the communities\ndetected is very good, as measured by the so-called modularity. This is shown\nfirst by identifying language communities in a Belgian mobile phone network of\n2.6 million customers and by analyzing a web graph of 118 million nodes and\nmore than one billion links. The accuracy of our algorithm is also verified on\nad-hoc modular networks. .'}