In [8]:
import arxiv
import pandas as pd
from tqdm.notebook import tqdm
import os
import random
import time


In [61]:
def tag_to_major(tag: str) -> str | None:
    if '.' not in tag:
        return None
    major = tag.split('.')[0]

    if major in ('cs', 'math', 'physics', 'q-bio', 'q-fin', 'stat', 'econ', 'eess'):
        return major

    if major in ('astro-ph', 'nlin', 'cond-mat'):
        return 'physics'

    return None

def random_period(year_range):
    year = random.randint(year_range[0], year_range[1] - 1)
    month = random.randint(1, 12)
    next_month = month + 1 if month < 12 else 1
    next_year = year if month < 12 else year + 1

    start_date = f"{year}{month:02d}01"
    end_date = f"{next_year}{next_month:02d}01"

    return start_date, end_date

In [66]:
MAJORS = ('cs', 'math', 'physics', 'q-bio', 'q-fin', 'stat', 'econ', 'eess')
TOTAL = 50_000
PAPERS_PER_MAJOR = TOTAL / len(MAJORS)
YEAR_RANGE = (2017, 2024)

all_papers = list()

for major in tqdm(MAJORS):
    papers_collected = 0
    while papers_collected < PAPERS_PER_MAJOR:
        print(f'{papers_collected} / {PAPERS_PER_MAJOR}')

        start_date, end_date = random_period(YEAR_RANGE)
        search_query = f"cat:{major}* AND submittedDate:[{start_date} TO {end_date}]"

        client = arxiv.Client(page_size=100, delay_seconds=3)
        search = arxiv.Search(query=search_query, max_results=100)

        try:
            new_papers = list()
            for result in client.results(search):
                majors = set()
                for tag in result.categories:
                    major_tag = tag_to_major(tag)
                    if major_tag is not None:
                        majors.add(major_tag)

                if len(majors) == 0:
                    continue

                paper = {
                    'id': result.entry_id.split('/')[-1],
                    'title': result.title,
                    'abstract': result.summary,
                    'majors': list(majors)
                }
                new_papers.append(paper)

            all_papers.extend(new_papers)
            papers_collected += len(new_papers)

            if papers_collected < PAPERS_PER_MAJOR:
                time.sleep(1)

        except Exception as e:
            print(f"error on {major}\n\n{e}")
            time.sleep(5)

  0%|          | 0/8 [00:00<?, ?it/s]

0 / 6250.0
100 / 6250.0
200 / 6250.0
300 / 6250.0
400 / 6250.0
500 / 6250.0
600 / 6250.0
700 / 6250.0
800 / 6250.0
900 / 6250.0
1000 / 6250.0
1100 / 6250.0
1200 / 6250.0
1300 / 6250.0
1400 / 6250.0
1500 / 6250.0
1600 / 6250.0
1700 / 6250.0
1800 / 6250.0
1900 / 6250.0
2000 / 6250.0
2100 / 6250.0
2200 / 6250.0
2300 / 6250.0
2400 / 6250.0
2500 / 6250.0
2600 / 6250.0
2700 / 6250.0
2800 / 6250.0
2900 / 6250.0
3000 / 6250.0
3100 / 6250.0
3200 / 6250.0
3300 / 6250.0
3400 / 6250.0
3500 / 6250.0
3600 / 6250.0
3700 / 6250.0
3800 / 6250.0
3900 / 6250.0
4000 / 6250.0
4100 / 6250.0
4200 / 6250.0
4300 / 6250.0
4400 / 6250.0
4500 / 6250.0
4600 / 6250.0
4700 / 6250.0
4800 / 6250.0
4900 / 6250.0
5000 / 6250.0
5100 / 6250.0
5200 / 6250.0
5300 / 6250.0
5400 / 6250.0
5500 / 6250.0
5600 / 6250.0
5700 / 6250.0
5800 / 6250.0
5900 / 6250.0
6000 / 6250.0
6100 / 6250.0
6200 / 6250.0
0 / 6250.0
100 / 6250.0
200 / 6250.0
300 / 6250.0
400 / 6250.0
500 / 6250.0
600 / 6250.0
700 / 6250.0
800 / 6250.0
900 / 6250.0
10

Bozo feed; consider handling: document declared as utf-8, but parsed as iso-8859-2


2400 / 6250.0
2500 / 6250.0
2600 / 6250.0
2700 / 6250.0
2800 / 6250.0
2900 / 6250.0
3000 / 6250.0
3100 / 6250.0
3200 / 6250.0
3300 / 6250.0
3400 / 6250.0
3500 / 6250.0
3600 / 6250.0
3700 / 6250.0
3800 / 6250.0


Bozo feed; consider handling: document declared as utf-8, but parsed as iso-8859-2


3900 / 6250.0
4000 / 6250.0
4100 / 6250.0
4200 / 6250.0
4300 / 6250.0
4400 / 6250.0
4500 / 6250.0
4600 / 6250.0
4700 / 6250.0
4800 / 6250.0
4900 / 6250.0
5000 / 6250.0
5100 / 6250.0
5200 / 6250.0


Bozo feed; consider handling: document declared as utf-8, but parsed as iso-8859-2


5300 / 6250.0
5400 / 6250.0
5500 / 6250.0
5600 / 6250.0
5700 / 6250.0


Bozo feed; consider handling: document declared as utf-8, but parsed as iso-8859-2


5800 / 6250.0
5900 / 6250.0
6000 / 6250.0
6100 / 6250.0
6200 / 6250.0
0 / 6250.0
100 / 6250.0
200 / 6250.0
300 / 6250.0
400 / 6250.0
500 / 6250.0
600 / 6250.0
700 / 6250.0
800 / 6250.0
900 / 6250.0
1000 / 6250.0
1100 / 6250.0
1200 / 6250.0
1300 / 6250.0
1400 / 6250.0
1500 / 6250.0
1600 / 6250.0
1700 / 6250.0
1800 / 6250.0
1900 / 6250.0
2000 / 6250.0
2100 / 6250.0
2200 / 6250.0
2300 / 6250.0
2400 / 6250.0
2500 / 6250.0
2600 / 6250.0
2700 / 6250.0
2800 / 6250.0
2900 / 6250.0
3000 / 6250.0
3100 / 6250.0
3200 / 6250.0
3300 / 6250.0
3400 / 6250.0
3500 / 6250.0
3600 / 6250.0
3700 / 6250.0
3800 / 6250.0
3900 / 6250.0
4000 / 6250.0
4100 / 6250.0
4200 / 6250.0
4300 / 6250.0
4400 / 6250.0
4500 / 6250.0
4600 / 6250.0
4700 / 6250.0
4800 / 6250.0
4900 / 6250.0
5000 / 6250.0
5100 / 6250.0
5200 / 6250.0
5300 / 6250.0
5400 / 6250.0
5500 / 6250.0
5600 / 6250.0
5700 / 6250.0
5800 / 6250.0
5900 / 6250.0
6000 / 6250.0
6100 / 6250.0
6200 / 6250.0
0 / 6250.0
100 / 6250.0
200 / 6250.0
300 / 6250.0
400 / 6250

Bozo feed; consider handling: document declared as utf-8, but parsed as iso-8859-2


834 / 6250.0
907 / 6250.0
992 / 6250.0
1092 / 6250.0
1192 / 6250.0
1280 / 6250.0
1361 / 6250.0
1461 / 6250.0
1530 / 6250.0
1630 / 6250.0
1706 / 6250.0
1806 / 6250.0
1882 / 6250.0
1982 / 6250.0
2082 / 6250.0
2182 / 6250.0
2282 / 6250.0
2346 / 6250.0
2446 / 6250.0
2546 / 6250.0
2646 / 6250.0
2746 / 6250.0
2846 / 6250.0
2946 / 6250.0
3019 / 6250.0
3107 / 6250.0
3207 / 6250.0
3307 / 6250.0
3407 / 6250.0
3507 / 6250.0
3607 / 6250.0
3695 / 6250.0
3795 / 6250.0
3864 / 6250.0
3964 / 6250.0
4064 / 6250.0
4164 / 6250.0
4264 / 6250.0
4364 / 6250.0
4464 / 6250.0
4564 / 6250.0
4653 / 6250.0
4753 / 6250.0
4853 / 6250.0
4953 / 6250.0
5046 / 6250.0
5146 / 6250.0
5246 / 6250.0
5346 / 6250.0
5446 / 6250.0
5546 / 6250.0
5646 / 6250.0
5727 / 6250.0
5827 / 6250.0
5891 / 6250.0
5977 / 6250.0
6077 / 6250.0
6177 / 6250.0
0 / 6250.0
100 / 6250.0
200 / 6250.0
300 / 6250.0
400 / 6250.0
500 / 6250.0
600 / 6250.0
700 / 6250.0
800 / 6250.0
900 / 6250.0
1000 / 6250.0
1100 / 6250.0
1200 / 6250.0
1300 / 6250.0
1400 / 

Bozo feed; consider handling: document declared as utf-8, but parsed as windows-1252


2877 / 6250.0
2977 / 6250.0
3077 / 6250.0
3177 / 6250.0
3277 / 6250.0
3377 / 6250.0
3477 / 6250.0
3577 / 6250.0
3677 / 6250.0
3761 / 6250.0
3784 / 6250.0
3884 / 6250.0
3984 / 6250.0
4010 / 6250.0
4110 / 6250.0
4190 / 6250.0
4283 / 6250.0
4288 / 6250.0
4388 / 6250.0
4488 / 6250.0
4588 / 6250.0
4688 / 6250.0
4788 / 6250.0
4837 / 6250.0
4937 / 6250.0
4941 / 6250.0
5041 / 6250.0
5141 / 6250.0
5241 / 6250.0
5301 / 6250.0
5350 / 6250.0
5450 / 6250.0
5490 / 6250.0
5530 / 6250.0
5610 / 6250.0
5633 / 6250.0
5733 / 6250.0
5833 / 6250.0
5933 / 6250.0
6033 / 6250.0
6038 / 6250.0
6104 / 6250.0
6204 / 6250.0
0 / 6250.0
100 / 6250.0
200 / 6250.0
300 / 6250.0
400 / 6250.0
500 / 6250.0
600 / 6250.0
700 / 6250.0
713 / 6250.0
813 / 6250.0
913 / 6250.0
1013 / 6250.0
1113 / 6250.0
1213 / 6250.0
1313 / 6250.0
1317 / 6250.0
1417 / 6250.0
1517 / 6250.0
1617 / 6250.0
1717 / 6250.0
1817 / 6250.0
1917 / 6250.0
2017 / 6250.0
2117 / 6250.0
2217 / 6250.0
2317 / 6250.0
2417 / 6250.0
2517 / 6250.0
2617 / 6250.0
2717 

In [67]:
path_to_save = 'better_data.csv'

df = pd.DataFrame(all_papers)
df.to_csv(path_to_save, index=False)

In [68]:
pd.read_csv(path_to_save)

Unnamed: 0,id,title,abstract,majors
0,2305.07271v1,Complexity of conjunctive regular path query h...,A graph database is a digraph whose arcs are l...,['cs']
1,2306.00120v1,VMap: An Interactive Rectangular Space-filling...,"We present VMap, a map-like rectangular space-...",['cs']
2,2305.09538v3,A LOCAL View of the Polynomial Hierarchy,We extend classical methods of computational c...,['cs']
3,2305.02244v1,NVMM cache design: Logging vs. Paging,Modern NVMM is closing the gap between DRAM an...,['cs']
4,2305.07147v1,COLA: Characterizing and Optimizing the Tail L...,Autonomous vehicles (AVs) are envisioned to re...,['cs']
...,...,...,...,...
50326,1805.08867v2,Superconducting receiver arrays for magnetic r...,Superconducting QUantum-Interference Devices (...,"['eess', 'physics']"
50327,1805.09366v2,Semi-supervised classification by reaching con...,Deep learning has demonstrated abilities to le...,"['eess', 'stat', 'cs']"
50328,1805.00237v3,Randomly weighted CNNs for (music) audio class...,The computer vision literature shows that rand...,"['eess', 'cs']"
50329,1805.01198v1,Deep Denoising for Hearing Aid Applications,Reduction of unwanted environmental noises is ...,"['eess', 'cs']"
