In [2]:
import arxiv
import os
import tarfile
import preprocess_paper as preprocessor
import time

# Shared arXiv API client, constructed with the library's default settings.
# search_cat() below reads this module-level name to iterate over results.
client = arxiv.Client()

def search_cat(cat, tmpdir, dirpath, required_papers=10):
    """Collect single-file LaTeX papers from one arXiv category.

    Iterates over the category's search results (sorted by submission date),
    downloads each paper's source archive into ``tmpdir``, and keeps the
    paper only if the archive contains exactly one regular ``.tex`` file and
    ``preprocessor.preprocess`` returns a truthy value for it.

    Args:
        cat: arXiv category identifier such as ``"math.AG"``. The part after
            the last dot is used as the output filename prefix.
        tmpdir: Existing directory used for the temporary source archives.
            Each archive is deleted again after it has been inspected.
        dirpath: Existing directory that receives the kept
            ``<subcat>_paper_<i>.tex`` files.
        required_papers: Stop as soon as this many papers have been kept.
            A value of zero or less returns immediately without downloading.

    Returns:
        The number of papers that were extracted and preprocessed
        successfully (at most ``required_papers``).
    """
    successful_downloads = 0
    if required_papers <= 0:
        # Nothing to fetch; the original equality check could never stop the
        # loop once a paper succeeded with a non-positive quota.
        return successful_downloads

    search = arxiv.Search(
        query=f"cat:{cat}",
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    file_prefix = cat.split(".")[-1]

    for i, r in enumerate(client.results(search)):
        filename = f"paper_{i}.tar.gz"
        # Pause between downloads to stay polite towards the arXiv servers.
        time.sleep(2)
        try:
            filepath = r.download_source(dirpath=tmpdir, filename=filename)
        except OSError as e:
            # Network/HTTP failures surface as OSError subclasses; one bad
            # download should not abort the whole collection run.
            print(f"Error downloading source for paper {i}: {e}")
            continue

        successful_extract = False
        tex_path = os.path.join(dirpath, f'{file_prefix}_paper_{i}.tex')
        try:
            with tarfile.open(filepath, 'r:gz') as tar:
                tex_members = [
                    m for m in tar.getmembers()
                    if m.isfile() and m.name.endswith('.tex')
                ]
                if len(tex_members) == 1:
                    member = tex_members[0]
                    print(f"Extracting {member.name}")
                    # Read the member's bytes and write them straight to the
                    # target path. Unlike tar.extract() this never creates
                    # files at a path chosen by the archive (no traversal via
                    # "../" or absolute names) and leaves no stray
                    # sub-directories behind for nested .tex files.
                    data = tar.extractfile(member).read()
                    with open(tex_path, 'wb') as out:
                        out.write(data)
                    successful_extract = True
                else:
                    print(f"Paper {i} has {len(tex_members)} tex files. Skipping.")
        except Exception as e:
            # Includes sources that are not gzip-compressed tarballs at all.
            print(f"Error extracting {filepath}: {e}")
        finally:
            # The archive is only a temporary artifact; always clean it up.
            if os.path.exists(filepath):
                os.remove(filepath)

        if not successful_extract:
            continue

        try:
            successful_pp = preprocessor.preprocess(tex_path)
        except Exception as e:
            print(f"Preprocessing {tex_path} raised: {e}")
            successful_pp = False

        if successful_pp:
            successful_downloads += 1
        else:
            print(f"Error preprocessing {tex_path}. Skipping.")
            os.remove(tex_path)

        if successful_downloads >= required_papers:
            break

    return successful_downloads


# arXiv categories to sample from: display name of the field -> list of fully
# qualified subcategory identifiers ("<archive>.<code>"), in search order.
# NOTE(review): the Mathematics row does not list AC or SG -- confirm whether
# that omission is intentional.
cats = {
    field: [f'{archive}.{code}' for code in codes]
    for field, archive, codes in (
        ('Computer Science', 'cs', (
            'AI', 'AR', 'CC', 'CE', 'CG', 'CL', 'CR', 'CV', 'CY', 'DB',
            'DC', 'DL', 'DM', 'DS', 'ET', 'FL', 'GL', 'GR', 'GT', 'HC',
            'IR', 'IT', 'LG', 'LO', 'MA', 'MM', 'MS', 'NA', 'NE', 'NI',
            'OH', 'OS', 'PF', 'PL', 'RO', 'SC', 'SD', 'SE', 'SI', 'SY',
        )),
        ('Mathematics', 'math', (
            'AG', 'AP', 'AT', 'CA', 'CO', 'CT', 'CV', 'DG', 'DS', 'FA',
            'GM', 'GN', 'GR', 'GT', 'HO', 'IT', 'KT', 'LO', 'MG', 'MP',
            'NA', 'NT', 'OA', 'OC', 'PR', 'QA', 'RT', 'RA', 'SP', 'ST',
        )),
        ('Quantitative Finance', 'q-fin', (
            'CP', 'EC', 'GN', 'MF', 'PM', 'PR', 'RM', 'ST', 'TR',
        )),
        ('Statistics', 'stat', (
            'AP', 'CO', 'ME', 'ML', 'OT', 'TH',
        )),
        ('Economics', 'econ', ('EM', 'GN', 'TH')),
    )
}
# Scratch directory for the downloaded source archives.
tmpdir = './tarfiles'

# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(tmpdir, exist_ok=True)

for cat, subcats in cats.items():
    dirpath = f'./selected_papers/{cat}'
    # Quota of papers still to collect for this top-level category.
    required_papers = 10

    os.makedirs(dirpath, exist_ok=True)
    print(f"Searching for papers in category {cat}")
    for subcat in subcats:
        print(f"Searching for papers in subcategory {subcat}")
        # search_cat returns how many papers it kept; move on to the next
        # subcategory only while the quota has not been filled.
        required_papers -= search_cat(subcat, tmpdir, dirpath, required_papers=required_papers)
        if required_papers <= 0:
            break
    print("--------------------------------------------------------------")
    # Pause between categories before issuing the next batch of requests.
    time.sleep(5)

# Remove only the scratch directory itself; os.removedirs would additionally
# try to prune any empty parent directories named in the path.
os.rmdir(tmpdir)


Searching for papers in category Mathematics
Searching for papers in subcategory math.AG
Extracting main.tex
Preprocessing AG_paper_0.tex: 
Preprocessed text is too long: 56085 tokens
Error preprocessing ./selected_papers/Mathematics/AG_paper_0.tex. Skipping.
Error extracting ./tarfiles/paper_1.tar.gz: invalid header
Error extracting ./tarfiles/paper_2.tar.gz: invalid header
Extracting main.tex
Preprocessing AG_paper_3.tex: 
Extracting Coiso-IT-arxiv.tex
Preprocessing AG_paper_4.tex: 
Error preprocessing ./selected_papers/Mathematics/AG_paper_4.tex. Skipping.
Extracting main.tex
Preprocessing AG_paper_5.tex: 
Preprocessed text is too long: 27596 tokens
Error preprocessing ./selected_papers/Mathematics/AG_paper_5.tex. Skipping.
Extracting main.tex
Preprocessing AG_paper_6.tex: 
Error extracting ./tarfiles/paper_7.tar.gz: invalid header
Paper 8 has 34 tex files. Skipping.
Paper 9 has 38 tex files. Skipping.
Paper 10 has 19 tex files. Skipping.
Paper 11 has 60 tex files. Skipping.
Extract