In [1]:
import arxiv
import os
import tarfile

# Construct the default API client.
client = arxiv.Client()

# Search for the 10 most recent articles matching the keyword "quantum."
cat = 'Computer Science'
tmpdir = './tarfiles'
dirpath = f'./selected_papers/{cat}'

if not os.path.exists(dirpath):
  os.makedirs(dirpath)

if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)


search = arxiv.Search(
  query = f"cat={cat}",
  sort_by = arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)

successful_downloads = 0

for i,r in enumerate(client.results(search)):
    filename = f"paper_{i}.tar.gz"
    filepath = r.download_source(dirpath=tmpdir, filename=filename)
    try:
        with tarfile.open(filepath, 'r:gz') as tar:
            files = tar.getmembers()
            tex_files = [f.name for f in files if f.name.endswith('.tex')]
            if len(tex_files) == 1:
                tex_file = tex_files[0]
                print(f"Extracting {tex_file}")
                tar.extract(tex_file, path=dirpath)
                os.rename(os.path.join(dirpath, tex_file), os.path.join(dirpath, f'paper_{i}.tex'))
                successful_downloads += 1
            else:
                print(f"Paper {i} has {len(tex_files)} tex files. Skipping.")
            tar.close()
    except Exception as e:
        print(f"Error extracting {filepath}: {e}")
    os.remove(filepath)
    if successful_downloads == 10:
        break

os.removedirs(tmpdir)


Extracting main.tex
Error extracting ./tarfiles/paper_1.tar.gz: not a gzip file
Extracting main.tex
Extracting okuyama.tex
Error extracting ./tarfiles/paper_4.tar.gz: not a gzip file
Extracting main.tex
Paper 6 has 18 tex files. Skipping.
Extracting main.tex
Extracting EuOKerrv6.tex
Extracting main.tex
Extracting main.tex
Error extracting ./tarfiles/paper_11.tar.gz: not a gzip file
Extracting main.tex
Error extracting ./tarfiles/paper_13.tar.gz: not a gzip file
Extracting article.tex


In [19]:
import re
tex_files = [f for f in os.listdir(dirpath) if f.endswith('.tex')]
tex_file = tex_files[8]
print(f"Processing {tex_file}")
tex_file_path = os.path.join(dirpath, tex_file)

def remove_all_comments(text):
    text = '\n'.join([line for line in text.split('\n') if not line.strip().startswith('%')])
    return re.sub(r'%.*\n', '', text)

def remove_email(text):
    text = re.sub(r'\\email\{.*?\}', '', text)
    return re.sub(r'\\ead\{.*?\}', '', text)

def replace_affiliations(text):
    # TODO do not forget group [] in replacement
    text = re.sub(r'\\affiliation(?:\[\d+\])?\{[^}]*\}', r"\\affiliation{$$$_REPLACE_AFF_VAR_$$$}", text)
    return re.sub(r'\\affil(?:\[\d+\])?\{[^}]*\}', r"\\affil{$$$_REPLACE_AFF_VAR_$$$}", text)
    

with open(tex_file_path, 'r') as f:
    tex_source = f.read()
    tex_source = remove_all_comments(tex_source)
    tex_source = remove_email(tex_source)
    tex_source = replace_affiliations(tex_source)

with open(tex_file_path, 'w') as f:
    f.write(tex_source)



Processing paper_5.tex
