# Query based on category and time-interval 

For a category + time-interval based query, the arXiv API needs the following parameters:

- cat: category, one of (astro-ph, cond-mat, gr-qc, hep-ex, hep-lat, hep-th, hep-ph, math-ph, nlin, nucl-ex, nucl-th, physics, quant-ph, math, cs (CoRR), q-bio, q-fin, stat, eess, econ). See https://arxiv.org/category_taxonomy for details.

- start: Start date in YYYY-MM-DD format

- end: End date in YYYY-MM-DD format

In [None]:
%conda install feedparser

In [None]:
import urllib, urllib.request
import feedparser
import os
import json

In [None]:
# Output files
# Directory (with trailing slash) that receives the result file; the full
# target is later formed by plain string concatenation `path + filename`.
path = os.getcwd()+'/Output/'
# Create the directory up front: opening a file for writing inside a
# non-existent directory raises FileNotFoundError.
os.makedirs(path, exist_ok=True)

In [None]:
# Query parameters
base_url = 'http://export.arxiv.org/api/query?'

cat = 'hep-th'
start = '2024-09-12'
end = '2024-09-25'

# Drop the date separators ('-', '+', '/') in a single pass so that both
# dates become plain YYYYMMDD digit strings.
_date_separators = str.maketrans('', '', '-+/')
start = start.translate(_date_separators)
end = end.translate(_date_separators)

# Number of results requested from the API in one call.
# NOTE(review): the original comment called 10 000 the API maximum per
# request -- confirm against the arXiv API user manual.
results_per_iteration = 10000

# Equal-length YYYYMMDD strings order chronologically under plain string
# comparison, so no date parsing is needed for the sanity check.
if end < start:
    raise UserWarning('Start date needs to precede end date')


# Query
# Select one category and a submission-date window; the date bounds are
# extended with 0000 / 2359 so the window covers the whole first and last day.
search_query = f'cat:{cat}+AND+submittedDate:[{start}0000+TO+{end}2359]'
query = f'search_query={search_query}&start=0&max_results={results_per_iteration}'

# Pass the raw response bytes to feedparser instead of decoding them by hand:
# a forced .decode('utf-8') raises UnicodeDecodeError as soon as the payload
# contains a byte that is not valid UTF-8 (e.g. 0xc9), whereas feedparser
# performs its own character-encoding detection on byte input.
# The `with` block makes sure the HTTP response is closed after reading.
with urllib.request.urlopen(base_url + query) as data:
    feed = feedparser.parse(data.read())

# The file name records the category and the (separator-free) date window.
filename = f'{cat}_period_{start}_{end}.txt'

# Collect the fields of interest from every feed entry. The id keeps only
# the part after the last '/abs/' (or the whole id when '/abs/' is absent).
output = [
    {
        'id': item.id.rpartition('/abs/')[2],
        'published': item.published,
        'authors': item.authors,
        'title': item.title,
        'summary': item.summary,
    }
    for item in feed.entries
]

# Store all records as a single JSON array.
with open(path + filename, 'w') as file:
    json.dump(output, file)
    