# Query based on author information 

For an author-based query, the arXiv API needs:

- full_name: the author's full name. The expected format is 'first name' + 'middle name' + 'surname', separated by spaces. The middle name is optional, and a surname on its own is also accepted.

- category (the API's `cat` field): one of (astro-ph, cond-mat, gr-qc, hep-ex, hep-lat, hep-th, hep-ph, math-ph, nlin, nucl-ex, nucl-th, physics, quant-ph, math, cs (CoRR), q-bio, q-fin, stat, eess, econ). See https://arxiv.org/category_taxonomy for details.

In [None]:
import json
import os
import urllib
import urllib.parse
import urllib.request

import feedparser

# import pandas as pd

In [None]:
# Directory that receives the output files: an 'Output' folder under the current working directory
path = f'{os.getcwd()}/Output/'

In [None]:
# Query parameters
full_name = 'Emanuel Malek'
category = 'hep-th'

base_url = 'http://export.arxiv.org/api/query?'
max_results = 1000


# Name parsing
# The full name is split on whitespace and must yield 1 to 3 parts:
#   1 part  -> surname only
#   2 parts -> first name + surname
#   3 parts -> first name + middle name + surname
name_parts = full_name.split()

if not 1 <= len(name_parts) <= 3:
    raise UserWarning('Name not in expected format: \'first name\' + \'middle name\' + \'surname\'')

# The surname is kept in full; every given name is reduced to its initial followed by a dot.
surname = name_parts[-1]
given_initials = [part[0] + '.' for part in name_parts[:-1]]

# name / middle are None when the corresponding part is absent from full_name.
name = given_initials[0] if given_initials else None
middle = given_initials[1] if len(given_initials) > 1 else None

# Parsed form of the query author (initials followed by the surname), used later to filter the results.
name_list = given_initials + [surname]

# Percent-encode each name part so that characters outside the URL-safe set
# (accents, apostrophes, ...) cannot corrupt the request; plain ASCII names are unchanged.
author_terms = '+'.join(urllib.parse.quote(part, safe='') for part in name_list)
search_query = f'au:{author_terms}+AND+cat:{category}&sortBy=submittedDate&sortOrder=descending'



# Query
# Build the query string from the search expression and the maximum number of results.
# int() keeps the integer rendering that the previous '%i' formatting produced.
query = f'search_query={search_query}&max_results={int(max_results)}'

# Fetch the feed. The context manager closes the HTTP response as soon as its body
# has been read, instead of leaving the connection open.
with urllib.request.urlopen(base_url + query) as data:
    feed = feedparser.parse(data.read().decode('utf-8'))

In [None]:
# entries = pd.DataFrame(feed.entries)[['id','published','authors','title','summary']]
# entries

In [None]:
# The following might be streamlined using the re package
def initials(name):
    """Return the lower-cased name with every part but the last reduced to an initial.

    The name is lower-cased and split on whitespace. Each part except the last
    becomes its first character followed by a dot; the last part is kept whole.

    Parameters
    ----------
    name : str
        A name whose parts are separated by whitespace.

    Returns
    -------
    list of str
        For example ``['e.', 'malek']`` for ``'Emanuel Malek'``, ``['malek']``
        for ``'Malek'``, and ``[]`` when the name has no parts at all.
    """
    parts = name.lower().split()
    if not parts:
        # Empty or whitespace-only input: there is no last part to keep.
        return []
    *given, last = parts
    return [part[0] + '.' for part in given] + [last]
    
def author_in_list(authors):
    """Tell whether the query author (module-level ``name_list``) is among ``authors``.

    Each author name is converted with ``initials`` and compared, ignoring the
    case of ``name_list``, as follows:

    - same number of parts as ``name_list``: every part must be equal;
    - different number of parts and ``name_list`` has more than one part:
      the first and the last parts must be equal;
    - ``name_list`` has a single part: only the last parts must be equal.

    Returns True on the first matching author, False if none matches.
    """
    query = [part.lower() for part in name_list]

    for aut in authors:
        candidate = initials(aut)

        if len(candidate) == len(query):
            matched = query == candidate
        elif len(query) > 1:
            matched = query[-1] == candidate[-1] and query[0] == candidate[0]
        else:
            matched = query[-1] == candidate[-1]

        if matched:
            return True

    return False

In [None]:
output = []
filename = f'{name}_{surname}_with_abstracts.txt'


# Select and record relevant information for each entry, if the query author is among the authors of the entry
for entry in feed.entries:
    entry_info = {
        # Keep only what follows '/abs/' in the entry id
        'id': entry.id.split('/abs/')[-1],
        'published': entry.published,
        'authors': [aut['name'] for aut in entry.authors],
        'title': entry.title,
        'summary': entry.summary,
    }

    # Filter: skip entries whose author list does not contain the query author
    if author_in_list(entry_info['authors']):
        output.append(entry_info)

# Create the output directory if it is missing, so that open() does not fail with FileNotFoundError
os.makedirs(path, exist_ok=True)

# The selected entries are written as a single JSON list
with open(path + filename, 'w', encoding='utf-8') as file:
    json.dump(output, file)

In [None]:
# output