# Readme

This IPython notebook downloads all [arXiv metadata](https://arxiv.org/help/oa/index) using the [OAI standard](http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm) and the _pyoai_ Python package:  
`pip install pyoai`. The pyoai API is described on [GitHub](https://github.com/infrae/pyoai/blob/master/doc/API.html).

In [22]:
## import packages

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
# conda install -c anaconda beautiful-soup=4.3.2
# conda install lxml

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import datetime
import dateutil

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
# pip install pyoai

import json
import os

import requests

from tqdm import tqdm_notebook as tqdm

### set os path

In [23]:
# Make the local data folder the working directory; the metadata files
# written and read further down are addressed by bare relative file names.
# NOTE(review): absolute, machine-specific Windows path - adjust it before
# running on another machine (os.chdir raises if the folder does not exist).
default_path = r'C:\Users\user\Downloads\arxivdata' 
os.chdir(default_path)

### initialize

In [24]:
# base URL of the arXiv OAI-PMH service; handed to the pyoai Client below
url = "http://export.arxiv.org/oai2"

In [25]:
## set up the OAI client used by every request in this notebook
# the registry maps a metadata prefix to the reader that parses it;
# only 'oai_dc' (Dublin Core) is registered here
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(url, registry)
# presumably aligns the client's datestamp granularity with what the server
# reports - TODO confirm against the pyoai documentation
client.updateGranularity()

In [26]:
## query the repository's Identify response and show its key fields
serverinfo = client.identify()
# print name, base URL and protocol version, one per line
for getter in (serverinfo.repositoryName,
               serverinfo.baseURL,
               serverinfo.protocolVersion):
    print(getter())

arXiv
http://export.arxiv.org/oai2
2.0


for info see https://arxiv.org/help/oa/index

In [27]:
# list the metadata formats the server offers; each entry is displayed as a
# (prefix, schema URL, namespace URL) tuple
client.listMetadataFormats()

[('oai_dc',
  'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
  'http://www.openarchives.org/OAI/2.0/oai_dc/'),
 ('arXiv', 'http://arxiv.org/OAI/arXiv.xsd', 'http://arxiv.org/OAI/arXiv/'),
 ('arXivOld',
  'http://arxiv.org/OAI/arXivOld.xsd',
  'http://arxiv.org/OAI/arXivOld/'),
 ('arXivRaw',
  'http://arxiv.org/OAI/arXivRaw.xsd',
  'http://arxiv.org/OAI/arXivRaw/')]

### get single record

In [12]:
# fetch a single record by its full OAI identifier
# NOTE(review): `id` shadows the Python builtin of the same name
id = 'oai:arXiv.org:1610.02097'
# 'oai_dc' is the only metadata prefix a reader was registered for above,
# so it is the format that has to be requested here
metapre = 'oai_dc'
record = client.getRecord(identifier = id, metadataPrefix=metapre)

In [13]:
# element 0 of a record is its header, which carries the record's
# identifier and datestamp
header = record[0]
print(header.identifier(), header.datestamp(), sep="\n")

oai:arXiv.org:1610.02097
2017-05-16 00:00:00


In [14]:
# element 1 of a record is its metadata object; getMap() exposes it as a
# plain Python dictionary mapping each field name to a list of values
metadata = record[1]
metadata.getMap()

{'contributor': [],
 'coverage': [],
 'creator': ['Jaskula, Jean-Christophe',
  'Bauch, Erik',
  'Arroyo-Camejo, Silvia',
  'Lukin, Mikhail D.',
  'Hell, Stefan W.',
  'Trifonov, Alexei S.',
  'Walsworth, Ronald L.'],
 'date': ['2016-10-06'],
 'description': ['  Nitrogen vacancy (NV) color centers in diamond are a leading modality for\nboth superresolution optical imaging and nanoscale magnetic field sensing. In\nthis work, we solve the remaining key challenge of performing optical magnetic\nimaging and spectroscopy selectively on multiple NV centers that are located\nwithin a diffraction-limited field-of-view. We use spin-RESOLFT microscopy to\nenable precision nanoscale mapping of magnetic field patterns with resolution\ndown to ~20 nm, while employing a low power optical depletion beam. Moreover,\nwe use a shallow NV to demonstrate the detection of proton nuclear magnetic\nresonance (NMR) signals exterior to the diamond, with 50 nm lateral imaging\nresolution and without degrading t

### get all open access ids (optional)

In [49]:
# collect the OAI identifiers of all records dated on or after `from_date`

# set to '1901-01-01' instead to list every record in the archive
from_date = '2017-11-22'
from_datetime = datetime.datetime.strptime(from_date, "%Y-%m-%d")

# listIdentifiers only delivers headers, so this is much lighter than
# listRecords. A comprehension keeps the loop variable local, so the `record`
# fetched in the single-record section is not overwritten by this cell.
ids = [
    header.identifier()
    for header in client.listIdentifiers(metadataPrefix='oai_dc', from_=from_datetime)
]

print('total # of ids:', len(ids))

total # of ids: 7845


In [12]:
# total number of OA papers (hard-coded snapshot, not queried from the server)
# NOTE(review): this assignment displays nothing, so the output shown under
# this cell must stem from an earlier version of the cell - TODO confirm
totalpapers = 1678766

672.0

### download all records

In [46]:
# Harvest every record from the server and store the metadata as JSON files
# of `batchsize` records each, named metadata_arxiv_<file number>.txt.

# number of records per file
batchsize = 2500

# number of the first file written by this run; files are numbered
# consecutively from here.
# NOTE: listRecords always starts at the beginning of the archive, so after
# an interrupted run the harvest starts over - `j` only controls file naming.
j = 1

# rough archive size, used for the progress estimate only
approx_total_records = 1.3e6

metadata_all = []   # records collected for the file currently being filled
records_done = 0    # records downloaded so far in this run

for record in client.listRecords(metadataPrefix='oai_dc'):

    # a record is indexed as (header, metadata, ...)
    header = record[0]
    arxiv_id = header.identifier().split("oai:arXiv.org:")[1]

    metadata_raw = record[1]

    # Records without a metadata payload (e.g. deleted ones) still get an
    # entry holding just their id. This must be a dict: the former `[]`
    # raised a TypeError on the 'id' assignment below.
    metadata = metadata_raw.getMap() if metadata_raw else {}
    metadata['id'] = arxiv_id

    metadata_all.append(metadata)
    records_done += 1

    # flush as soon as the batch holds exactly `batchsize` records
    if len(metadata_all) >= batchsize:

        with open(r"metadata_arxiv_%04d.txt" % j, 'w') as outfile:
            json.dump(metadata_all, outfile)

        # start a new batch and move on to the next file number
        metadata_all = []
        j += 1

        print("percent done: ", 100 * records_done / approx_total_records)

# save the remaining partial batch under the next free file number
if metadata_all:
    with open(r"metadata_arxiv_%04d.txt" % j, 'w') as outfile:
        json.dump(metadata_all, outfile)
    j += 1  # `j` always ends up as "last file written + 1"

print("total records:", records_done)

percent done:  0.7692307692307693
percent done:  1.1538461538461537
percent done:  1.5384615384615385
percent done:  1.9230769230769231
percent done:  2.3076923076923075
percent done:  2.6923076923076925
percent done:  3.076923076923077


ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [38]:
# spot check: load one of the saved batch files back into a DataFrame
# (one row per record, one column per metadata field plus 'id')
df_read = pd.read_json(r"metadata_arxiv_0511.txt")
df_read

Unnamed: 0,contributor,coverage,creator,date,description,format,id,identifier,language,publisher,relation,rights,source,subject,title,type
0,[],[],"[Gu, Ying-Qiu]","[2007-08-22, 2017-11-27]","[ In this paper, we provide a procedure to so...",[],0708.2962,"[http://arxiv.org/abs/0708.2962, Quant. Phys. ...",[],[],[],[],[],[Physics - General Physics],[A Procedure to Solve the Eigen Solution to Di...,[text]
1,[],[],"[Achlioptas, Dimitris, Coja-Oghlan, Amin]","[2008-03-14, 2008-05-21]",[ For many random Constraint Satisfaction Pro...,[],0803.2122,"[http://arxiv.org/abs/0803.2122, Proc. 49th FO...",[],[],[],[],[],"[Mathematics - Combinatorics, Mathematics - Pr...",[Algorithmic barriers from phase transitions],[text]
2,[],[],"[Liu, Changli]","[2010-02-03, 2017-11-28]","[ Maxwell's equations seem overdetermined, wh...",[],1002.0892,"[http://arxiv.org/abs/1002.0892, Physics and E...",[],[],[],[],[],"[Physics - General Physics, Physics - Classica...",[Explanation on Overdetermination of Maxwell's...,[text]
3,[],[],"[Coja-Oghlan, Amin]","[2010-07-08, 2014-04-25]",[ Let F be a uniformly distributed random k-S...,[],1007.1328,"[http://arxiv.org/abs/1007.1328, Proc. 22nd SO...",[],[],[],[],[],"[Mathematics - Combinatorics, Computer Science...",[On belief propagation guided decimation for r...,[text]
4,[],[],"[Coja-Oghlan, Amin, Efthymiou, Charilaos]","[2010-07-08, 2013-04-09]",[ The independence number of a sparse random ...,[],1007.1378,"[http://arxiv.org/abs/1007.1378, Random Struct...",[],[],[],[],[],[Computer Science - Discrete Mathematics],[On independent sets in random graphs],[text]
5,[],[],"[Coja-Oghlan, Amin, Pachon-Pinzon, Angelica Y.]",[2011-02-15],[ Let F be a uniformly distributed random k-S...,[],1102.3145,"[http://arxiv.org/abs/1102.3145, SIAM Journal ...",[],[],[],[],[],"[Mathematics - Combinatorics, Computer Science...",[The decimation process in random k-SAT],[text]
6,[],[],"[Visser, Matt]","[2011-03-29, 2011-04-03]",[ Horava gravity is a relatively recent (Jan ...,[],1103.5587,"[http://arxiv.org/abs/1103.5587, J.Phys.Conf.S...",[],[],[],[],[],"[High Energy Physics - Theory, General Relativ...",[Status of Horava gravity: A personal perspect...,[text]
7,[],[],"[Gautier, Eric, Pennec, Erwan Le]","[2011-06-17, 2017-11-28]",[ In the random coefficients binary choice mo...,[],1106.3503,[http://arxiv.org/abs/1106.3503],[],[],[],[],[],[Mathematics - Statistics Theory],[Adaptive estimation in the nonparametric rand...,[text]
8,[],[],"[Griffeth, Stephen]","[2011-06-24, 2017-11-28]",[ We classify the irreducible unitary modules...,[],1106.5094,[http://arxiv.org/abs/1106.5094],[],[],[],[],[],"[Mathematics - Representation Theory, Mathemat...",[Unitary representations of cyclotomic rationa...,[text]
9,[],[],"[Najafizadeh, Mojtaba, Saadat, Mehdi]","[2011-08-22, 2013-04-05]",[ We study the formulation of statistical mec...,[],1108.4273,"[http://arxiv.org/abs/1108.4273, Chin.J.Phys. ...",[],[],[],[],[],"[High Energy Physics - Theory, Condensed Matte...",[Thermodynamics of Classical Systems on Noncom...,[text]


### download updates only, by date
Note that arXiv only accepts a from date, so every paper from that date up to the present is downloaded.

In [75]:
# Harvest only the records dated on or after `from_` and keep them in memory
# (`metadata_all`) for saving in a later cell.

# adjust the start date in case the request fails / we get kicked out
from_ = "2017-09-13"
from_datetime = datetime.datetime.strptime(from_, "%Y-%m-%d")

metadata_all = []

for record in client.listRecords(metadataPrefix='oai_dc', from_=from_datetime):

    # a record is indexed as (header, metadata, ...)
    header = record[0]
    arxiv_id = header.identifier().split("oai:arXiv.org:")[1]

    metadata_raw = record[1]

    # Records without a metadata payload (e.g. deleted ones) still get an
    # entry holding just their id. This must be a dict: the former `[]`
    # raised a TypeError on the 'id' assignment below.
    metadata = metadata_raw.getMap() if metadata_raw else {}
    metadata['id'] = arxiv_id

    metadata_all.append(metadata)

print("total records:", len(metadata_all))

total records: 52261


In [77]:
# save the update harvested above (`metadata_all`) as one additional JSON file
# NOTE(review): the file number is hard-coded; an existing file with the same
# number is overwritten because the file is opened in 'w' mode
j = 522 # set to last file + 1
with open(r"metadata_arxiv_%04d.txt" % j, 'w') as outfile:     
            json.dump(metadata_all, outfile)


In [79]:
# consistency check: look up the position of a known paper id in the
# harvested update (list.index raises ValueError if the id is missing)
ids = [metadata['id'] for metadata in metadata_all]
ids.index('1711.02023')

43005