# ArXiv metadata harvester

## 1. Imports

In [1]:
import xmltodict          # to convert the raw metadata from xml format to dict
import pandas as pd       # final format of stored data
from sickle import Sickle # to retrieve data from the OAI arxiv interface

## 2. Connection

In [2]:
#initialise an OAI interface
connection = Sickle('http://export.arxiv.org/oai2')

In [3]:
print('Getting papers...')
data = connection.ListRecords(**{'metadataPrefix': 'arXiv', 'from': '2007-01-01', 'until': '2019-06-01', 'ignore_deleted': True})
print('Papers retrieved.')

Getting papers...
Papers retrieved.


## 3. Export raw metadata in txt file

In [4]:
iters = 0
with open('arXiv_metadata_raw.txt','a+') as f:
    while True:
        try:
            f.write(data.next().raw)
            errors = 0
            iters +=1
            
            if iters % 10000 == 0:
                print('On iter', iters)
        
        except AttributeError:
            if errors >5:
                raise AttributeError('\nQUITTING: Too many sequential errors\n')
            else:
                print('\nERROR!\n')
                errors +=1
                
        except StopIteration:
            print('On iter', iters)
            print('\nDONE!')
            break

On iter 10000
On iter 20000
On iter 30000
On iter 40000
On iter 50000
On iter 60000
On iter 70000
On iter 80000
On iter 90000
On iter 100000
On iter 110000
On iter 120000
On iter 130000
On iter 140000

ERROR!

On iter 150000
On iter 160000
On iter 170000
On iter 180000
On iter 190000
On iter 200000
On iter 210000
On iter 220000
On iter 230000
On iter 240000
On iter 250000
On iter 260000
On iter 270000
On iter 280000
On iter 290000
On iter 300000
On iter 310000
On iter 320000
On iter 330000
On iter 340000
On iter 350000
On iter 360000
On iter 370000
On iter 380000
On iter 390000
On iter 400000
On iter 410000
On iter 420000
On iter 430000
On iter 440000
On iter 450000
On iter 460000
On iter 470000
On iter 480000
On iter 490000
On iter 500000
On iter 510000
On iter 520000
On iter 530000
On iter 540000
On iter 550000
On iter 560000
On iter 570000
On iter 580000
On iter 590000
On iter 600000
On iter 610000
On iter 620000
On iter 630000
On iter 640000
On iter 650000
On iter 660000
On iter 67

## 4. Format raw data 

In [5]:
raw_data = ''

with open('arXiv_metadata_raw.txt','r') as f:
    while True:
        data = f.read(100_000_000)
        if not data:
            break
        else:
            raw_data += data

In [6]:
def convert_dict(record_xml):
    record_dict = xmltodict.parse(record_xml, process_namespaces=False)['record']['metadata']['arXiv']
    
    record_dict['id'] = str(record_dict['id'])
    
    if not isinstance(record_dict['authors']['author'], list):
        authors = [record_dict['authors']['author']]
    else:
        authors = record_dict['authors']['author']
    
    authors = [(author['forenames'] + ' ' if 'forenames' in author.keys() else '') + author['keyname'] for author in authors]
        
    record_dict['authors'] = authors
    return record_dict

In [7]:
list_of_xml = raw_data.split('</record>')
list_of_xml = [_ + '</record>' for _ in list_of_xml]
list_of_dicts = [convert_dict(list_of_xml[i]) for i in range(1,len(list_of_xml)-1)]

In [8]:
df = pd.DataFrame(list_of_dicts)

## 5. Export formatted data

In [10]:
df.head()

Unnamed: 0,@xmlns,@xmlns:xsi,@xsi:schemaLocation,id,created,updated,authors,title,categories,comments,msc-class,license,abstract,journal-ref,doi,report-no,acm-class,proxy
0,http://arxiv.org/OAI/arXiv/,http://www.w3.org/2001/XMLSchema-instance,http://arxiv.org/OAI/arXiv/ http://arxiv.org/O...,704.0002,2007-03-30,2008-12-13,"[Ileana Streinu, Louis Theran]",Sparsity-certifying Graph Decompositions,math.CO cs.CG,To appear in Graphs and Combinatorics,05C85; 05C70; 68R10; 05B35,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-pe...",,,,,
1,http://arxiv.org/OAI/arXiv/,http://www.w3.org/2001/XMLSchema-instance,http://arxiv.org/OAI/arXiv/ http://arxiv.org/O...,704.0003,2007-04-01,2008-01-12,[Hongjun Pan],The evolution of the Earth-Moon system based o...,physics.gen-ph,"23 pages, 3 figures",,,The evolution of Earth-Moon system is describe...,,,,,
2,http://arxiv.org/OAI/arXiv/,http://www.w3.org/2001/XMLSchema-instance,http://arxiv.org/OAI/arXiv/ http://arxiv.org/O...,704.0004,2007-03-30,,[David Callan],A determinant of Stirling cycle numbers counts...,math.CO,11 pages,05A15,,We show that a determinant of Stirling cycle n...,,,,,
3,http://arxiv.org/OAI/arXiv/,http://www.w3.org/2001/XMLSchema-instance,http://arxiv.org/OAI/arXiv/ http://arxiv.org/O...,704.0005,2007-04-02,,"[Wael Abu-Shammala, Alberto Torchinsky]",From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,,"42B30, 42B35",,In this paper we show how to compute the $\Lam...,"Illinois J. Math. 52 (2008) no.2, 681-689",,,,
4,http://arxiv.org/OAI/arXiv/,http://www.w3.org/2001/XMLSchema-instance,http://arxiv.org/OAI/arXiv/ http://arxiv.org/O...,704.0006,2007-03-31,,"[Y. H. Pong, C. K. Law]",Bosonic characters of atomic Cooper pairs acro...,cond-mat.mes-hall,"6 pages, 4 figures, accepted by PRA",,,We study the two-particle wave function of pai...,,10.1103/PhysRevA.75.043613,,,


In [16]:
df = df.drop(columns = ['@xmlns', '@xmlns:xsi', '@xsi:schemaLocation'])

In [19]:
df.to_csv('arXiv_metadata_formatted.csv')