In [None]:
import xml.etree.ElementTree as ET
import json
import os
import pandas as pd
import re
import numpy as np

This code reads the HEP and HepNames XML files, which can be found here: http://inspirehep.net/dumps/inspire-dump.html .
Various tags are read and saved into two JSON files, 'INSPIREjson' (from HEP) and 'HEPNAMESjson' (from HepNames). This code saves more tags than are actually needed; a reduced version of HEPNAMESjson and INSPIREjson can be found in this same repository. The record markup for the XML is documented here: https://twiki.cern.ch/twiki/bin/view/Inspire/DevelopmentRecordMarkup .

<h1> Read HEP </h1>

The file to read is called 'HEP-records.xml'

In [None]:
def append_record(record):
    """Append one record to the file 'inspire_0' as a single line of JSON.

    record: any JSON-serialisable object (here, a dict of lists).

    The file is opened in append mode, so repeated calls accumulate one
    JSON document per line.
    """
    with open('inspire_0', 'a') as f:
        json.dump(record, f)
        # The file is in text mode, which already translates '\n' to the
        # platform line ending; writing os.linesep here would yield '\r\r\n'
        # on Windows and leave blank lines when the file is read back.
        f.write('\n')

    
def process_buffer_big(buf):
    """Parse one HEP <record> element and append its selected fields.

    buf: XML text of a single, complete <record>...</record> element.

    For every (key, XPath) pair below, the text of each matching element is
    collected into a list (empty when nothing matches). The resulting dict
    is passed to append_record; nothing is returned.

    Raises whatever ET.fromstring raises when buf is not well-formed XML.
    """
    tnode = ET.fromstring(buf)

    # (output key, XPath relative to the record), in the order they are stored.
    fields = (
        ('item', "./controlfield[@tag='001']"),
        ('key1', "./datafield[@tag='035']/subfield[@code='a']"),
        ('key2', "./datafield[@tag='035']/subfield[@code='9']"),
        ('cat0', "./datafield[@tag='037']/subfield[@code='9']"),
        ('cat1', "./datafield[@tag='037']/subfield[@code='a']"),
        ('cat2', "./datafield[@tag='037']/subfield[@code='c']"),
        ('aut1N', "./datafield[@tag='100']/subfield[@code='a']"),
        ('aut1ID', "./datafield[@tag='100']/subfield[@code='x']"),
        ('autN', "./datafield[@tag='700']/subfield[@code='a']"),
        ('autID', "./datafield[@tag='700']/subfield[@code='x']"),
        ('title', "./datafield[@tag='245']/subfield[@code='a']"),
        ('date0', "./datafield[@tag='269']/subfield[@code='c']"),
        ('date1', "./datafield[@tag='961']/subfield[@code='x']"),
        ('date2', "./datafield[@tag='961']/subfield[@code='c']"),
        ('content', "./datafield[@tag='650'][@ind1='1'][@ind2='7']/subfield[@code='a']"),
        ('exp0', "./datafield[@tag='693']/subfield[@code='0']"),
        ('exp1', "./datafield[@tag='693']/subfield[@code='a']"),
        ('exp2', "./datafield[@tag='693']/subfield[@code='e']"),
        ('coll0', "./datafield[@tag='710']/subfield[@code='0']"),
        ('coll1', "./datafield[@tag='710']/subfield[@code='g']"),
        ('cit0', "./datafield[@tag='999'][@ind1='C'][@ind2='5']/subfield[@code='0']"),
        ('cit1', "./datafield[@tag='999'][@ind1='C'][@ind2='5']/subfield[@code='1']"),
        ('cit2', "./datafield[@tag='999'][@ind1='C'][@ind2='5']/subfield[@code='2']"),
        ('cit3', "./datafield[@tag='999'][@ind1='C'][@ind2='5']/subfield[@code='r']"),
        # Previously 'aut1BAI' duplicated the 'cit3' values, while the
        # 100 $w subfields were extracted and then discarded.
        ('aut1BAI', "./datafield[@tag='100']/subfield[@code='w']"),
    )

    my_dict = {}
    for key, xpath in fields:
        # .text is None for an empty element; it is kept as-is (JSON null).
        my_dict[key] = [node.text for node in tnode.findall(xpath)]
    append_record(my_dict)

In [None]:
# Stream 'HEP-records.xml' line by line, hand each complete <record> element to
# process_buffer_big, and print the running total every 10000 records.

# append_record opens 'inspire_0' in append mode, so start from an empty file;
# otherwise re-running this cell would store every record twice.
with open('inspire_0', 'w'):
    pass

count = 0
inputbuffer = b''
# The dump is read in binary mode, so the markers are bytes literals. On
# Python 2 bytes is str (same behaviour as before); on Python 3 this avoids a
# TypeError from testing a str against bytes lines.
with open('HEP-records.xml', 'rb') as inputfile:
    append = False
    for line in inputfile:
        if b'<record>' in line:
            # A new record starts here; any unterminated buffer is discarded.
            inputbuffer = line
            append = True
        elif append:
            inputbuffer += line
        # Checked separately from the opening tag so a record written on a
        # single line is processed, and guarded by `append` so a stray closing
        # tag outside a record is ignored instead of raising.
        if append and b'</record>' in line:
            append = False
            process_buffer_big(inputbuffer)
            count += 1
            if count % 10000 == 0:
                print(count)
            inputbuffer = None

Collect the line-delimited records into a single JSON array and save it as one JSON file.

In [None]:
# Read back the one-record-per-line file and collect the records in a list.
# Blank lines are skipped because json.loads raises on an empty string.
with open('inspire_0') as f:
    my_list = [json.loads(line) for line in f if line.strip()]

# Store all records as a single JSON array.
with open('inspire_big.json', 'w') as f:
    json.dump(my_list, f)

<h1> Read HepNames </h1>

The file to read is called 'HepNames-records.xml'

In [None]:
def append_record(record):
    """Append one record to the file 'hepnames_0' as a single line of JSON.

    record: any JSON-serialisable object (here, a dict of lists).

    The file is opened in append mode, so repeated calls accumulate one
    JSON document per line.
    """
    with open('hepnames_0', 'a') as f:
        json.dump(record, f)
        # The file is in text mode, which already translates '\n' to the
        # platform line ending; writing os.linesep here would yield '\r\r\n'
        # on Windows and leave blank lines when the file is read back.
        f.write('\n')


def process_buffer_big(buf):
    """Parse one HepNames <record> element and append its selected fields.

    buf: XML text of a single, complete <record>...</record> element.

    Each stored value is the list of texts of the elements matching one
    XPath (empty when nothing matches). The dict is passed to
    append_record; nothing is returned.
    """
    record = ET.fromstring(buf)

    def texts(xpath):
        # Text of every element matching xpath, in document order.
        return [element.text for element in record.findall(xpath)]

    extracted = {
        'item': texts("./controlfield[@tag='001']"),
        'key1': texts("./datafield[@tag='035']/subfield[@code='a']"),
        'key2': texts("./datafield[@tag='035']/subfield[@code='9']"),
        'autN': texts("./datafield[@tag='100']/subfield[@code='a']"),
        'autID': texts("./datafield[@tag='100']/subfield[@code='q']"),
        'date1': texts("./datafield[@tag='961']/subfield[@code='x']"),
        'date2': texts("./datafield[@tag='961']/subfield[@code='c']"),
    }
    append_record(extracted)

In [None]:
# Stream 'HepNames-records.xml' line by line, hand each complete <record>
# element to process_buffer_big, and print the running total every 10000 records.

# append_record opens 'hepnames_0' in append mode, so start from an empty file;
# otherwise re-running this cell would store every record twice.
with open('hepnames_0', 'w'):
    pass

count = 0
inputbuffer = b''
# The dump is read in binary mode, so the markers are bytes literals. On
# Python 2 bytes is str (same behaviour as before); on Python 3 this avoids a
# TypeError from testing a str against bytes lines.
with open('HepNames-records.xml', 'rb') as inputfile:
    append = False
    for line in inputfile:
        if b'<record>' in line:
            # A new record starts here; any unterminated buffer is discarded.
            inputbuffer = line
            append = True
        elif append:
            inputbuffer += line
        # Checked separately from the opening tag so a record written on a
        # single line is processed, and guarded by `append` so a stray closing
        # tag outside a record is ignored instead of raising.
        if append and b'</record>' in line:
            append = False
            process_buffer_big(inputbuffer)
            count += 1
            if count % 10000 == 0:
                print(count)
            inputbuffer = None

In [None]:
# Read back the one-record-per-line file and collect the records in a list.
# Blank lines are skipped because json.loads raises on an empty string.
with open('hepnames_0') as f:
    my_list = [json.loads(line) for line in f if line.strip()]

# Store all records as a single JSON array.
with open('hepnames.json', 'w') as f:
    json.dump(my_list, f)