In [29]:
import json
from xml.etree import ElementTree

In [2]:
# Path to the sample patent grant XML parsed below (a relative path).
FNAME = '../data/samples/US09788470B2.xml'

In [3]:
# Parse the whole XML file and keep only its root element.
xml_root = ElementTree.parse(FNAME).getroot()

In [4]:
# Display the root element to check which document type was loaded.
xml_root

<Element 'us-patent-grant' at 0x7fdfc07c67c8>

In [5]:
# Look up the three direct children of the root that the rest of the notebook
# works with. Element.find() returns None when a section is absent.
xml_biblio = xml_root.find('us-bibliographic-data-grant')
xml_abst = xml_root.find('abstract')
xml_clms = xml_root.find('claims')

In [6]:
# Print the tag and attribute dict of every direct child of the
# bibliographic section to see which fields are available.
for child in xml_biblio:
    print(child.tag, child.attrib)

publication-reference {}
application-reference {'appl-type': 'utility'}
us-application-series-code {}
us-term-of-grant {}
classifications-ipcr {}
classifications-cpc {}
invention-title {'id': 'd2e53'}
us-references-cited {}
number-of-claims {}
us-exemplary-claim {}
us-field-of-classification-search {}
figures {}
us-related-documents {}
us-parties {}
assignees {}
examiners {}
pct-or-regional-filing-data {}
pct-or-regional-publishing-data {}


In [36]:
def _child_text(parent, tag):
    """Return the text of the first direct child ``tag`` of ``parent``, or None if absent."""
    node = parent.find(tag)
    return node.text if node is not None else None


def parse_biblio(xml_biblio):
    """Extract application/publication identifiers and the title.

    Args:
        xml_biblio: the bibliographic-data ``Element`` whose direct children
            include ``application-reference``, ``publication-reference`` and
            ``invention-title``.

    Returns:
        dict with:
          * ``type``, ``ap_iso``, ``apno``, ``apdt`` -- only when the
            application reference has a populated ``document-id``;
          * ``pb_iso``, ``pbno``, ``kind``, ``pbdt`` -- only when the
            publication reference has a populated ``document-id``;
          * ``title`` -- always present.
        Any individual field whose element is missing is ``None`` instead of
        raising ``AttributeError``.
    """
    biblio = {}

    app_ref = xml_biblio.find('application-reference')
    app_doc = app_ref.find('document-id') if app_ref is not None else None
    pub_ref = xml_biblio.find('publication-reference')
    pub_doc = pub_ref.find('document-id') if pub_ref is not None else None

    # Element truth testing (`if app_doc:`) is deprecated and only reflects the
    # number of children, so spell out both conditions: the element exists and
    # it actually carries child fields.
    if app_doc is not None and len(app_doc):
        biblio['type'] = app_ref.attrib.get('appl-type')
        biblio['ap_iso'] = _child_text(app_doc, 'country')
        biblio['apno'] = _child_text(app_doc, 'doc-number')
        biblio['apdt'] = _child_text(app_doc, 'date')

    if pub_doc is not None and len(pub_doc):
        biblio['pb_iso'] = _child_text(pub_doc, 'country')
        biblio['pbno'] = _child_text(pub_doc, 'doc-number')
        biblio['kind'] = _child_text(pub_doc, 'kind')
        biblio['pbdt'] = _child_text(pub_doc, 'date')

    biblio['title'] = _child_text(xml_biblio, 'invention-title')

    return biblio

In [37]:
# Run the bibliographic parser on the sample document and display the result.
parse_biblio(xml_biblio)

{'ap_iso': 'US',
 'apdt': '20121029',
 'apno': '14438452',
 'kind': 'B2',
 'pb_iso': 'US',
 'pbdt': '20171010',
 'pbno': '09788470',
 'title': 'Component supply device',
 'type': 'utility'}

In [46]:
def parse_clms(xml_clms):
    """Convert a ``claims`` element into a dict of per-claim records.

    Args:
        xml_clms: ``Element`` whose direct ``claim`` children each hold one
            top-level ``claim-text`` element.

    Returns:
        dict keyed by the claim's ``id`` attribute. Each value is a dict with:
          * ``id``       -- the claim id (``None`` if the attribute is missing);
          * ``text``     -- stripped leading text of the top-level claim-text;
          * ``elements`` -- list of further text pieces, in this order: text
            trailing the top-level claim-text (kept verbatim), text following
            a direct ``claim-ref`` (commas/whitespace trimmed), then the
            stripped leading text of each nested ``claim-text``;
          * ``ref``      -- ``idref`` of the first direct ``claim-ref`` or None.
        Missing elements, missing ``idref`` and absent text/tail are treated as
        empty instead of raising.
    """
    claims = {}
    for claim in xml_clms.findall('claim'):
        claim_id = claim.attrib.get('id')
        record = {'id': claim_id, 'text': '', 'elements': [], 'ref': None}
        claims[claim_id] = record

        body = claim.find('claim-text')
        if body is None:
            # A claim without any claim-text has nothing more to extract.
            continue

        # Element.text / Element.tail are None when there is no text at that
        # position (e.g. XML written without whitespace between tags).
        record['text'] = (body.text or '').strip()

        trailing = body.tail or ''
        if trailing.strip():
            record['elements'].append(trailing)

        ref = body.find('claim-ref')
        if ref is not None:
            record['ref'] = ref.attrib.get('idref')
            # Text right after the reference usually starts with ", ".
            after_ref = (ref.tail or '').strip(', \n')
            if after_ref:
                record['elements'].append(after_ref)

        for nested in body.findall('claim-text'):
            record['elements'].append((nested.text or '').strip())

    return claims

In [47]:
# Run the claims parser on the sample document and display the result.
parse_clms(xml_clms)

{'CLM-00001': {'elements': ['a wafer sheet replenishment section which replenishes a wafer sheet on which multiple components are arranged; and',
   'a wafer sheet conveyance section which conveys a wafer sheet supplied from the wafer sheet replenishment section, the wafer sheet conveyance section including a table on which the wafer sheet is carried and a pickup head which picks up a component on the wafer sheet carried on the table;',
   'wherein the pickup head includes multiple component supply side suction nozzles, and a switching mechanism which switches the multiple component supply side suction nozzles between a pickup position at which a respective tip of the multiple component supply side suction nozzles faces down and between a transfer position at which the respective tip of the multiple component supply side suction nozzles faces up;',
   'wherein the switching mechanism includes a pinion gear fixed to a rotation axis around which the multiple component supply side suction