In [1]:
import pandas as pd
import io
import xml.etree.ElementTree as ElementTree
import os

In [2]:
class XmlListConfig(list):
    '''
    List built from the child elements of an XML element.

    Each child of ``aList`` contributes at most one item:

    * a child that itself has children becomes an ``XmlDictConfig`` when it
      has a single child or its first two children have different tags,
      and a nested ``XmlListConfig`` when its first two children share a tag;
    * a child without children contributes its stripped text, if non-empty;
    * a child with neither children nor non-blank text is skipped.

    :param aList: an iterable of ElementTree elements (typically an Element,
        which iterates over its children)
    '''
    def __init__(self, aList):
        for element in aList:
            # len() counts child elements. Truth-testing an Element
            # ("if element:") is deprecated and emits a warning, so the
            # child count is checked explicitly instead.
            if len(element) > 0:
                # treat like dict - a single child, or the first two
                # children carry different tags
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                # treat like list - the first two children share a tag
                else:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    '''
    Dictionary built from an XML element, its attributes and its children.

    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is... a dict.

    Known limitations (unchanged behaviour):

    * children are stored under their tag with ``update``, so when several
      siblings share a tag inside a dict-like parent only the last one is
      kept;
    * a childless element that has attributes is stored as its attribute
      dict and its text is dropped.
    '''
    def __init__(self, parent_element):
        # attributes of the parent become plain keys
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            # len() counts child elements. Truth-testing an Element
            # ("if element:") is deprecated and emits a warning, so the
            # child count is checked explicitly instead.
            if len(element) > 0:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text.
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})

# Method 1 - Test with the First Regulation ('SOR-49-42')

In [97]:
#Silence every FutureWarning for the rest of the session.
#NOTE(review): presumably this hides the warning raised when XmlDictConfig /
#XmlListConfig truth-test an Element ("if element:") -- confirm, and prefer
#fixing the source of the warning over a session-wide filter.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Parse the first regulation and convert its root element into a nested
#dictionary (xmldict) keyed by child tag.
tree = ElementTree.parse('SOR-49-42.xml')
root = tree.getroot()
xmldict = XmlDictConfig(root)

##### Identification

In [99]:
#The Entire Identification Block
xmldict['Identification']

{'ConsolidationDate': {'Date': {'DD': '21', 'MM': '06', 'YYYY': '2018'}},
 'EnablingAuthority': {'XRefExternal': ['APPROPRIATION ACT, NO. 7, 1949',
   'APPROPRIATION ACTS']},
 'InstrumentNumber': 'SOR/49-42',
 'LastModifiedDate': {'Date': {'DD': '26', 'MM': '10', 'YYYY': '2006'}},
 'LongTitle': 'Regulations Governing Payment of School Fees and Transportation Costs Re Children of Certain Employees of the Government of Canada',
 'RegulationMakerOrder': {'Date': {'DD': '09', 'MM': '11', 'YYYY': '1954'},
  'OrderNumber': '1954-1694',
  'RegulationMaker': 'P.C.'},
 'ShortTitle': 'Regulations re school fees and transportation costs for children of certain Government employees'}

In [100]:
#Obtaining each element of the Identification block.
#The two sub-dictionaries are looked up once and reused below.
identification = xmldict['Identification']
regulation_maker_order = identification['RegulationMakerOrder']

InstrumentNumber = identification['InstrumentNumber']
ConsolidationDate = identification['ConsolidationDate']['Date']
LastModificationDate = identification['LastModifiedDate']['Date']
#Only the first enabling authority is kept.
#NOTE(review): assumes XRefExternal is a list (several authorities); with a
#single authority it may be a plain string and [0] would return one character.
EnablingAuthority = identification['EnablingAuthority']['XRefExternal'][0]
ShortTitle = identification['ShortTitle']
LongTitle = identification['LongTitle']
RegulationMaker = regulation_maker_order['RegulationMaker']
RegulationOrderNumber = regulation_maker_order['OrderNumber']
#Misspelled name kept as an alias so cells that already use it keep working.
RetulationOrderNumber = RegulationOrderNumber
RegulationMakerDate = regulation_maker_order['Date']

In [101]:
#Checking all the elements: print each extracted value on its own line,
#in the same order in which they were extracted.
extracted_values = (
    InstrumentNumber,
    ConsolidationDate,
    LastModificationDate,
    EnablingAuthority,
    ShortTitle,
    LongTitle,
    RegulationMaker,
    RetulationOrderNumber,
    RegulationMakerDate,
)
for extracted in extracted_values:
    print (extracted)

SOR/49-42
{'YYYY': '2018', 'MM': '06', 'DD': '21'}
{'YYYY': '2006', 'MM': '10', 'DD': '26'}
APPROPRIATION ACT, NO. 7, 1949
Regulations re school fees and transportation costs for children of certain Government employees
Regulations Governing Payment of School Fees and Transportation Costs Re Children of Certain Employees of the Government of Canada
P.C.
1954-1694
{'YYYY': '1954', 'MM': '11', 'DD': '09'}


##### Order

In [111]:
#The Entire Order Block
xmldict['Order']

{'Provision': [{'Text': 'His Excellency the Governor General in Council, on the recommendation of the Minister of Finance and pursuant to The Appropriation Act, No. 7, 1949, Vote No 938, is pleased to order as follows:',
   'format-ref': 'indent-0-0',
   'language-align': 'yes',
   'list-item': 'no'},
  {'Label': '1.',
   'Text': 'The Regulations governing the payment of school fees and transportation costs of children of certain employees of the Government of Canada, established by Order in Council PC 3455 of 19th July, 1950, are hereby revoked; and',
   'format-ref': 'indent-0-0',
   'language-align': 'no',
   'list-item': 'no'},
  {'Label': '2.',
   'Text': u'The annexed \u201cRegulations governing payment of school fees and transportation costs re children of certain employees of the Government of Canada\u201d are hereby made and established in substitution for the regulations hereby revoked.',
   'format-ref': 'indent-0-0',
   'language-align': 'no',
   'list-item': 'no'}]}

In [126]:
#Concatenate the text of every provision of the Order, each followed by a
#single space (so the result ends with a trailing space).
provisions = "".join(
    provision["Text"] + " " for provision in xmldict['Order']['Provision']
)

In [127]:
print (provisions)

His Excellency the Governor General in Council, on the recommendation of the Minister of Finance and pursuant to The Appropriation Act, No. 7, 1949, Vote No 938, is pleased to order as follows: The Regulations governing the payment of school fees and transportation costs of children of certain employees of the Government of Canada, established by Order in Council PC 3455 of 19th July, 1950, are hereby revoked; and The annexed “Regulations governing payment of school fees and transportation costs re children of certain employees of the Government of Canada” are hereby made and established in substitution for the regulations hereby revoked. 


In [116]:
#Count the provisions of the Order, minus one.
#NOTE(review): the "- 1" presumably excludes the unlabelled introductory
#provision shown first in the Order block -- confirm.
order_provisions = xmldict['Order']['Provision']
number_provisions = len(order_provisions) - 1
number_provisions

2

##### Body

In [132]:
#The Entire Body Block
xmldict['Body']

{'Section': [{'Definition': {'Text': {'DefinedTermEn': 'school authority',
     'DefinedTermFr': u'autorit\xe9 scolaire'},
    'generate-in-text': 'no'},
   'Label': '1',
   'Text': 'In these regulations,'},
  {'Label': '2',
   'Text': 'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordance with the provisions of Order in Council P.C. 44/2300 of 6th May, 1950.'},
  {'Label': '3',
   'Subsection': {'Label': '(3)',
    'Text': 'The amount payable under subsection (2) may be paid to a non-taxable employee to the extent that he has paid school fees to the school authority in respect of the child, and the amount payable to the school authority shall be reduced by

In [156]:
len(xmldict['Body']['Section'])

7

In [169]:
xmldict['Body']['Section'][0]

{'Definition': {'Text': {'DefinedTermEn': 'school authority',
   'DefinedTermFr': u'autorit\xe9 scolaire'},
  'generate-in-text': 'no'},
 'Label': '1',
 'Text': 'In these regulations,'}

In [170]:
xmldict['Body']['Section'][0]["Text"]

'In these regulations,'

In [171]:
xmldict['Body']['Section'][0]['Definition']["Text"]

{'DefinedTermEn': 'school authority', 'DefinedTermFr': u'autorit\xe9 scolaire'}

In [172]:
xmldict['Body']['Section'][1]["Text"]

'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordance with the provisions of Order in Council P.C. 44/2300 of 6th May, 1950.'

In [398]:
xmldict['Body']['Section'][2]

KeyError: 2

# Method 1 - Test with the Second Regulation ('SOR-80-394')

In [396]:
#Create the parent-child dictionary for the second regulation.
#This rebinds tree, root and xmldict: any earlier cell that reads xmldict
#will see this regulation if it is (re)run after this cell.
tree = ElementTree.parse('SOR-80-394.xml')
root = tree.getroot()
xmldict = XmlDictConfig(root)

In [400]:
xmldict['Body']

{'Heading': {'TitleText': 'Notice of Appeal', 'level': '1'},
 'Section': {'Label': '6',
  'Text': 'A notice of appeal shall be in the form set out in the schedule.'}}

# Alternate Method

In [401]:
import untangle
import xml.etree.ElementTree
from lxml import etree

In [402]:
def make_dict_from_tree(element_tree):
    """Traverse the given XML element to convert it into a dictionary.

    The result maps the element's tag to either its text (when it has no
    children) or a dictionary of its children converted the same way.
    When several children share a tag, their values are collected in a
    list in document order. Attributes are not included, and the text of an
    element that has children is not included either.

    :param element_tree: The root XML element to convert (or None)
    :type element_tree: xml.etree.ElementTree.Element
    :rtype: dict
    """
    def internal_iter(tree, accum):
        """Recursively iterate through the elements of the tree accumulating
        a dictionary result.

        :param tree: The XML element to convert
        :type tree: xml.etree.ElementTree.Element
        :param accum: Dictionary into which data is accumulated
        :type accum: dict
        :rtype: dict
        """
        if tree is None:
            return accum

        # Element.getchildren() was removed from the standard library in
        # Python 3.9 (and is deprecated in lxml); len() and direct iteration
        # are the supported equivalents in both libraries.
        if len(tree) > 0:
            children = {}
            accum[tree.tag] = children
            for each in tree:
                result = internal_iter(each, {})
                if each.tag in children:
                    # second occurrence of a tag: promote the stored value
                    # to a list before appending the new one
                    if not isinstance(children[each.tag], list):
                        children[each.tag] = [children[each.tag]]
                    children[each.tag].append(result[each.tag])
                else:
                    children.update(result)
        else:
            accum[tree.tag] = tree.text

        return accum

    return internal_iter(element_tree, {})
 


# Alt Method Test 1 - SOR-49-42

In [403]:
#Parse the first regulation with lxml and convert the whole tree, starting
#at the root element, into a nested dictionary keyed by tag.
doc = etree.parse('SOR-49-42.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [404]:
doc_dict

{'Regulation': {'Body': {'Section': [{'Definition': [{'Text': {'DefinedTermEn': 'child',
        'DefinedTermFr': 'enfant'}},
      {'Text': {'DefinedTermEn': 'federal land',
        'DefinedTermFr': u'terrain f\xe9d\xe9ral'}},
      {'Text': {'DefinedTermEn': 'Minister', 'DefinedTermFr': 'Ministre'}},
      {'Text': {'DefinedTermEn': 'non-taxable employee',
        'DefinedTermFr': u'employ\xe9 non imposable'}},
      {'Text': {'DefinedTermEn': 'school authority',
        'DefinedTermFr': u'autorit\xe9 scolaire'}}],
     'Label': '1',
     'Text': 'In these regulations,'},
    {'Label': '2',
     'Text': 'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordan

### Body

In [412]:
#How many parts (labels) does this Section have?
len_body = len(doc_dict['Regulation']['Body']['Section'])
len_body 

7

In [413]:
#Raw extraction of all text in the body...
str(doc_dict['Regulation']['Body'].values())

"[[{'Text': 'In these regulations,', 'Definition': [{'Text': {'DefinedTermFr': 'enfant', 'DefinedTermEn': 'child'}}, {'Text': {'DefinedTermFr': u'terrain f\\xe9d\\xe9ral', 'DefinedTermEn': 'federal land'}}, {'Text': {'DefinedTermFr': 'Ministre', 'DefinedTermEn': 'Minister'}}, {'Text': {'DefinedTermFr': u'employ\\xe9 non imposable', 'DefinedTermEn': 'non-taxable employee'}}, {'Text': {'DefinedTermFr': u'autorit\\xe9 scolaire', 'DefinedTermEn': 'school authority'}}], 'Label': '1'}, {'Text': 'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordance with the provisions of Order in Council P.C. 44/2300 of 6th May, 1950.', 'Label': '2'}, {'Subsection': [{'Text': 'Wh

### Extraction of each Part (Label) in the Body

##### Part 1 of 7

In [207]:
doc_dict['Regulation']['Body']['Section'][0]

{'Definition': [{'Text': {'DefinedTermEn': 'child',
    'DefinedTermFr': 'enfant'}},
  {'Text': {'DefinedTermEn': 'federal land',
    'DefinedTermFr': u'terrain f\xe9d\xe9ral'}},
  {'Text': {'DefinedTermEn': 'Minister', 'DefinedTermFr': 'Ministre'}},
  {'Text': {'DefinedTermEn': 'non-taxable employee',
    'DefinedTermFr': u'employ\xe9 non imposable'}},
  {'Text': {'DefinedTermEn': 'school authority',
    'DefinedTermFr': u'autorit\xe9 scolaire'}}],
 'Label': '1',
 'Text': 'In these regulations,'}

In [209]:
#How many parts in this 1st section
len(doc_dict['Regulation']['Body']['Section'][0])

3

In [215]:
#How many definitions in part
len(doc_dict['Regulation']['Body']['Section'][0]['Definition'])

5

In [225]:
#Collect the English defined term of every definition in the first section.
definitions = doc_dict['Regulation']['Body']['Section'][0]['Definition']
list_terms = [definition['Text']['DefinedTermEn'] for definition in definitions]

In [227]:
list_terms

['child',
 'federal land',
 'Minister',
 'non-taxable employee',
 'school authority']

##### Part 2 of 7

In [310]:
num = 1

In [312]:
#Build the text of body section number `num`.
#If the section has a list of subsections, concatenate their texts (each
#followed by a space); otherwise fall back to the section's own text.
body_section = doc_dict['Regulation']['Body']['Section'][num]
try:
    subsections = body_section['Subsection']
    len_sub = len(subsections)
    section = "".join(subsection['Text'] + " " for subsection in subsections)
#Only the expected failures are handled: no 'Subsection'/'Text' key
#(KeyError) or a value that is not a list of dicts with string text
#(TypeError). Anything else is a real error and should surface.
except (KeyError, TypeError):
    section = body_section["Text"]

In [313]:
section

'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordance with the provisions of Order in Council P.C. 44/2300 of 6th May, 1950.'

##### Part 3 of 7

In [322]:
num = 2

In [323]:
#Build the text of body section number `num`.
#If the section has a list of subsections, concatenate their texts (each
#followed by a space); otherwise fall back to the section's own text.
body_section = doc_dict['Regulation']['Body']['Section'][num]
try:
    subsections = body_section['Subsection']
    len_sub = len(subsections)
    section = "".join(subsection['Text'] + " " for subsection in subsections)
#Only the expected failures are handled: no 'Subsection'/'Text' key
#(KeyError) or a value that is not a list of dicts with string text
#(TypeError). Anything else is a real error and should surface.
except (KeyError, TypeError):
    section = body_section["Text"]

In [324]:
section

'Where a child of a non-taxable employee is not accommodated at a school mentioned in section 2, the Minister may arrange for the child to be accommodated at a school in the municipality in which the employee resides, or if, in the opinion of the Minister, the school facilities in that municipality are inadequate, at a school in a municipality having adequate facilities for the purpose, but a report shall be submitted to the Treasury Board whenever attendance at a school outside the municipality in which the employee resides is authorized stating the reasons for the arrangement and the additional costs resulting, if any. Where a child is accommodated at a school pursuant to an arrangement under subsection (1), the Minister may pay to the school authority, in each school year during which the child attends the school under the arrangement, an amount not exceeding the school fees ordinarily charged for the accommodation of a non-resident at the school. The amount payable under subsection

##### Part 4 of 7

In [331]:
num = 3

In [329]:
#Build the text of body section number `num`.
#If the section has a list of subsections, concatenate their texts (each
#followed by a space); otherwise fall back to the section's own text.
body_section = doc_dict['Regulation']['Body']['Section'][num]
try:
    subsections = body_section['Subsection']
    len_sub = len(subsections)
    section = "".join(subsection['Text'] + " " for subsection in subsections)
#Only the expected failures are handled: no 'Subsection'/'Text' key
#(KeyError) or a value that is not a list of dicts with string text
#(TypeError). Anything else is a real error and should surface.
except (KeyError, TypeError):
    section = body_section["Text"]

In [330]:
section

'Where a child of a non-taxable employee, pursuant to an arrangement under section 3, attends a school that is not less than five miles and not more than thirty miles from the place of residence, and to attend the school uses a public transportation service approved for the purpose by the Minister, other than the service mentioned in subsection (2) or an urban public transportation service, the Minister may pay to the employee the amount by which the cost to the employee of the service that is rendered to the child exceeds three dollars a month. Where the Minister of National Defence provides a transportation service that can be made available to a child for the purpose of attending a school pursuant to an arrangement under section 3, the Minister may arrange for the child to use that service on such terms and conditions as the Minister of National Defence may prescribe. '

##### Part 5 of 7

In [335]:
num = 4

#Build the text of body section number `num`.
#If the section has a list of subsections, concatenate their texts (each
#followed by a space); otherwise fall back to the section's own text.
body_section = doc_dict['Regulation']['Body']['Section'][num]
try:
    subsections = body_section['Subsection']
    len_sub = len(subsections)
    section = "".join(subsection['Text'] + " " for subsection in subsections)
#Only the expected failures are handled: no 'Subsection'/'Text' key
#(KeyError) or a value that is not a list of dicts with string text
#(TypeError). Anything else is a real error and should surface.
except (KeyError, TypeError):
    section = body_section["Text"]

In [336]:
section

'Where federal land is made available for residential use, the Minister shall, except where in his opinion it is not feasible, lease the land on terms that will not deprive the municipality in which the land is situated, of its right to tax the tenant in respect of his occupancy of the land.'

##### Part 6 of 7

In [364]:
doc_dict['Regulation']['Body']['Section'][num]

{'Label': '6',
 'Paragraph': [{'Label': '(a)',
   'Text': {'XRefExternal': 'Municipal Grants Act'}},
  {'Label': '(b)',
   'Text': 'that is situated in a municipality in which he is subject to municipal taxation in respect of his occupancy of that land.'}],
 'Text': 'No payment shall be made under these regulations in respect of a person who resides on federal land'}

In [394]:
num = 5

body_section = doc_dict['Regulation']['Body']['Section'][num]
#None means "no text built yet"; it lets the fallback below tell whether the
#subsection step produced something.
section = None

#Check if the body section has subsections
try:
    subsections = body_section['Subsection']
    len_sub = len(subsections)
    section = "".join(subsection['Text'] + " " for subsection in subsections)
#No 'Subsection' key, or not a list of dicts with string text
except (KeyError, TypeError):
    pass

#Check if the body section has multiple paragraphs.
#The section's introductory text is repeated in front of every paragraph.
#str() is needed because a paragraph's text can be a dict rather than a string
#(e.g. {'XRefExternal': ...}), in which case its repr ends up in the output.
try:
    paragraphs = body_section['Paragraph']
    len_paragraph = len(paragraphs)
    section = "".join(
        body_section["Text"] + " " + str(paragraph['Text']) + " "
        for paragraph in paragraphs
    )
#If there are no paragraphs, keep the subsection text built above; only when
#there were no subsections either, use the section's own text.
except (KeyError, TypeError):
    if section is None:
        section = body_section["Text"]

In [395]:
section

"No payment shall be made under these regulations in respect of a person who resides on federal land {'XRefExternal': 'Municipal Grants Act'} No payment shall be made under these regulations in respect of a person who resides on federal land that is situated in a municipality in which he is subject to municipal taxation in respect of his occupancy of that land. "

In [388]:
doc_dict['Regulation']['Body']['Section'][num]["Text"]

'No payment shall be made under these regulations in respect of a person who resides on federal land'

In [393]:
str(doc_dict['Regulation']['Body']['Section'][num]['Paragraph'][1]['Text'])

'that is situated in a municipality in which he is subject to municipal taxation in respect of his occupancy of that land.'

In [367]:
len_paragraph = len(doc_dict['Regulation']['Body']['Section'][num]['Paragraph'])
len_paragraph

2

In [379]:
#Concatenate the text of the first len_paragraph paragraphs of section `num`,
#each converted with str() and followed by a space.
paragraphs = doc_dict['Regulation']['Body']['Section'][num]['Paragraph']
section = "".join(
    str(paragraphs[position]['Text']) + " " for position in range(len_paragraph)
)

In [380]:
section

"{'XRefExternal': 'Municipal Grants Act'} that is situated in a municipality in which he is subject to municipal taxation in respect of his occupancy of that land. "

# Alt Method Test 2: SOR-80-394

In [None]:
#Parse the second regulation (SOR-80-394), as announced by the heading of
#this test, and convert it into a nested dictionary keyed by tag.
doc = etree.parse('SOR-80-394.xml')
doc_dict = make_dict_from_tree(doc.getroot())