In [100]:
import pandas as pd
import io
import xml.etree.ElementTree as ElementTree
import os

# Silence FutureWarning messages for the whole session so they do not clutter cell output.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [101]:
#Code to convert XML element tree to a dictionary
def make_dict_from_tree(element_tree):
    """Convert an XML element and all of its descendants into nested dicts.

    Parameters
    ----------
    element_tree : xml.etree.ElementTree.Element or None
        The element to convert (typically ``tree.getroot()``).

    Returns
    -------
    dict
        ``{tag: value}`` for the given element, where ``value`` is

        * the element's ``.text`` (possibly ``None``) when it has no child
          elements, or
        * a dict keyed by child tag when it has children. A child tag that
          occurs once maps directly to that child's value; a tag that occurs
          several times maps to a list of the values in document order.

        An empty dict is returned when ``element_tree`` is ``None``.

    Notes
    -----
    Only tags and leaf text are captured: attributes, tail text, and the own
    text of an element that also has children are not included in the result.
    """
    def internal_iter(tree, accum):
        if tree is None:
            return accum

        # Iterating an Element yields its direct children. This replaces
        # Element.getchildren(), which was removed in Python 3.9.
        children = list(tree)
        if children:
            node = accum[tree.tag] = {}
            for each in children:
                result = internal_iter(each, {})
                if each.tag in node:
                    # Second (or later) occurrence of this tag: promote the
                    # stored value to a list, then append the new one.
                    if not isinstance(node[each.tag], list):
                        node[each.tag] = [node[each.tag]]
                    node[each.tag].append(result[each.tag])
                else:
                    node.update(result)
        else:
            accum[tree.tag] = tree.text
        return accum
    return internal_iter(element_tree, {})

# Test 1: SOR-49-42

In [102]:
# Parse the regulation XML and convert it to nested dicts.
# The import cell binds the parser module as `ElementTree`; no `etree` name
# exists, so the previous `etree.parse(...)` raised NameError on a fresh kernel.
doc = ElementTree.parse('SOR-49-42.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [103]:
# Display the whole converted dictionary to inspect the document's structure.
doc_dict

{'Regulation': {'Body': {'Section': [{'Definition': [{'Text': {'DefinedTermEn': 'child',
        'DefinedTermFr': 'enfant'}},
      {'Text': {'DefinedTermEn': 'federal land',
        'DefinedTermFr': u'terrain f\xe9d\xe9ral'}},
      {'Text': {'DefinedTermEn': 'Minister', 'DefinedTermFr': 'Ministre'}},
      {'Text': {'DefinedTermEn': 'non-taxable employee',
        'DefinedTermFr': u'employ\xe9 non imposable'}},
      {'Text': {'DefinedTermEn': 'school authority',
        'DefinedTermFr': u'autorit\xe9 scolaire'}}],
     'Label': '1',
     'Text': 'In these regulations,'},
    {'Label': '2',
     'Text': 'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordan

##### Identification

In [108]:
#Extracting all the Identification Headers
# Look the two shared parents up once instead of re-walking the dict per field.
identification = doc_dict['Regulation']['Identification']
maker_order = identification['RegulationMakerOrder']

InstrumentNumber = identification['InstrumentNumber']
ConsolidationDate = identification['ConsolidationDate']['Date']
LastModificationDate = identification['LastModifiedDate']['Date']
# list(...) so this is a real list on Python 3 as well; a bare dict view would
# print as "dict_values([...])" instead of the list shown in the output below.
EnablingAuthority = list(identification['EnablingAuthority'].values())
ShortTitle = identification['ShortTitle']
LongTitle = identification['LongTitle']
RegulationMaker = maker_order['RegulationMaker']
# NOTE: "Retulation" is a typo for "Regulation"; the name is kept because the
# checking cell prints this exact variable.
RetulationOrderNumber = maker_order['OrderNumber']
RegulationMakerDate = maker_order['Date']

In [109]:
#Checking all the elements
# Print every identification header, one per line, in extraction order.
for header_value in (
    InstrumentNumber,
    ConsolidationDate,
    LastModificationDate,
    EnablingAuthority,
    ShortTitle,
    LongTitle,
    RegulationMaker,
    RetulationOrderNumber,
    RegulationMakerDate,
):
    print (header_value)

SOR/49-42
{'YYYY': '2018', 'MM': '06', 'DD': '21'}
{'YYYY': '2006', 'MM': '10', 'DD': '26'}
[['APPROPRIATION ACT, NO. 7, 1949', 'APPROPRIATION ACTS']]
Regulations re school fees and transportation costs for children of certain Government employees
Regulations Governing Payment of School Fees and Transportation Costs Re Children of Certain Employees of the Government of Canada
P.C.
1954-1694
{'YYYY': '1954', 'MM': '11', 'DD': '09'}


##### Order

In [57]:
#How many parts (provisions) does this order have?
# make_dict_from_tree stores a tag that occurs once as a bare dict/str and only
# repeated tags as a list, so a lone Provision must be counted as 1; calling
# len() on it would count its keys (or characters) instead.
order_provisions = doc_dict['Regulation']['Order']['Provision']
len(order_provisions) if isinstance(order_provisions, list) else 1

3

In [51]:
#Raw extraction of all text in the 'order'...
# list(...) keeps the string a plain list repr on Python 3 too; str() of a bare
# dict view would produce "dict_values([...])".
order = str(list(doc_dict['Regulation']['Order'].values()))
order

"[[{'Text': 'His Excellency the Governor General in Council, on the recommendation of the Minister of Finance and pursuant to The Appropriation Act, No. 7, 1949, Vote No 938, is pleased to order as follows:'}, {'Text': 'The Regulations governing the payment of school fees and transportation costs of children of certain employees of the Government of Canada, established by Order in Council PC 3455 of 19th July, 1950, are hereby revoked; and', 'Label': '1.'}, {'Text': u'The annexed \\u201cRegulations governing payment of school fees and transportation costs re children of certain employees of the Government of Canada\\u201d are hereby made and established in substitution for the regulations hereby revoked.', 'Label': '2.'}]]"

##### Body

In [7]:
#How many parts (sections) does this body have?
# make_dict_from_tree stores a tag that occurs once as a bare dict/str and only
# repeated tags as a list, so a lone Section must be counted as 1; calling
# len() on it would count its keys (or characters) instead.
body_sections = doc_dict['Regulation']['Body']['Section']
len_body = len(body_sections) if isinstance(body_sections, list) else 1
len_body

7

In [58]:
#Raw extraction of all text in the 'body'...
# list(...) keeps the string a plain list repr on Python 3 too; str() of a bare
# dict view would produce "dict_values([...])".
body = str(list(doc_dict['Regulation']['Body'].values()))
body

"[[{'Text': 'In these regulations,', 'Definition': [{'Text': {'DefinedTermFr': 'enfant', 'DefinedTermEn': 'child'}}, {'Text': {'DefinedTermFr': u'terrain f\\xe9d\\xe9ral', 'DefinedTermEn': 'federal land'}}, {'Text': {'DefinedTermFr': 'Ministre', 'DefinedTermEn': 'Minister'}}, {'Text': {'DefinedTermFr': u'employ\\xe9 non imposable', 'DefinedTermEn': 'non-taxable employee'}}, {'Text': {'DefinedTermFr': u'autorit\\xe9 scolaire', 'DefinedTermEn': 'school authority'}}], 'Label': '1'}, {'Text': 'Where a non-taxable employee resides on federal land that, in the opinion of the Minister, is within easy access of a school established by the Minister of National Defence, the Minister shall, unless the facilities are insufficient, arrange with the Minister of National Defence for the accommodation at that school of the children of the employee, while he is so resident, in accordance with the provisions of Order in Council P.C. 44/2300 of 6th May, 1950.', 'Label': '2'}, {'Subsection': [{'Text': 'Wh

# Test 2: SOR-80-394

In [126]:
# Parse the regulation XML and convert it to nested dicts.
# The import cell binds the parser module as `ElementTree`; no `etree` name
# exists, so the previous `etree.parse(...)` raised NameError on a fresh kernel.
doc = ElementTree.parse('SOR-80-394.xml')
doc_dict = make_dict_from_tree(doc.getroot())

#Extracting all the Identification Headers
# Look the two shared parents up once instead of re-walking the dict per field.
identification = doc_dict['Regulation']['Identification']
maker_order = identification['RegulationMakerOrder']

InstrumentNumber = identification['InstrumentNumber']
RegistrationDate = identification['RegistrationDate']['Date']
ConsolidationDate = identification['ConsolidationDate']['Date']
LastModificationDate = identification['LastModifiedDate']['Date']
# list(...) so dict views become real lists on Python 3 as well (a bare view
# prints as "dict_values([...])").
EnablingAuthority = list(identification['EnablingAuthority'].values())
ShortTitle = identification['ShortTitle']
LongTitle = identification['LongTitle']
RegulationMaker = maker_order['RegulationMaker']
# NOTE: "Retulation" is a typo for "Regulation"; the name is kept because the
# checking cell prints this exact variable.
RetulationOrderNumber = maker_order['OrderNumber']
RegulationMakerDate = maker_order['Date']

#How many parts (provisions) does this order have?
# make_dict_from_tree stores a tag that occurs once as a bare dict/str and only
# repeated tags as a list, so a lone element is counted as 1 rather than by
# len(), which would count its keys (or characters).
order_provisions = doc_dict['Regulation']['Order']['Provision']
len_order = len(order_provisions) if isinstance(order_provisions, list) else 1

#Raw extraction of all text in the 'order'...
order = str(list(doc_dict['Regulation']['Order'].values()))

#How many parts (sections) does this body have?
body_sections = doc_dict['Regulation']['Body']['Section']
len_body = len(body_sections) if isinstance(body_sections, list) else 1

#Raw extraction of all text in the 'body'...
body = str(list(doc_dict['Regulation']['Body'].values()))

#Raw extraction of all text in the Schedule
schedule = list(doc_dict['Regulation']['Schedule'].values())

In [127]:
#Checking all the Identification elements
# Print every identification header, one per line, in extraction order.
for header_value in (
    InstrumentNumber,
    RegistrationDate,
    ConsolidationDate,
    LastModificationDate,
    EnablingAuthority,
    ShortTitle,
    LongTitle,
    RegulationMaker,
    RetulationOrderNumber,
    RegulationMakerDate,
):
    print (header_value)

SOR/80-394
{'YYYY': '1980', 'MM': '5', 'DD': '27'}
{'YYYY': '2018', 'MM': '06', 'DD': '21'}
{'YYYY': '2016', 'MM': '11', 'DD': '28'}
['CANADIAN HUMAN RIGHTS ACT']
Human Rights Tribunal Appeal Regulations
Regulations Respecting the Manner and Form for Making an Appeal from the Decision or Order of a Human Rights Tribunal
P.C.
1980-1411
{'YYYY': '1980', 'MM': '5', 'DD': '27'}


In [128]:
#Checking the Order
# Show the provision count first, then the raw order text.
print ("Length of order: {}".format(len_order))
print (order)

Length of order: 1
[{'Text': {'XRefExternal': ['Canadian Human Rights Act', 'Regulations respecting the manner and form for making an appeal from the decision or order of a Human Rights Tribunal']}}]


In [129]:
#Checking the Body
# Show the section count first, then the raw body text.
print ("Length of body: {}".format(len_body))
print (body)

Length of body: 6
[[{'Text': {'XRefExternal': 'Human Rights Tribunal Appeal Regulations'}, 'Label': '1'}, {'Text': {'Repealed': '[Revoked, SOR/90-286, s. 1]'}, 'Label': '2'}, {'Text': {'Repealed': '[Revoked, SOR/90-286, s. 2]'}, 'Label': '3'}, {'Text': 'Service of the notice of appeal on all parties may be made personally or by registered mail.', 'Label': '4'}, {'Text': {'Repealed': '[Revoked, SOR/90-286, s. 3]'}, 'Label': '5'}, {'Text': 'A notice of appeal shall be in the form set out in the schedule.', 'Label': '6'}], [{'TitleText': 'Short Title'}, {'TitleText': 'Notice of Appeal'}]]


In [132]:
#Checking the schedule
print (schedule)

[{'TitleText': 'Notice of Appeal', 'Label': 'SCHEDULE'}, [{'Text': {'Leader': [None, None, None]}}, {'Text': {'LeaderRightJustified': None, 'Leader': None}}, {'Text': {'LeaderRightJustified': None}}, {'Text': 'The Appellant appeals on the following grounds (a brief statement of the points to be argued).'}, {'Text': {'Leader': [None, None, None, None]}, 'SignatureBlock': {'SignatureLine': {'Leader': None}, 'SignatureName': 'Appellant'}}, {'Text': 'TO: President', 'Provision': [{'Text': 'Human Rights Tribunal Panel'}, {'Text': 'Ottawa, Ontario'}, {'Text': 'K1A 1J4'}]}, {'Text': 'AND TO:'}], 'SOR/86-294, s. 3; SOR/90-286, s. 4; SOR/94-237, s. 1.']


# Test 3: SOR-2018-46

In [137]:
# Parse the regulation XML and convert it to nested dicts.
# The import cell binds the parser module as `ElementTree`; no `etree` name
# exists, so the previous `etree.parse(...)` raised NameError on a fresh kernel.
doc = ElementTree.parse('SOR-2018-46.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [140]:
#Extracting all the Identification Headers
def _lookup(mapping, path, default=""):
    """Follow `path` (a sequence of keys) through nested dicts.

    Returns `default` as soon as a key is missing or an intermediate value
    cannot be indexed by that key (e.g. it is a str, a list or None).
    Only those lookup failures are handled; any other error propagates.
    """
    node = mapping
    for key in path:
        try:
            node = node[key]
        except (KeyError, TypeError):
            return default
    return node


def _count(value):
    """Count repeated elements: len() for a list, 1 for a lone element.

    make_dict_from_tree stores a tag that occurs once as a bare dict/str, so
    len() on it would count keys or characters rather than elements.
    """
    return len(value) if isinstance(value, list) else 1


_ID = ('Regulation', 'Identification')
_MAKER = _ID + ('RegulationMakerOrder',)

# Every header falls back to "" when it is absent from this regulation.
InstrumentNumber = _lookup(doc_dict, _ID + ('InstrumentNumber',))
RegistrationDate = _lookup(doc_dict, _ID + ('RegistrationDate', 'Date'))
# Fixed: the fallback for this field used to assign RegistrationDate, which
# wiped a valid RegistrationDate and left ConsolidationDate unset or stale.
ConsolidationDate = _lookup(doc_dict, _ID + ('ConsolidationDate', 'Date'))
LastModificationDate = _lookup(doc_dict, _ID + ('LastModifiedDate', 'Date'))

# list(...) so the dict view becomes a real list on Python 3 as well.
_authority = _lookup(doc_dict, _ID + ('EnablingAuthority',), None)
EnablingAuthority = list(_authority.values()) if isinstance(_authority, dict) else ""

ShortTitle = _lookup(doc_dict, _ID + ('ShortTitle',))
LongTitle = _lookup(doc_dict, _ID + ('LongTitle',))
RegulationMaker = _lookup(doc_dict, _MAKER + ('RegulationMaker',))
# NOTE: "Retulation" is a typo for "Regulation"; the name is kept because the
# checking cell prints this exact variable.
RetulationOrderNumber = _lookup(doc_dict, _MAKER + ('OrderNumber',))
RegulationMakerDate = _lookup(doc_dict, _MAKER + ('Date',))

#Order Section of the Regulation
_order_node = _lookup(doc_dict, ('Regulation', 'Order'), None)
try:
    len_order = _count(_order_node['Provision'])
    order = str(list(_order_node.values()))
except (KeyError, TypeError):
    # No Order/Provision: reset the count as well, so the checking cell does
    # not report a value left over from a previously loaded document.
    len_order = 0
    order = ""

#Body Section of the Regulation
_body_node = _lookup(doc_dict, ('Regulation', 'Body'), None)
try:
    len_body = _count(_body_node['Section'])
    body = str(list(_body_node.values()))
except (KeyError, TypeError):
    # No Body/Section: reset the count as well (see the Order fallback).
    len_body = 0
    body = ""

#Schedule Section of the Regulation
_schedule_node = _lookup(doc_dict, ('Regulation', 'Schedule'), None)
schedule = list(_schedule_node.values()) if isinstance(_schedule_node, dict) else ""

In [141]:
#Checking all the Identification elements
# Print every identification header, one per line, in extraction order.
for header_value in (
    InstrumentNumber,
    RegistrationDate,
    ConsolidationDate,
    LastModificationDate,
    EnablingAuthority,
    ShortTitle,
    LongTitle,
    RegulationMaker,
    RetulationOrderNumber,
    RegulationMakerDate,
):
    print (header_value)

SOR/2018-46
{'YYYY': '2018', 'MM': '3', 'DD': '20'}
{'YYYY': '2018', 'MM': '06', 'DD': '21'}
{'YYYY': '2018', 'MM': '4', 'DD': '4'}
['CRIMINAL CODE']
Human Rights Tribunal Appeal Regulations
Order Declaring an Amnesty Period (2018)
P.C.
2018-301
{'YYYY': '2018', 'MM': '3', 'DD': '20'}


In [142]:
#Checking the Order
# Show the provision count first, then the raw order text.
print ("Length of order: {}".format(len_order))
print (order)

Length of order: 1
[[{'Text': 'S.C. 1995, c. 39, s. 139', 'Label': 'a'}, {'Text': 'R.S., c. C-46', 'Label': 'b'}], {'Text': {'XRefExternal': ['Criminal Code', 'Order Declaring an Amnesty Period (2018)'], 'FootnoteRef': ['a', 'b']}}]


In [143]:
#Checking the Body
# Show the section count first, then the raw body text.
print ("Length of body: {}".format(len_body))
print (body)

Length of body: 3
[[{'Text': {'DefinedTermEn': 'firearm'}, 'Paragraph': [{'Text': 'a SAN Swiss Arms Model Classic Green Sniper rifle;', 'Label': '(a)'}, {'Text': 'a SAN Swiss Arms Model Ver rifle;', 'Label': '(b)'}, {'Text': 'a SAN Swiss Arms Model Aestas rifle;', 'Label': '(c)'}, {'Text': 'a SAN Swiss Arms Model Autumnus rifle; and', 'Label': '(d)'}, {'Text': 'a SAN Swiss Arms Model Hiemis rifle.', 'Label': '(e)'}], 'MarginalNote': {'DefinitionRef': 'firearm'}, 'Label': '1'}, {'Subsection': [{'Text': {'XRefExternal': 'Criminal Code'}, 'Paragraph': [{'Text': {'XRefExternal': 'Firearms Act'}, 'Label': '(a)'}, {'Text': 'during the amnesty period, continues to hold a licence while in possession of the firearm.', 'Label': '(b)'}], 'Label': '(1)'}, {'Text': 'The purpose of the amnesty period is to permit the person to do any of the following during that period:', 'Paragraph': [{'Text': 'possess the firearm;', 'Label': '(a)'}, {'Text': 'deliver the firearm to a peace officer, firearms office