In [2]:
#Import the required packages
import pandas as pd
import xml.etree.ElementTree
from lxml import etree
import os
import io

#Hide Future Version warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#Code to convert XML element tree to a dictionary
def make_dict_from_tree(element_tree):
    def internal_iter(tree, accum):
        if tree is None:
            return accum
        
        if tree.getchildren():
            accum[tree.tag] = {}
            for each in tree.getchildren():
                result = internal_iter(each, {})
                if each.tag in accum[tree.tag]:
                    if not isinstance(accum[tree.tag][each.tag], list):
                        accum[tree.tag][each.tag] = [
                            accum[tree.tag][each.tag]
                        ]
                    accum[tree.tag][each.tag].append(result[each.tag])
                else:
                    accum[tree.tag].update(result)
        else:
            accum[tree.tag] = tree.text
        return accum
    return internal_iter(element_tree, {})

In [11]:
#function to clean each body text
def body_clean(text):
    cleaned = text.replace("Text", "")
    cleaned = cleaned.replace("[", "")
    cleaned = cleaned.replace("]", "")
    cleaned = cleaned.replace("{", "")
    cleaned = cleaned.replace("}", "")
    return cleaned

In [12]:
#function to clean each label tag
def label_clean(text):
    cleaned = body_clean(text)
    cleaned = cleaned.replace("'", "")
    return cleaned

# Test - Preparing the Body Section of Each Regulation

In [15]:
list_regs = ['SOR-2013-39.xml','SOR-2000-107.xml','SOR-2005-334.xml']

#Final list of each regulation's formatted body text to append to the dataframe
list_body_final = []

for reg in list_regs:
    #Create the master dictionary for each regulation 
    doc = etree.parse(reg)
    doc_dict = make_dict_from_tree(doc.getroot())
    
    #Create the nested list for each regulation's label and associated text 
    ###Structure for one regulation --> [[Label: X, [phrase1, phrase2, phrase3]],[Label: Y, [Phrase5, Phrase6]]]
    list_reg_labeled_body = []  
    
    #Iterate through each part of a regulation's body (i.e. the labels such as label 1, label 2 etc.)
    Parts = len(doc_dict['Regulation']['Body']['Section'])
    
    for i in range(0,Parts):
        #Identify the title (e.g.number) of each section in the regulation body
        label_number = str(doc_dict['Regulation']['Body']['Section'][i]["Label"])
        label_tag = "Label: " + label_clean(label_number)

        #Determine all the contents associated with the label, removing the label key
        label_text = doc_dict['Regulation']['Body']['Section'][i].copy()
        del label_text['Label']

        #Format the text for each label, removing remains of xml tags and dictionary formatting
        label_text = body_clean(str(label_text))
        label_text = label_text.split("'")
        list_label_text = []

        for phrase in label_text:
            if len(phrase) > 2:
                list_label_text.append(phrase)

        #Create the nested pair of label_tag + label_text --> [Label: X, [phrase1, phrase2, phrase3]]
        label_pair = []
        label_pair.append(label_tag)
        label_pair.append(list_label_text)
    
        #Append all the nested lists for each individual regulation
        list_reg_labeled_body.append(label_pair)   
        
    #Append each regulations final label-text nested list for adding to the df
    list_body_final.append(list_reg_labeled_body)

###### Testing the outputs of the three regs used

In [27]:
#Testing the output
len(list_body_final)

3

In [28]:
len(list_body_final[0])

4

In [29]:
list_body_final[0]

[['Label: 1',
  ['XRefExternal',
   'CPA Canada Public Sector Accounting Handbook',
   'DefinedTermEn',
   'government',
   'MarginalNote',
   'DefinitionRef',
   'government',
   'HistoricalNote',
   'SOR/2017-114, s. 5.']],
 ['Label: 2',
  ['Subsection',
   'XRefExternal',
   'Northwest Territories Act',
   'Paragraph',
   'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;',
   'Label',
   '(a)',
   'XRefInternal',
   'Label',
   '(b)',
   'XRefInternal',
   'Label',
   '(c)',
   'a contingent liability incurred as a result of any loan guarantee provided by the government.',
   'Label',
   '(d)',
   'Label',
   '(1)',
   'Each of the following does not constitute or is deemed not to constitute borrowing:',
   'Paragraph',
   'an obligation, liability or contingent liability incurred as a result of any transaction referred to in subsection

In [24]:
list_body_final[0][0]

['Label: 1',
 ['XRefExternal',
  'CPA Canada Public Sector Accounting Handbook',
  'DefinedTermEn',
  'government',
  'MarginalNote',
  'DefinitionRef',
  'government',
  'HistoricalNote',
  'SOR/2017-114, s. 5.']]

In [25]:
list_body_final[0][0][0]

'Label: 1'

In [26]:
list_body_final[0][0][1]

['XRefExternal',
 'CPA Canada Public Sector Accounting Handbook',
 'DefinedTermEn',
 'government',
 'MarginalNote',
 'DefinitionRef',
 'government',
 'HistoricalNote',
 'SOR/2017-114, s. 5.']

In [30]:
list_body_final[1]

[['Label: 1',
  ['The definitions in this section apply in these Regulations.',
   'Definition',
   'DefinedTermFr',
   'facteur de bioaccumulation',
   'DefinedTermEn',
   'bioaccumulation factor',
   'DefinedTermFr',
   'facteur de bioconcentration',
   'DefinedTermEn',
   'bioconcentration factor',
   'DefinedTermFr',
   'demi-vie',
   'DefinedTermEn',
   'half-life',
   'DefinedTermFr',
   'coefficient de partage octanol-eau',
   'DefinedTermEn',
   'octanol-water partition coefficient']],
 ['Label: 2', ['XRefExternal', 'Canadian Environmental Protection Act, 1999']],
 ['Label: 3',
  ['A substance is persistent when it has at least one of the following characteristics:',
   'Paragraph',
   'in air,',
   'Subparagraph',
   'its half-life is equal to or greater than 2 days, or',
   'Label',
   '(i)',
   'it is subject to atmospheric transport from its source to a remote area;',
   'Label',
   '(ii)',
   'Label',
   '(a)',
   'in water, its half-life is equal to or greater than 182 da

In [38]:
list_body_final[1][2][1]

['A substance is persistent when it has at least one of the following characteristics:',
 'Paragraph',
 'in air,',
 'Subparagraph',
 'its half-life is equal to or greater than 2 days, or',
 'Label',
 '(i)',
 'it is subject to atmospheric transport from its source to a remote area;',
 'Label',
 '(ii)',
 'Label',
 '(a)',
 'in water, its half-life is equal to or greater than 182 days;',
 'Label',
 '(b)',
 'in sediments, its half-life is equal to or greater than 365 days; or',
 'Label',
 '(c)',
 'in soil, its half-life is equal to or greater than 182 days.',
 'Label',
 '(d)']

In [46]:
list_body_final[1][5][0]

'Label: 6'

# Looping through all Regs & Flagging Errors (SOR Only)

In [83]:
#Obtain the list of regulations only from the working directory
list_regs = []
files = [f for f in os.listdir('.') if os.path.isfile(f)]

for file in files:
    if file[-3:] == 'xml':
        list_regs.append(file)

In [90]:
#Final list of each regulation's formatted body text to append to the dataframe
list_body_final = []
list_exceptions = []

for reg in list_regs:
    #Create the master dictionary for each regulation 
    doc = etree.parse(reg)
    doc_dict = make_dict_from_tree(doc.getroot())
    
    #Create the nested list for each regulation's label and associated text 
    ###Structure for one regulation --> [[Label: X, [phrase1, phrase2, phrase3]],[Label: Y, [Phrase5, Phrase6]]]
    list_reg_labeled_body = []  
    
    #The following code will execute assuming the reg follows the conventional structure...track the exceptions
    try:
        #Iterate through each part of a regulation's body (i.e. the labels such as label 1, label 2 etc.)
        Parts = len(doc_dict['Regulation']['Body']['Section'])

        for i in range(0,Parts):
            #Identify the title (e.g.number) of each section in the regulation body
            label_number = str(doc_dict['Regulation']['Body']['Section'][i]["Label"])
            label_tag = "Label: " + label_clean(label_number)

            #Determine all the contents associated with the label, removing the label key
            label_text = doc_dict['Regulation']['Body']['Section'][i].copy()
            del label_text['Label']

            #Format the text for each label, removing remains of xml tags and dictionary formatting
            label_text = body_clean(str(label_text))
            label_text = label_text.split("'")
            list_label_text = []

            for phrase in label_text:
                if len(phrase) > 2:
                    list_label_text.append(phrase)

            #Create the nested pair of label_tag + label_text --> [Label: X, [phrase1, phrase2, phrase3]]
            label_pair = []
            label_pair.append(label_tag)
            label_pair.append(list_label_text)

            #Append all the nested lists for each individual regulation
            list_reg_labeled_body.append(label_pair)   

            #Append each regulations final label-text nested list for adding to the df
            list_body_final.append(list_reg_labeled_body)
        
    except:
        list_body_final.append("Exception")
        list_exceptions.append(reg)

In [96]:
print (len(list_exceptions))
print (len(files))
print (69.0/2638)

69
2638
0.0261561789234


In [92]:
list_exceptions

['SOR-2000-131.xml',
 'SOR-2001-182.xml',
 'SOR-2001-286.xml',
 'SOR-2001-297.xml',
 'SOR-2002-351.xml',
 'SOR-2002-420.xml',
 'SOR-2002-48.xml',
 'SOR-2003-312.xml',
 'SOR-2003-313.xml',
 'SOR-2005-226.xml',
 'SOR-2006-139.xml',
 'SOR-2006-247.xml',
 'SOR-2006-288.xml',
 'SOR-2006-298.xml',
 'SOR-2006-43.xml',
 'SOR-2007-286.xml',
 'SOR-2007-71.xml',
 'SOR-2008-249.xml',
 'SOR-2009-14.xml',
 'SOR-2009-202.xml',
 'SOR-2010-166.xml',
 'SOR-2011-115.xml',
 'SOR-2011-168.xml',
 'SOR-2011-227.xml',
 'SOR-2011-52.xml',
 'SOR-2013-141.xml',
 'SOR-2013-142.xml',
 'SOR-2014-18.xml',
 'SOR-2014-236.xml',
 'SOR-2014-59.xml',
 'SOR-2014-61.xml',
 'SOR-2016-42.xml',
 'SOR-2017-205.xml',
 'SOR-2017-245.xml',
 'SOR-2017-73.xml',
 'SOR-56-290.xml',
 'SOR-57-176.xml',
 'SOR-61-378.xml',
 'SOR-78-148.xml',
 'SOR-78-223.xml',
 'SOR-78-68.xml',
 'SOR-79-158.xml',
 'SOR-80-629.xml',
 'SOR-80-957.xml',
 'SOR-81-543.xml',
 'SOR-84-764.xml',
 'SOR-87-40.xml',
 'SOR-87-592.xml',
 'SOR-87-91.xml',
 'SOR-88-504

In [98]:
doc = etree.parse('SOR-79-613.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [101]:
doc_dict['Regulation'].keys()

['Body', 'Identification', 'Order', 'Schedule']

# How to Handle Order Section

In [112]:
doc_dict['Regulation'].keys()

['Body', 'Identification', 'Order', 'Schedule']

In [113]:
doc_dict['Regulation']['Order']['Provision']['Text']

{'XRefExternal': ['Statutory Instruments Act',
  'Regulations respecting a special issue of Part II of the Canada Gazette to be published in conjunction with the Consolidated Regulations of Canada, 1978']}

In [115]:
doc_dict['Regulation']['Order']['Provision']['Text'].keys()

['XRefExternal']

In [7]:
doc = etree.parse('SOR-2013-39.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [9]:
doc_dict['Regulation'].

{'Regulation': {'Body': {'Heading': [{'TitleText': 'Definition'},
    {'TitleText': 'Borrowing'},
    {'TitleText': 'Value of Borrowing'},
    {'TitleText': 'Coming into Force'}],
   'Section': [{'HistoricalNote': 'SOR/2017-114, s. 5.',
     'Label': '1',
     'MarginalNote': {'DefinitionRef': 'government'},
     'Text': {'DefinedTermEn': 'government',
      'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}},
    {'HistoricalNote': 'SOR/2017-114, s. 3.',
     'Label': '2',
     'MarginalNote': 'Meaning',
     'Subsection': [{'Label': '(1)',
       'Paragraph': [{'Label': '(a)',
         'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
        {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
        {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
        {'Label': '(d)',
         'Text': 'a contingent liability in

In [34]:
doc_dict['Regulation']['Body']

{'Heading': [{'TitleText': 'Definition'},
  {'TitleText': 'Borrowing'},
  {'TitleText': 'Value of Borrowing'},
  {'TitleText': 'Coming into Force'}],
 'Section': [{'HistoricalNote': 'SOR/2017-114, s. 5.',
   'Label': '1',
   'MarginalNote': {'DefinitionRef': 'government'},
   'Text': {'DefinedTermEn': 'government',
    'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}},
  {'HistoricalNote': 'SOR/2017-114, s. 3.',
   'Label': '2',
   'MarginalNote': 'Meaning',
   'Subsection': [{'Label': '(1)',
     'Paragraph': [{'Label': '(a)',
       'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
      {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
      {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
      {'Label': '(d)',
       'Text': 'a contingent liability incurred as a result of any loan guarantee provided by the gov

In [42]:
doc_dict['Regulation']['Body']['Section']

[{'HistoricalNote': 'SOR/2017-114, s. 5.',
  'Label': '1',
  'MarginalNote': {'DefinitionRef': 'government'},
  'Text': {'DefinedTermEn': 'government',
   'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}},
 {'HistoricalNote': 'SOR/2017-114, s. 3.',
  'Label': '2',
  'MarginalNote': 'Meaning',
  'Subsection': [{'Label': '(1)',
    'Paragraph': [{'Label': '(a)',
      'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
     {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
     {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
     {'Label': '(d)',
      'Text': 'a contingent liability incurred as a result of any loan guarantee provided by the government.'}],
    'Text': {'XRefExternal': 'Northwest Territories Act'}},
   {'Label': '(2)',
    'MarginalNote': 'Exclusions',
    'Paragraph': [{'Label': '(a)',
      'Text': 

In [47]:
len(doc_dict['Regulation']['Body']['Section'])

4

In [48]:
#Section 1
doc_dict['Regulation']['Body']['Section'][0]

{'HistoricalNote': 'SOR/2017-114, s. 5.',
 'Label': '1',
 'MarginalNote': {'DefinitionRef': 'government'},
 'Text': {'DefinedTermEn': 'government',
  'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}}

In [46]:
len(doc_dict['Regulation']['Body']['Section'][0])

4

In [51]:
doc_dict['Regulation']['Body']['Section'][0]['Label']

'1'

In [54]:
doc_dict['Regulation']['Body']['Section'][0]['HistoricalNote']

'SOR/2017-114, s. 5.'

In [55]:
doc_dict['Regulation']['Body']['Section'][0]['Text']

{'DefinedTermEn': 'government',
 'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}

In [57]:
#Section 2
doc_dict['Regulation']['Body']['Section'][1]

{'HistoricalNote': 'SOR/2017-114, s. 3.',
 'Label': '2',
 'MarginalNote': 'Meaning',
 'Subsection': [{'Label': '(1)',
   'Paragraph': [{'Label': '(a)',
     'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
    {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
    {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
    {'Label': '(d)',
     'Text': 'a contingent liability incurred as a result of any loan guarantee provided by the government.'}],
   'Text': {'XRefExternal': 'Northwest Territories Act'}},
  {'Label': '(2)',
   'MarginalNote': 'Exclusions',
   'Paragraph': [{'Label': '(a)',
     'Text': 'an obligation, liability or contingent liability incurred as a result of any transaction referred to in subsection (1) between any two parties within the government; and'},
    {'Label': '(b)',
     'Text': {'XRefExternal': ['National 

In [58]:
len(doc_dict['Regulation']['Body']['Section'][1])

4

In [62]:
doc_dict['Regulation']['Body']['Section'][1].keys()

['Subsection', 'MarginalNote', 'HistoricalNote', 'Label']

In [63]:
doc_dict['Regulation']['Body']['Section'][1]['Label']

'2'

In [65]:
doc_dict['Regulation']['Body']['Section'][1]['HistoricalNote']

'SOR/2017-114, s. 3.'

In [66]:
doc_dict['Regulation']['Body']['Section'][1]['MarginalNote']

'Meaning'

In [67]:
doc_dict['Regulation']['Body']['Section'][1]['Subsection']

[{'Label': '(1)',
  'Paragraph': [{'Label': '(a)',
    'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
   {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
   {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
   {'Label': '(d)',
    'Text': 'a contingent liability incurred as a result of any loan guarantee provided by the government.'}],
  'Text': {'XRefExternal': 'Northwest Territories Act'}},
 {'Label': '(2)',
  'MarginalNote': 'Exclusions',
  'Paragraph': [{'Label': '(a)',
    'Text': 'an obligation, liability or contingent liability incurred as a result of any transaction referred to in subsection (1) between any two parties within the government; and'},
   {'Label': '(b)',
    'Text': {'XRefExternal': ['National Housing Act',
      'National Housing Act']}}],
  'Text': 'Each of the following does not constitute or is deemed

In [75]:
doc_dict['Regulation']['Body']['Section'][1]['Subsection'][0].keys()

['Text', 'Paragraph', 'Label']

In [78]:
doc_dict['Regulation']['Body']['Section'][1]['Subsection'][0]['Paragraph']

[{'Label': '(a)',
  'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
 {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
 {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
 {'Label': '(d)',
  'Text': 'a contingent liability incurred as a result of any loan guarantee provided by the government.'}]

### Extracting each label & its respective text

In [221]:
#function to clean each body text
def body_clean(text):
    cleaned = text.replace("Text", "")
    cleaned = cleaned.replace("[", "")
    cleaned = cleaned.replace("]", "")
    cleaned = cleaned.replace("{", "")
    cleaned = cleaned.replace("}", "")
    return cleaned

In [222]:
#function to clean each label tag
def label_clean(text):
    cleaned = body_clean(text)
    cleaned = cleaned.replace("'", "")
    return cleaned

In [223]:
doc = etree.parse('SOR-2013-39.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [224]:
Parts = len(doc_dict['Regulation']['Body']['Section'])
Parts

4

In [274]:
Parts = len(doc_dict['Regulation']['Body']['Section'])

#Create the nested list for each regulation's label and associated text 
###Structure for one regulation --> [[Label: X, [phrase1, phrase2, phrase3]],[Label: Y, [Phrase5, Phrase6]]]
reg_body_labeled = []
        
for i in range(0,Parts):
    #Identify the title (e.g.number) of each section in the regulation body
    label_number = str(doc_dict['Regulation']['Body']['Section'][i]["Label"])
    label_tag = "Label: " + label_clean(label_number)
                
    #Determine all the contents associated with the label, removing the label key
    label_text = doc_dict['Regulation']['Body']['Section'][i].copy()
    del label_text['Label']
    
    #Format the text for each label, removing remains of xml tags and dictionary formatting
    label_text = body_clean(str(label_text))
    label_text = label_text.split("'")
    label_text_list = []
    for phrase in label_text:
        if len(phrase) > 2:
            label_text_list.append(phrase)
    
    #Create the nested pair of label_tag + label_text --> [Label: X, [phrase1, phrase2, phrase3]]
    label_pair = []
    label_pair.append(label_tag)
    label_pair.append(label_text_list)
    
    #Add to the master list of the individual regulation
    reg_body_labeled.append(label_pair)   

In [275]:
len(reg_body_labeled)

4

In [276]:
len(reg_body_labeled[0])

2

In [277]:
reg_body_labeled[0][0]

'Label: 1'

In [278]:
reg_body_labeled[3][1]

['Footnote',
 'Emphasis',
 'see',
 'Label',
 'XRefExternal',
 'Jobs, Growth and Long-term Prosperity Act',
 'MarginalNote',
 'S.C. 2012, c. 19']

In [279]:
string_clean(str(label_text))

"'', 'Footnote', ': ', '', ': ', 'Emphasis', ': ', 'see', ', ', 'Label', ': ', '*', ', ', '', ': ', 'XRefExternal', ': ', 'Jobs, Growth and Long-term Prosperity Act', ', ', 'MarginalNote', ': ', 'S.C. 2012, c. 19', ''"

In [195]:
test = (str(label_text)).split("'")

In [196]:
type(test)

list

In [194]:
for word in test:
    if len(word) > 2:
        print (word)

Footnote
Emphasis
see
Label
XRefExternal
Jobs, Growth and Long-term Prosperity Act
MarginalNote
S.C. 2012, c. 19


In [171]:
label_tag

"Label {'FootnoteRef': '*'} - "

In [155]:
str(label_text)

"{'Footnote': {'Text': {'Emphasis': 'see'}, 'Label': '*'}, 'Text': {'XRefExternal': 'Jobs, Growth and Long-term Prosperity Act'}, 'MarginalNote': 'S.C. 2012, c. 19'}"

In [143]:
print label_tag

Label {'FootnoteRef': '*'} - 


In [144]:
doc_dict['Regulation']['Body']['Section'][0]

{'HistoricalNote': 'SOR/2017-114, s. 5.',
 'Label': '1',
 'MarginalNote': {'DefinitionRef': 'government'},
 'Text': {'DefinedTermEn': 'government',
  'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}}

In [145]:
str(doc_dict['Regulation']['Body']['Section'][0])

"{'Text': {'XRefExternal': 'CPA Canada Public Sector Accounting Handbook', 'DefinedTermEn': 'government'}, 'MarginalNote': {'DefinitionRef': 'government'}, 'HistoricalNote': 'SOR/2017-114, s. 5.', 'Label': '1'}"

In [135]:
doc_dict['Regulation']['Body']['Section'][3]

{'Footnote': {'Label': '*', 'Text': {'Emphasis': 'see'}},
 'Label': {'FootnoteRef': '*'},
 'MarginalNote': 'S.C. 2012, c. 19',
 'Text': {'XRefExternal': 'Jobs, Growth and Long-term Prosperity Act'}}

In [92]:
# Second test 
doc2 = etree.parse('SOR-2011-108.xml')
doc_dict2 = make_dict_from_tree(doc2.getroot())




In [93]:
doc_dict2['Regulation']['Body'].keys()

['Section', 'Heading']

In [97]:
len(doc_dict2['Regulation']['Body']['Section'])

12

In [111]:
doc_dict2['Regulation']['Body']['Section'][0]

{'Label': '1',
 'MarginalNote': 'Definitions',
 'Subsection': [{'Label': '(1)',
   'Text': {'XRefExternal': 'Financial Administration Act'}},
  {'Definition': [{'MarginalNote': [{'DefinedTermEn': 'action or proceeding'},
      {'DefinedTermFr': u'enqu\xeate ou proc\xe9dure'}],
     'Text': {'DefinedTermEn': 'action or proceeding'}},
    {'MarginalNote': [{'DefinedTermEn': 'officer'},
      {'DefinedTermFr': 'dirigeant'}],
     'Text': {'DefinedTermEn': 'officer'}},
    {'MarginalNote': [{'DefinedTermEn': 'party'}, {'DefinedTermFr': 'partie'}],
     'Text': {'DefinedTermEn': 'party'}}],
   'Label': '(2)',
   'MarginalNote': 'Definitions',
   'Text': 'The following definitions apply for the purposes of section 119 of the Act,'}]}

In [112]:
doc_dict2['Regulation']['Body']['Section'][0].keys()

['Subsection', 'MarginalNote', 'Label']

In [121]:
doc_dict2['Regulation']['Body']['Section'][0]['Subsection'][1]

{'Definition': [{'MarginalNote': [{'DefinedTermEn': 'action or proceeding'},
    {'DefinedTermFr': u'enqu\xeate ou proc\xe9dure'}],
   'Text': {'DefinedTermEn': 'action or proceeding'}},
  {'MarginalNote': [{'DefinedTermEn': 'officer'},
    {'DefinedTermFr': 'dirigeant'}],
   'Text': {'DefinedTermEn': 'officer'}},
  {'MarginalNote': [{'DefinedTermEn': 'party'}, {'DefinedTermFr': 'partie'}],
   'Text': {'DefinedTermEn': 'party'}}],
 'Label': '(2)',
 'MarginalNote': 'Definitions',
 'Text': 'The following definitions apply for the purposes of section 119 of the Act,'}

In [96]:
doc_dict2['Regulation']['Body']['Section']

[{'Label': '1',
  'MarginalNote': 'Definitions',
  'Subsection': [{'Label': '(1)',
    'Text': {'XRefExternal': 'Financial Administration Act'}},
   {'Definition': [{'MarginalNote': [{'DefinedTermEn': 'action or proceeding'},
       {'DefinedTermFr': u'enqu\xeate ou proc\xe9dure'}],
      'Text': {'DefinedTermEn': 'action or proceeding'}},
     {'MarginalNote': [{'DefinedTermEn': 'officer'},
       {'DefinedTermFr': 'dirigeant'}],
      'Text': {'DefinedTermEn': 'officer'}},
     {'MarginalNote': [{'DefinedTermEn': 'party'},
       {'DefinedTermFr': 'partie'}],
      'Text': {'DefinedTermEn': 'party'}}],
    'Label': '(2)',
    'MarginalNote': 'Definitions',
    'Text': 'The following definitions apply for the purposes of section 119 of the Act,'}]},
 {'Label': '2',
  'MarginalNote': 'Request',
  'Subsection': [{'Label': '(1)',
    'Text': 'A request for an indemnification or advance under section 119 of the Act shall be made in writing to the Secretary of the Treasury Board as soon as

In [90]:
doc_dict2['Regulation']['Body']['Section'][0]

{'HistoricalNote': 'SOR/2017-114, s. 5.',
 'Label': '1',
 'MarginalNote': {'DefinitionRef': 'government'},
 'Text': {'DefinedTermEn': 'government',
  'XRefExternal': 'CPA Canada Public Sector Accounting Handbook'}}

In [91]:
doc_dict2['Regulation']['Body']['Section'][1]

{'HistoricalNote': 'SOR/2017-114, s. 3.',
 'Label': '2',
 'MarginalNote': 'Meaning',
 'Subsection': [{'Label': '(1)',
   'Paragraph': [{'Label': '(a)',
     'Text': 'an obligation incurred as a result of any loan of money received by the government, including a loan made by the issuance and sale of bonds, debentures, notes or any other evidence of indebtedness;'},
    {'Label': '(b)', 'Text': {'XRefInternal': '3'}},
    {'Label': '(c)', 'Text': {'XRefInternal': '3'}},
    {'Label': '(d)',
     'Text': 'a contingent liability incurred as a result of any loan guarantee provided by the government.'}],
   'Text': {'XRefExternal': 'Northwest Territories Act'}},
  {'Label': '(2)',
   'MarginalNote': 'Exclusions',
   'Paragraph': [{'Label': '(a)',
     'Text': 'an obligation, liability or contingent liability incurred as a result of any transaction referred to in subsection (1) between any two parties within the government; and'},
    {'Label': '(b)',
     'Text': {'XRefExternal': ['National 

In [6]:
#Iterate through all the regulations
files = [f for f in os.listdir('.') if os.path.isfile(f)]
counter = 0

for file in files:
    if (file != 'Regulation Exploration AW - V1.ipynb') & (file != 'Regulation Exploration AW - V2.ipynb') & (file != 'Regulation Exploration AW - V3 - PartialTest.ipynb') & (file != 'Regulation Exploration AW - V3 - Full.ipynb') & (file != 'Regulation Exploration AW - V4'):

        #Parse each xml file
        doc = etree.parse(file)
        doc_dict = make_dict_from_tree(doc.getroot())

        #Extracting all the Identification Headers
        try:
            InstrumentNumber = doc_dict['Regulation']['Identification']['InstrumentNumber']
        except: 
            InstrumentNumber = ""

        try:
            RegistrationDate_Year = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['YYYY']
            RegistrationDate_Month = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['MM']
            RegistrationDate_Day = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['DD']
            RegistrationDate = RegistrationDate_Month + "/" + RegistrationDate_Day + "/" + RegistrationDate_Year
        except: 
            RegistrationDate = ""

        try:
            ConsolidationDate_Year = doc_dict['Regulation']['Identification']['ConsolidationDate']['Date']['YYYY']
            ConsolidationDate_Month = doc_dict['Regulation']['Identification']['ConsolidationDate']['Date']['MM']
            ConsolidationDate_Day = doc_dict['Regulation']['Identification']['ConsolidationDate']['Date']['DD']
            ConsolidationDate = ConsolidationDate_Month + "/" + ConsolidationDate_Day + "/" + ConsolidationDate_Year
        except: 
            ConsolidationDate = ""

        try:
            LastModifiedDate_Year = doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']['YYYY']
            LastModifiedDate_Month = doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']['MM']
            LastModifiedDate_Day = doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']['DD']
            LastModifiedDate = LastModifiedDate_Month + "/" + LastModifiedDate_Day + "/" + LastModifiedDate_Year
        except:
            LastModifiedDate = ""

        try:
            EnablingAuthority = doc_dict['Regulation']['Identification']['EnablingAuthority'].values()
        except:
            EnablingAuthority = ""

        try:
            ShortTitle = doc_dict['Regulation']['Identification']['ShortTitle']
        except:
            ShortTitle = ""

        try:
            LongTitle = doc_dict['Regulation']['Identification']['LongTitle']
        except:
            LongTitle = ""

        try:
            RegulationMaker = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['RegulationMaker']
        except:
            RegulationMaker = ""

        try:
            RegulationOrderNumber = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['OrderNumber']
        except:
            RegulationOrderNumber = ""

        try:
            RegulationMakerOrderDate_Year = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']['YYYY']
            RegulationMakerOrderDate_Month = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']['MM']
            RegulationMakerOrderDate_Day = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']['DD']
            RegulationMakerOrderDate = RegulationMakerOrderDate_Month + "/" + RegulationMakerOrderDate_Day + "/" + RegulationMakerOrderDate_Year
        except:
            RegulationMakerOrderDate = ""

        #Order Section of the Regulation
        try:
            len_order = len(doc_dict['Regulation']['Order']['Provision'])
            order = str(doc_dict['Regulation']['Order'].values())
        except:
            order = ""

        #Body Section of the Regulation
        try:
            len_body = len(doc_dict['Regulation']['Body']['Section'])    
            body = str(doc_dict['Regulation']['Body'].values())
        except:
            body = ""

        #Schedule Section of the Regulation
        try:
            schedule = doc_dict['Regulation']['Schedule'].values()
        except:
            schedule = ""
     
        #Append to all of the lists
        list_InstrumentNumber.append(InstrumentNumber)
        list_RegistrationDate.append(RegistrationDate)
        list_ConsolidationDate.append(ConsolidationDate)
        list_LastModifiedDate.append(LastModifiedDate)
        list_EnablingAuthority.append(EnablingAuthority)
        list_ShortTitle.append(ShortTitle)
        list_LongTitle.append(LongTitle)
        list_RegulationMaker.append(RegulationMaker)
        list_RegulationOrderNumber.append(RegulationOrderNumber)
        list_RegulationMakerOrderDate.append(RegulationMakerOrderDate)
        list_len_order.append(len_order)
        list_order.append(order)
        list_len_body.append(len_body)
        list_body.append(body)

    else:
        pass

In [11]:
#Create final dataframe 
df = pd.DataFrame()
df["Instrumentation_Num"] = list_InstrumentNumber
df["Registration_Date"] = list_RegistrationDate
df["Consolidation_Date"] = list_ConsolidationDate
df["Last_Mod_Date"] = list_LastModifiedDate
df["Enabling_Authority"] = list_EnablingAuthority
df["Short_Title"] = list_ShortTitle
df["Long_Title"] = list_LongTitle
df["Reg_Maker"] = list_RegulationMaker
df["Reg_Order_Num"] = list_RegulationOrderNumber
df["Reg_Order_Maker_Date"] = list_RegulationMakerOrderDate
df["Length_Order"] = list_len_order
df["Order"] = list_order
df["Length_Body"] = list_len_body
df["Body"] = list_body

In [12]:
len(list_LastModifiedDate)

2628

In [13]:
len(list_body)

2628

In [14]:
#check the final output
df

Unnamed: 0,Instrumentation_Num,Registration_Date,Consolidation_Date,Last_Mod_Date,Enabling_Authority,Short_Title,Long_Title,Reg_Maker,Reg_Order_Num,Reg_Order_Maker_Date,Length_Order,Order,Length_Body,Body
0,SOR/2000-1,12/14/1999,06/21/2018,3/13/2008,"[[PUBLIC SERVICE SUPERANNUATION ACT, FINANCIAL...",,Certain Canada Port Authorities Divestiture Re...,T.B.,827750,12/9/1999,2,"[{'Footnote': {'Text': 'S.C. 1992, c. 46, s. 2...",11,[[{'Text': 'The definitions in this section ap...
1,SOR/2000-100,3/15/2000,06/21/2018,4/27/2006,[FEDERAL-PROVINCIAL FISCAL ARRANGEMENTS ACT],,Federal-Provincial Fiscal Arrangements Regulat...,P.C.,2000-317,3/15/2000,2,"[{'Footnote': [{'Text': 'S.C. 1999, c. 31, s. ...",31,"[[{'Text': {'Repealed': '[Repealed, SOR/2007-3..."
2,SOR/2000-107,3/23/2000,06/21/2018,6/20/2005,"[CANADIAN ENVIRONMENTAL PROTECTION ACT, 1999]",,Persistence and Bioaccumulation Regulations,P.C.,2000-348,3/23/2000,3,"[[{'Footnote': {'Text': 'S.C. 1999, c. 33', 'L...",6,[[{'Text': 'The definitions in this section ap...
3,SOR/2000-108,3/23/2000,06/21/2018,6/22/2005,"[CANADIAN ENVIRONMENTAL PROTECTION ACT, 1999]",,Export Control List Notification Regulations,P.C.,2000-349,3/23/2000,1,"[{'Text': {'Repealed': '[Repealed, SOR/2013-88...",5,"[[{'Text': {'Repealed': '[Repealed, SOR/2013-8..."
4,SOR/2000-111,3/23/2000,06/21/2018,8/18/2005,[AERONAUTICS ACT],,Canadian Aviation Security Regulations,P.C.,2000-364,3/23/2000,1,"[{'Text': {'Repealed': '[Repealed, SOR/2011-31...",119,"[[{'Text': {'Repealed': '[Repealed, SOR/2011-3..."
5,SOR/2000-112,3/23/2000,06/21/2018,8/18/2005,[AERONAUTICS ACT],,Designated Provisions Regulations,P.C.,2000-365,3/23/2000,2,"[{'Footnote': {'Text': 'S.C. 1992, c. 4, s. 19...",6,[[{'Text': 'The definitions in this section ap...
6,SOR/2000-113,3/23/2000,06/21/2018,9/22/2005,[BANK ACT],,Insider Reports Exemptions (Banks) Regulations,P.C.,2000-368,3/23/2000,2,,2,"[[{'Text': {'Repealed': '[Repealed, SOR/2006-3..."
7,SOR/2000-131,3/30/2000,06/21/2018,5/5/2006,[PUBLIC SERVICE LABOUR RELATIONS ACT],,Order Designating the Staff of the Non-Public ...,P.C.,2000-442,3/30/2000,2,"[{'Footnote': {'Text': 'S.C. 1999, c. 26, s. 1...",2,
8,SOR/2000-132,3/30/2000,06/21/2018,10/16/2008,[PILOTAGE ACT],,General Pilotage Regulations,P.C.,2000-444,3/30/ 2000,2,"[{'Footnote': {'Text': 'R.S., c. 31 (1st Supp....",29,[[{'Text': 'The definitions in this section ap...
9,SOR/2000-14,12/16/1999,06/21/2018,1/10/2006,[NATIONAL DEFENCE ACT],,Military Police Professional Code of Conduct,P.C.,1999-2213,12/16/1999,2,"[{'Footnote': {'Text': 'S.C. 1998, c. 35, s. 5...",9,[[{'Text': {'XRefExternal': 'National Defence ...


In [None]:
#Save the final output

In [15]:
df.to_excel("AI_Reg_Flat_Table_V1.xlsx")

# Testing for Body Bugs

In [45]:
doc = etree.parse("SOR-2000-300.xml")
doc_dict = make_dict_from_tree(doc.getroot())

In [46]:
doc_dict['Regulation']

{'Body': {'Heading': [{'TitleText': 'Interpretation'},
   {'TitleText': 'Samples'},
   {'TitleText': {'XRefExternal': ['Criminal Code', 'National Defence Act']}},
   {'TitleText': 'Written Consent'},
   {'TitleText': 'Removal of Access to Information'},
   {'TitleText': 'International Agreements'},
   {'TitleText': 'Coming into Force'}],
  'Section': [{'Definition': [{'Text': {'DefinedTermEn': 'Act',
       'DefinedTermFr': 'Loi',
       'XRefExternal': 'DNA Identification Act'}},
     {'Text': {'DefinedTermEn': 'DNA Data Bank',
       'DefinedTermFr': u'banque de donn\xe9es g\xe9n\xe9tiques'}}],
    'HistoricalNote': 'SOR/2018-42, s. 1.',
    'Label': '1',
    'Text': 'The definitions in this section apply in these Regulations.'},
   {'HistoricalNote': 'SOR/2002-451, s. 1; SOR/2006-225, s. 1(E); SOR/2008-139, s. 1(E); SOR/2018-42, s. 2.',
    'Label': '2',
    'Subsection': [{'Label': '(1)',
      'Paragraph': [{'Label': '(a)',
        'Text': 'a sample of the bodily substance that wa

In [38]:
import pandas as pd
import io
import xml.etree.ElementTree as ElementTree
import os

In [39]:
class XmlListConfig(list):
    def __init__(self, aList):
        for element in aList:
            if element:
                # treat like dict
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                # treat like list
                elif element[0].tag == element[1].tag:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    '''
    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is... a dict.
    '''
    def __init__(self, parent_element):
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            if element:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself 
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a 
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})

In [40]:
tree = ElementTree.parse('SOR-2000-300.xml')
root = tree.getroot()

In [41]:
root

<Element 'Regulation' at 0x122b6da0>

In [42]:
xmldict = XmlDictConfig(root)

In [43]:
xmldict

{'Body': {'Heading': {'TitleText': 'Coming into Force', 'level': '1'},
  'Section': {'Label': '6',
   'Text': 'These Regulations come into force on the day on which they are registered.',
   'type': 'transitional'}},
 'Identification': {'ConsolidationDate': {'Date': {'DD': '21',
    'MM': '06',
    'YYYY': '2018'}},
  'EnablingAuthority': {'XRefExternal': {'reference-type': 'act'}},
  'InstrumentNumber': 'SOR/2000-300',
  'LastModifiedDate': {'Date': {'DD': '28', 'MM': '11', 'YYYY': '2006'}},
  'LongTitle': 'DNA Identification Regulations',
  'RegistrationDate': {'Date': {'DD': '27', 'MM': '7', 'YYYY': '2000'}},
  'RegulationMakerOrder': {'Date': {'DD': '27', 'MM': '7', 'YYYY': ' 2000'},
   'OrderNumber': '2000-1109',
   'RegulationMaker': 'P.C.'}},
 'Order': {'Provision': {'Footnote': {'Label': 'a',
    'Text': 'S.C. 1998, c. 37',
    'id': 'fn_SOR-2000-300_e_hq_1298',
    'placement': 'page',
    'status': 'official'},
   'Text': {'FootnoteRef': {'idref': 'fn_SOR-2000-300_e_hq_1298'}

In [44]:
# More Body Testing

In [48]:
doc_dict['Regulation']['Body']

{'Heading': [{'TitleText': 'Interpretation'},
  {'TitleText': 'Samples'},
  {'TitleText': {'XRefExternal': ['Criminal Code', 'National Defence Act']}},
  {'TitleText': 'Written Consent'},
  {'TitleText': 'Removal of Access to Information'},
  {'TitleText': 'International Agreements'},
  {'TitleText': 'Coming into Force'}],
 'Section': [{'Definition': [{'Text': {'DefinedTermEn': 'Act',
      'DefinedTermFr': 'Loi',
      'XRefExternal': 'DNA Identification Act'}},
    {'Text': {'DefinedTermEn': 'DNA Data Bank',
      'DefinedTermFr': u'banque de donn\xe9es g\xe9n\xe9tiques'}}],
   'HistoricalNote': 'SOR/2018-42, s. 1.',
   'Label': '1',
   'Text': 'The definitions in this section apply in these Regulations.'},
  {'HistoricalNote': 'SOR/2002-451, s. 1; SOR/2006-225, s. 1(E); SOR/2008-139, s. 1(E); SOR/2018-42, s. 2.',
   'Label': '2',
   'Subsection': [{'Label': '(1)',
     'Paragraph': [{'Label': '(a)',
       'Text': 'a sample of the bodily substance that was collected with a DNA Data 