In [33]:
import pandas as pd
import xml.etree.ElementTree
from lxml import etree

#Hide Future Version warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [34]:
#Code to convert XML element tree to a dictionary
def make_dict_from_tree(element_tree):
    """Convert an XML element and all of its descendants into nested dicts.

    The result is keyed by tag name:
      * an element without children maps its tag to its text (None when the
        element has no text);
      * an element with children maps its tag to a dict of its children;
        the element's own text is not kept in that case;
      * when a tag occurs more than once under the same parent, the values
        are collected into a list in document order.
    Attributes are not read.

    element_tree: the element to convert (e.g. the root element), or None.
    Returns a dict {tag: converted content}; {} when element_tree is None.
    """
    def internal_iter(tree, accum):
        if tree is None:
            return accum

        # Iterating an element yields its direct children. This replaces
        # Element.getchildren(), which is deprecated in lxml and no longer
        # exists in xml.etree.ElementTree (removed in Python 3.9).
        children = list(tree)
        if children:
            node = accum[tree.tag] = {}
            for each in children:
                result = internal_iter(each, {})
                if each.tag in node:
                    # Second and later occurrence of the same tag: promote the
                    # stored value to a list once, then keep appending.
                    if not isinstance(node[each.tag], list):
                        node[each.tag] = [node[each.tag]]
                    node[each.tag].append(result[each.tag])
                else:
                    node.update(result)
        else:
            accum[tree.tag] = tree.text
        return accum
    return internal_iter(element_tree, {})

In [35]:
#Regulation XML files to run the extraction on (paths relative to the notebook)
list_regulations = [
    "SOR-49-42.xml",
    "SOR-80-394.xml",
    "SOR-2018-46.xml",
]

In [36]:
#Helpers for pulling fields out of the nested dictionary of one regulation
def _lookup(tree, path, default=""):
    """Walk `path` (a sequence of keys) down nested dicts.

    Returns `default` when a key is missing or when an intermediate value
    cannot be indexed by key (text, None, or a list of repeated elements).
    """
    node = tree
    for key in path:
        try:
            node = node[key]
        except (KeyError, TypeError):
            return default
    return node


def _values_or_blank(node):
    """Return the values of a dict node as a list, or "" for anything else."""
    if isinstance(node, dict):
        # list(...) so the result is a real list on Python 3 as well, where
        # dict.values() is only a view object.
        return list(node.values())
    return ""


def _section_summary(doc_dict, section_tag, item_tag):
    """Return (number of `item_tag` entries, text of the section's values).

    make_dict_from_tree stores a repeated tag as a list and a single
    occurrence as a plain dict or text, so a non-list entry counts as one
    item rather than taking len() of its keys or characters.
    Returns (0, "") when the section or its `item_tag` entry is absent.
    """
    section = _lookup(doc_dict, ['Regulation', section_tag], default=None)
    items = _lookup(section, [item_tag], default=None)
    if items is None:
        return 0, ""
    count = len(items) if isinstance(items, list) else 1
    return count, str(list(section.values()))


#Create the empty lists to append to
list_InstrumentNumber = []
list_RegistrationDate = []
list_ConsolidationDate = []
list_LastModificationDate = []
list_EnablingAuthority = []
list_ShortTitle = []
list_LongTitle = []
list_RegulationMaker = []
list_RegulationOrderNumber = []
list_RegulationMakerDate = []
list_len_order = []
list_order = []
list_len_body = []
list_body = []

#Iterate through all the regulations
for regulation_file in list_regulations:
    #Parse each xml file
    doc = etree.parse(regulation_file)
    doc_dict = make_dict_from_tree(doc.getroot())

    #Extracting all the Identification Headers ("" when a header is missing)
    identification = _lookup(doc_dict, ['Regulation', 'Identification'], default={})
    maker_order = _lookup(identification, ['RegulationMakerOrder'], default={})

    InstrumentNumber = _lookup(identification, ['InstrumentNumber'])
    RegistrationDate = _lookup(identification, ['RegistrationDate', 'Date'])
    #A missing consolidation date now blanks ConsolidationDate itself; the
    #fallback used to overwrite RegistrationDate instead.
    ConsolidationDate = _lookup(identification, ['ConsolidationDate', 'Date'])
    LastModificationDate = _lookup(identification, ['LastModifiedDate', 'Date'])
    EnablingAuthority = _values_or_blank(
        _lookup(identification, ['EnablingAuthority'], default=None)
    )
    ShortTitle = _lookup(identification, ['ShortTitle'])
    LongTitle = _lookup(identification, ['LongTitle'])
    RegulationMaker = _lookup(maker_order, ['RegulationMaker'])
    RegulationOrderNumber = _lookup(maker_order, ['OrderNumber'])
    RegulationMakerDate = _lookup(maker_order, ['Date'])

    #Order Section of the Regulation
    #The count is reset to 0 when the section is missing, so a value from the
    #previous file is never carried over.
    len_order, order = _section_summary(doc_dict, 'Order', 'Provision')

    #Body Section of the Regulation
    len_body, body = _section_summary(doc_dict, 'Body', 'Section')

    #Schedule Section of the Regulation (extracted, but not stored in any of
    #the lists below)
    schedule = _values_or_blank(
        _lookup(doc_dict, ['Regulation', 'Schedule'], default=None)
    )

    #Append to all of the lists
    list_InstrumentNumber.append(InstrumentNumber)
    list_RegistrationDate.append(RegistrationDate)
    list_ConsolidationDate.append(ConsolidationDate)
    list_LastModificationDate.append(LastModificationDate)
    list_EnablingAuthority.append(EnablingAuthority)
    list_ShortTitle.append(ShortTitle)
    list_LongTitle.append(LongTitle)
    list_RegulationMaker.append(RegulationMaker)
    list_RegulationOrderNumber.append(RegulationOrderNumber)
    list_RegulationMakerDate.append(RegulationMakerDate)
    list_len_order.append(len_order)
    list_order.append(order)
    list_len_body.append(len_body)
    list_body.append(body)

In [48]:
#Assemble the final dataframe, one column per extracted field
#(pairs are listed in the order the columns should appear)
column_sources = [
    ("Instrumentation_Num", list_InstrumentNumber),
    ("Registration_Date", list_RegistrationDate),
    ("Consolidation_Date", list_ConsolidationDate),
    ("Last_Mod_Date", list_LastModificationDate),
    ("Enabling_Authority", list_EnablingAuthority),
    ("Short_Title", list_ShortTitle),
    ("Long_Title", list_LongTitle),
    ("Reg_Maker", list_RegulationMaker),
    ("Reg_Order_Numb", list_RegulationOrderNumber),
    ("Reg_Maker_Date", list_RegulationMakerDate),
    ("Length_Order", list_len_order),
    ("Order", list_order),
    ("Length_Body", list_len_body),
    ("Body", list_body),
]

df = pd.DataFrame()
for column_name, column_values in column_sources:
    df[column_name] = column_values

In [49]:
#Display the dataframe as the cell output
df

Unnamed: 0,Instrumentation_Num,Registration_Date,Consolidation_Date,Last_Mod_Date,Enabling_Authority,Short_Title,Long_Title,Reg_Maker,Reg_Order_Numb,Reg_Maker_Date,Length_Order,Order,Length_Body,Body
0,SOR/49-42,,"{u'YYYY': u'2018', u'MM': u'06', u'DD': u'21'}","{u'YYYY': u'2006', u'MM': u'10', u'DD': u'26'}","[[APPROPRIATION ACT, NO. 7, 1949, APPROPRIATIO...",Regulations re school fees and transportation ...,Regulations Governing Payment of School Fees a...,P.C.,1954-1694,"{u'YYYY': u'1954', u'MM': u'11', u'DD': u'09'}",3,[[{'Text': 'His Excellency the Governor Genera...,7,"[[{'Text': 'In these regulations,', 'Definitio..."
1,SOR/80-394,"{u'YYYY': u'1980', u'MM': u'5', u'DD': u'27'}","{u'YYYY': u'2018', u'MM': u'06', u'DD': u'21'}","{u'YYYY': u'2016', u'MM': u'11', u'DD': u'28'}",[CANADIAN HUMAN RIGHTS ACT],Human Rights Tribunal Appeal Regulations,Regulations Respecting the Manner and Form for...,P.C.,1980-1411,"{u'YYYY': u'1980', u'MM': u'5', u'DD': u'27'}",1,[{'Text': {'XRefExternal': ['Canadian Human Ri...,6,[[{'Text': {'XRefExternal': 'Human Rights Trib...
2,SOR/2018-46,"{u'YYYY': u'2018', u'MM': u'3', u'DD': u'20'}","{u'YYYY': u'2018', u'MM': u'06', u'DD': u'21'}","{u'YYYY': u'2018', u'MM': u'4', u'DD': u'4'}",[CRIMINAL CODE],,Order Declaring an Amnesty Period (2018),P.C.,2018-301,"{u'YYYY': u'2018', u'MM': u'3', u'DD': u'20'}",1,"[[{'Text': 'S.C. 1995, c. 39, s. 139', 'Label'...",3,"[[{'Text': {'DefinedTermEn': 'firearm'}, 'Para..."
