In [1]:
#Import the required packages
import pandas as pd
import xml.etree.ElementTree
from lxml import etree
import os
import io

#Hide Future Version warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
#Code to convert XML element tree to a dictionary
def make_dict_from_tree(element_tree):
    def internal_iter(tree, accum):
        if tree is None:
            return accum
        
        if tree.getchildren():
            accum[tree.tag] = {}
            for each in tree.getchildren():
                result = internal_iter(each, {})
                if each.tag in accum[tree.tag]:
                    if not isinstance(accum[tree.tag][each.tag], list):
                        accum[tree.tag][each.tag] = [
                            accum[tree.tag][each.tag]
                        ]
                    accum[tree.tag][each.tag].append(result[each.tag])
                else:
                    accum[tree.tag].update(result)
        else:
            accum[tree.tag] = tree.text
        return accum
    return internal_iter(element_tree, {})

In [5]:
#Create the empty lists to append to
list_InstrumentNumber = []
list_RegistrationDate = []
list_ConsolidationDate = []
list_LastModifiedDate = []
list_EnablingAuthority = []
list_ShortTitle = []
list_LongTitle = []
list_RegulationMaker = []
list_RegulationOrderNumber = []
list_RegulationMakerOrderDate = []
list_len_order = []
list_order = []
list_len_body = []
list_body = []

In [6]:
#Iterate through all the regulations
files = [f for f in os.listdir('.') if os.path.isfile(f)]
counter = 0

for file in files:
    if (file != 'Regulation Exploration AW - V1.ipynb') & (file != 'Regulation Exploration AW - V2.ipynb') & (file != 'Regulation Exploration AW - V3 - PartialTest.ipynb') & (file != 'Regulation Exploration AW - V3 - Full.ipynb'):

        #Parse each xml file
        doc = etree.parse(file)
        doc_dict = make_dict_from_tree(doc.getroot())

        #Extracting all the Identification Headers
        try:
            InstrumentNumber = doc_dict['Regulation']['Identification']['InstrumentNumber']
        except: 
            InstrumentNumber = ""

        try:
            RegistrationDate_Year = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['YYYY']
            RegistrationDate_Month = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['MM']
            RegistrationDate_Day = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['DD']
            RegistrationDate = RegistrationDate_Month + "/" + RegistrationDate_Day + "/" + RegistrationDate_Year
        except: 
            RegistrationDate = ""

        try:
            ConsolidationDate_Year = doc_dict['Regulation']['Identification']['ConsolidationDate']['Date']['YYYY']
            ConsolidationDate_Month = doc_dict['Regulation']['Identification']['ConsolidationDate']['Date']['MM']
            ConsolidationDate_Day = doc_dict['Regulation']['Identification']['ConsolidationDate']['Date']['DD']
            ConsolidationDate = ConsolidationDate_Month + "/" + ConsolidationDate_Day + "/" + ConsolidationDate_Year
        except: 
            ConsolidationDate = ""

        try:
            LastModifiedDate_Year = doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']['YYYY']
            LastModifiedDate_Month = doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']['MM']
            LastModifiedDate_Day = doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']['DD']
            LastModifiedDate = LastModifiedDate_Month + "/" + LastModifiedDate_Day + "/" + LastModifiedDate_Year
        except:
            LastModifiedDate = ""

        try:
            EnablingAuthority = doc_dict['Regulation']['Identification']['EnablingAuthority'].values()
        except:
            EnablingAuthority = ""

        try:
            ShortTitle = doc_dict['Regulation']['Identification']['ShortTitle']
        except:
            ShortTitle = ""

        try:
            LongTitle = doc_dict['Regulation']['Identification']['LongTitle']
        except:
            LongTitle = ""

        try:
            RegulationMaker = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['RegulationMaker']
        except:
            RegulationMaker = ""

        try:
            RegulationOrderNumber = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['OrderNumber']
        except:
            RegulationOrderNumber = ""

        try:
            RegulationMakerOrderDate_Year = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']['YYYY']
            RegulationMakerOrderDate_Month = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']['MM']
            RegulationMakerOrderDate_Day = doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']['DD']
            RegulationMakerOrderDate = RegulationMakerOrderDate_Month + "/" + RegulationMakerOrderDate_Day + "/" + RegulationMakerOrderDate_Year
        except:
            RegulationMakerOrderDate = ""

        #Order Section of the Regulation
        try:
            len_order = len(doc_dict['Regulation']['Order']['Provision'])
            order = str(doc_dict['Regulation']['Order'].values())
        except:
            order = ""

        #Body Section of the Regulation
        try:
            len_body = len(doc_dict['Regulation']['Body']['Section'])    
            body = str(doc_dict['Regulation']['Body'].values())
        except:
            body = ""

        #Schedule Section of the Regulation
        try:
            schedule = doc_dict['Regulation']['Schedule'].values()
        except:
            schedule = ""
     
        #Append to all of the lists
        list_InstrumentNumber.append(InstrumentNumber)
        list_RegistrationDate.append(RegistrationDate)
        list_ConsolidationDate.append(ConsolidationDate)
        list_LastModifiedDate.append(LastModifiedDate)
        list_EnablingAuthority.append(EnablingAuthority)
        list_ShortTitle.append(ShortTitle)
        list_LongTitle.append(LongTitle)
        list_RegulationMaker.append(RegulationMaker)
        list_RegulationOrderNumber.append(RegulationOrderNumber)
        list_RegulationMakerOrderDate.append(RegulationMakerOrderDate)
        list_len_order.append(len_order)
        list_order.append(order)
        list_len_body.append(len_body)
        list_body.append(body)

    else:
        pass

In [11]:
#Create final dataframe 
df = pd.DataFrame()
df["Instrumentation_Num"] = list_InstrumentNumber
df["Registration_Date"] = list_RegistrationDate
df["Consolidation_Date"] = list_ConsolidationDate
df["Last_Mod_Date"] = list_LastModifiedDate
df["Enabling_Authority"] = list_EnablingAuthority
df["Short_Title"] = list_ShortTitle
df["Long_Title"] = list_LongTitle
df["Reg_Maker"] = list_RegulationMaker
df["Reg_Order_Num"] = list_RegulationOrderNumber
df["Reg_Order_Maker_Date"] = list_RegulationMakerOrderDate
df["Length_Order"] = list_len_order
df["Order"] = list_order
df["Length_Body"] = list_len_body
df["Body"] = list_body

In [12]:
len(list_LastModifiedDate)

2628

In [13]:
len(list_body)

2628

In [14]:
#check the final output
df

Unnamed: 0,Instrumentation_Num,Registration_Date,Consolidation_Date,Last_Mod_Date,Enabling_Authority,Short_Title,Long_Title,Reg_Maker,Reg_Order_Num,Reg_Order_Maker_Date,Length_Order,Order,Length_Body,Body
0,SOR/2000-1,12/14/1999,06/21/2018,3/13/2008,"[[PUBLIC SERVICE SUPERANNUATION ACT, FINANCIAL...",,Certain Canada Port Authorities Divestiture Re...,T.B.,827750,12/9/1999,2,"[{'Footnote': {'Text': 'S.C. 1992, c. 46, s. 2...",11,[[{'Text': 'The definitions in this section ap...
1,SOR/2000-100,3/15/2000,06/21/2018,4/27/2006,[FEDERAL-PROVINCIAL FISCAL ARRANGEMENTS ACT],,Federal-Provincial Fiscal Arrangements Regulat...,P.C.,2000-317,3/15/2000,2,"[{'Footnote': [{'Text': 'S.C. 1999, c. 31, s. ...",31,"[[{'Text': {'Repealed': '[Repealed, SOR/2007-3..."
2,SOR/2000-107,3/23/2000,06/21/2018,6/20/2005,"[CANADIAN ENVIRONMENTAL PROTECTION ACT, 1999]",,Persistence and Bioaccumulation Regulations,P.C.,2000-348,3/23/2000,3,"[[{'Footnote': {'Text': 'S.C. 1999, c. 33', 'L...",6,[[{'Text': 'The definitions in this section ap...
3,SOR/2000-108,3/23/2000,06/21/2018,6/22/2005,"[CANADIAN ENVIRONMENTAL PROTECTION ACT, 1999]",,Export Control List Notification Regulations,P.C.,2000-349,3/23/2000,1,"[{'Text': {'Repealed': '[Repealed, SOR/2013-88...",5,"[[{'Text': {'Repealed': '[Repealed, SOR/2013-8..."
4,SOR/2000-111,3/23/2000,06/21/2018,8/18/2005,[AERONAUTICS ACT],,Canadian Aviation Security Regulations,P.C.,2000-364,3/23/2000,1,"[{'Text': {'Repealed': '[Repealed, SOR/2011-31...",119,"[[{'Text': {'Repealed': '[Repealed, SOR/2011-3..."
5,SOR/2000-112,3/23/2000,06/21/2018,8/18/2005,[AERONAUTICS ACT],,Designated Provisions Regulations,P.C.,2000-365,3/23/2000,2,"[{'Footnote': {'Text': 'S.C. 1992, c. 4, s. 19...",6,[[{'Text': 'The definitions in this section ap...
6,SOR/2000-113,3/23/2000,06/21/2018,9/22/2005,[BANK ACT],,Insider Reports Exemptions (Banks) Regulations,P.C.,2000-368,3/23/2000,2,,2,"[[{'Text': {'Repealed': '[Repealed, SOR/2006-3..."
7,SOR/2000-131,3/30/2000,06/21/2018,5/5/2006,[PUBLIC SERVICE LABOUR RELATIONS ACT],,Order Designating the Staff of the Non-Public ...,P.C.,2000-442,3/30/2000,2,"[{'Footnote': {'Text': 'S.C. 1999, c. 26, s. 1...",2,
8,SOR/2000-132,3/30/2000,06/21/2018,10/16/2008,[PILOTAGE ACT],,General Pilotage Regulations,P.C.,2000-444,3/30/ 2000,2,"[{'Footnote': {'Text': 'R.S., c. 31 (1st Supp....",29,[[{'Text': 'The definitions in this section ap...
9,SOR/2000-14,12/16/1999,06/21/2018,1/10/2006,[NATIONAL DEFENCE ACT],,Military Police Professional Code of Conduct,P.C.,1999-2213,12/16/1999,2,"[{'Footnote': {'Text': 'S.C. 1998, c. 35, s. 5...",9,[[{'Text': {'XRefExternal': 'National Defence ...


In [None]:
#Save the final output

In [15]:
df.to_excel("AI_Reg_Flat_Table_V1.xlsx")

# Testing how to fix dates

In [46]:
RegistrationDate_Dict = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']
RegistrationDate

{'DD': '11', 'MM': '2', 'YYYY': '1999'}

In [51]:
RegistrationDate_Year = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['YYYY']
RegistrationDate_Month = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['MM']
RegistrationDate_Day = doc_dict['Regulation']['Identification']['RegistrationDate']['Date']['DD']
RegistrationDate = RegistrationDate_Month + "/" + RegistrationDate_Day + "/" + RegistrationDate_Year

In [57]:
RegistrationDate = RegistrationDate_Month + "/" + RegistrationDate_Day + "/" + RegistrationDate_Year

In [58]:
type(RegistrationDate_Year)

str

In [9]:
RegistrationDate

'2/11/1999'

# Fixing Date Blanks...

In [14]:
doc = etree.parse('SOR-55-416.xml')
doc_dict = make_dict_from_tree(doc.getroot())

In [16]:
doc_dict['Regulation']['Identification']['LastModifiedDate']['Date']

{'DD': '10', 'MM': '6', 'YYYY': '2005'}

In [14]:
doc_dict['Regulation']['Identification']['RegulationMakerOrder']['Date']

{'DD': '11', 'MM': '2', 'YYYY': ' 1999'}

# Testing looping through all files

In [3]:
files = [f for f in os.listdir('.') if os.path.isfile(f)]

In [25]:
for file in files:
    if (file != 'Regulation Exploration AW - V1.ipynb') & (file != 'Regulation Exploration AW - V2.ipynb') & (file != 'Regulation Exploration AW - V3 - PartialTest.ipynb') & (file != 'Regulation Exploration AW - V3 - Full.ipynb'):
        print (file)

SOR-2000-1.xml
SOR-2000-100.xml
SOR-2000-107.xml
SOR-2000-108.xml
SOR-2000-111.xml
SOR-2000-112.xml
SOR-2000-113.xml
SOR-2000-131.xml
SOR-2000-132.xml
SOR-2000-14.xml
SOR-2000-141.xml
SOR-2000-142.xml
SOR-2000-143.xml
SOR-2000-177.xml
SOR-2000-181.xml
SOR-2000-187.xml
SOR-2000-202.xml
SOR-2000-203.xml
SOR-2000-204.xml
SOR-2000-205.xml
SOR-2000-206.xml
SOR-2000-207.xml
SOR-2000-208.xml
SOR-2000-209.xml
SOR-2000-210.xml
SOR-2000-211.xml
SOR-2000-212.xml
SOR-2000-214.xml
SOR-2000-217.xml
SOR-2000-228.xml
SOR-2000-230.xml
SOR-2000-233.xml
SOR-2000-253.xml
SOR-2000-260.xml
SOR-2000-265.xml
SOR-2000-272.xml
SOR-2000-273.xml
SOR-2000-277.xml
SOR-2000-283.xml
SOR-2000-294.xml
SOR-2000-300.xml
SOR-2000-303.xml
SOR-2000-306.xml
SOR-2000-324.xml
SOR-2000-375.xml
SOR-2000-376.xml
SOR-2000-387.xml
SOR-2000-418.xml
SOR-2000-42.xml
SOR-2000-43.xml
SOR-2000-52.xml
SOR-2000-53.xml
SOR-2000-54.xml
SOR-2000-55.xml
SOR-2000-60.xml
SOR-2000-66.xml
SOR-2000-69.xml
SOR-2000-7.xml
SOR-2000-70.xml
SOR-2000-71.

SOR-2014-290.xml
SOR-2014-291.xml
SOR-2014-292.xml
SOR-2014-293.xml
SOR-2014-299.xml
SOR-2014-300.xml
SOR-2014-301.xml
SOR-2014-304.xml
SOR-2014-321.xml
SOR-2014-37.xml
SOR-2014-38.xml
SOR-2014-44.xml
SOR-2014-55.xml
SOR-2014-56.xml
SOR-2014-58.xml
SOR-2014-59.xml
SOR-2014-60.xml
SOR-2014-61.xml
SOR-2014-68.xml
SOR-2014-69.xml
SOR-2014-70.xml
SOR-2014-91.xml
SOR-2014-94.xml
SOR-2014-95.xml
SOR-2015-1.xml
SOR-2015-103.xml
SOR-2015-105.xml
SOR-2015-114.xml
SOR-2015-12.xml
SOR-2015-121.xml
SOR-2015-124.xml
SOR-2015-13.xml
SOR-2015-130.xml
SOR-2015-131.xml
SOR-2015-14.xml
SOR-2015-145.xml
SOR-2015-15.xml
SOR-2015-16.xml
SOR-2015-164.xml
SOR-2015-165.xml
SOR-2015-167.xml
SOR-2015-168.xml
SOR-2015-17.xml
SOR-2015-177.xml
SOR-2015-181.xml
SOR-2015-2.xml
SOR-2015-200.xml
SOR-2015-202.xml
SOR-2015-203.xml
SOR-2015-240.xml
SOR-2015-241.xml
SOR-2015-26.xml
SOR-2015-3.xml
SOR-2015-36.xml
SOR-2015-4.xml
SOR-2015-44.xml
SOR-2015-5.xml
SOR-2015-56.xml
SOR-2015-6.xml
SOR-2015-68.xml
SOR-2015-80.xml
SO