In [1]:
'''
The purpose of this code is to generate the first part of the flat table: 'Identification', 'Order', and 'Schedule'
XML tag extraction. In addition to 'Body', these are the primary XML tags

Note that the body will be extracted in part II as it requires the 'label' to be preserved with each phrase
'''

"\nThe purpose of this code is to generate the first part of the flat table: 'Identification', 'Order', and 'Scheduale'\nXML tag extraction. In addition to 'Body', these are the primary XML tags\n\nNote that the body will be extracted in part II as it requires the 'label' to be preserved with each phrase\n"

In [2]:
#Import the required packages
import pandas as pd
import xml.etree.ElementTree
from lxml import etree
import os
import io

#Hide Future Version warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#Code to convert XML element tree to a dictionary
def make_dict_from_tree(element_tree):
    """Convert an XML element and its descendants into nested dictionaries.

    Every element becomes ``{tag: value}`` where ``value`` is

    * the element's ``.text`` when it has no child elements, or
    * a dict of its children keyed by tag otherwise. Children that share a
      tag are collected into a list, in document order.

    Limitations: attributes are ignored, and for an element that has
    children its own text (and the tail text following each child) is not
    recorded, so mixed-content wording is lost.

    Parameters
    ----------
    element_tree : Element or None
        Root element to convert (lxml or xml.etree.ElementTree).

    Returns
    -------
    dict
        ``{root_tag: ...}``, or ``{}`` when ``element_tree`` is None.
    """
    def internal_iter(tree, accum):
        if tree is None:
            return accum

        # Iterating an element yields its direct children. This replaces
        # getchildren(), which is deprecated in lxml and was removed from
        # xml.etree.ElementTree in Python 3.9.
        children = list(tree)
        if children:
            accum[tree.tag] = {}
            for each in children:
                result = internal_iter(each, {})
                if each.tag in accum[tree.tag]:
                    # Repeated tag: promote the existing entry to a list once,
                    # then keep appending to it.
                    if not isinstance(accum[tree.tag][each.tag], list):
                        accum[tree.tag][each.tag] = [
                            accum[tree.tag][each.tag]
                        ]
                    accum[tree.tag][each.tag].append(result[each.tag])
                else:
                    accum[tree.tag].update(result)
        else:
            # Leaf element: keep only its text (None for an empty element).
            accum[tree.tag] = tree.text
        return accum
    return internal_iter(element_tree, {})

## Extracting the Instrument Tags & Bulk Schedule Info

In [4]:
#One independent, initially empty accumulator list per output column;
#the extraction loop appends one value per regulation file to each of them
(
    list_InstrumentNumber,
    list_RegistrationDate,
    list_ConsolidationDate,
    list_LastModifiedDate,
    list_EnablingAuthority,
    list_ShortTitle,
    list_LongTitle,
    list_RegulationMaker,
    list_RegulationOrderNumber,
    list_RegulationMakerOrderDate,
    list_Schedule,
) = ([] for _ in range(11))

In [5]:
#Obtain the list of regulations only from the working directory
files = [f for f in os.listdir('.') if os.path.isfile(f)]

# Match the real '.xml' extension (a bare 'xml' suffix would also accept names
# such as 'fooxml'), and sort because os.listdir returns entries in arbitrary
# order -- sorting keeps the row order of the flat table reproducible.
list_regs = sorted(file for file in files if file.endswith('.xml'))

In [6]:
# Sanity check: number of regulation XML files found in the working directory
len(list_regs)

3199

In [7]:
def _dig(mapping, *keys):
    """Follow ``keys`` through nested dicts; return "" if the path cannot be followed."""
    node = mapping
    for key in keys:
        try:
            node = node[key]
        except (KeyError, TypeError):
            # KeyError: the tag is absent. TypeError: an intermediate value is
            # not a dict (text, None, or a list of repeated tags).
            return ""
    return node


def _dig_date(mapping, *keys):
    """Return the Date element found at ``keys`` as 'MM/DD/YYYY', or "" if it is missing or incomplete."""
    date = _dig(mapping, *keys)
    try:
        return date['MM'] + "/" + date['DD'] + "/" + date['YYYY']
    except (KeyError, TypeError):
        # A part is missing, empty (None) or not plain text.
        return ""


def _dig_values(mapping, *keys):
    """Return the child values of the element at ``keys`` as a list, or "" if it is not a dict."""
    node = _dig(mapping, *keys)
    # list(...) materialises the values so the column does not hold a live
    # dict view of the parsed document.
    return list(node.values()) if isinstance(node, dict) else ""


# Start from empty columns so that re-running this cell does not append a
# second copy of every row.
for _column in (
    list_InstrumentNumber, list_RegistrationDate, list_ConsolidationDate,
    list_LastModifiedDate, list_EnablingAuthority, list_ShortTitle,
    list_LongTitle, list_RegulationMaker, list_RegulationOrderNumber,
    list_RegulationMakerOrderDate, list_Schedule,
):
    del _column[:]

for file in list_regs:
    #Parse each xml file
    doc = etree.parse(file)
    doc_dict = make_dict_from_tree(doc.getroot())

    #Extracting all the Identification Headers ("" whenever a tag is missing)
    ident = _dig(doc_dict, 'Regulation', 'Identification')

    list_InstrumentNumber.append(_dig(ident, 'InstrumentNumber'))
    list_RegistrationDate.append(_dig_date(ident, 'RegistrationDate', 'Date'))
    list_ConsolidationDate.append(_dig_date(ident, 'ConsolidationDate', 'Date'))
    list_LastModifiedDate.append(_dig_date(ident, 'LastModifiedDate', 'Date'))
    list_EnablingAuthority.append(_dig_values(ident, 'EnablingAuthority'))
    list_ShortTitle.append(_dig(ident, 'ShortTitle'))
    list_LongTitle.append(_dig(ident, 'LongTitle'))
    list_RegulationMaker.append(_dig(ident, 'RegulationMakerOrder', 'RegulationMaker'))
    list_RegulationOrderNumber.append(_dig(ident, 'RegulationMakerOrder', 'OrderNumber'))
    list_RegulationMakerOrderDate.append(_dig_date(ident, 'RegulationMakerOrder', 'Date'))

    #Schedule Section of the Regulation
    # NOTE(review): when a file has several <Schedule> elements the parsed
    # value is a list, not a dict, and "" is stored (same as before) --
    # confirm whether those schedules should be captured as well.
    list_Schedule.append(_dig_values(doc_dict, 'Regulation', 'Schedule'))


In [8]:
#Create final dataframe
# (column header, collected values) pairs, listed in output column order
flat_table_columns = [
    ("Instrumentation_Num", list_InstrumentNumber),
    ("Registration_Date", list_RegistrationDate),
    ("Consolidation_Date", list_ConsolidationDate),
    ("Last_Mod_Date", list_LastModifiedDate),
    ("Enabling_Authority", list_EnablingAuthority),
    ("Short_Title", list_ShortTitle),
    ("Long_Title", list_LongTitle),
    ("Reg_Maker", list_RegulationMaker),
    ("Reg_Order_Num", list_RegulationOrderNumber),
    ("Reg_Order_Maker_Date", list_RegulationMakerOrderDate),
    ("Schedule", list_Schedule),
]

# Add the columns one at a time to an initially empty frame
df = pd.DataFrame()
for header, values in flat_table_columns:
    df[header] = values

In [9]:
# Sanity check: the extraction loop appends one entry per file in list_regs,
# so this should equal len(list_regs)
len(list_LastModifiedDate)

3199

## Extracting the Order Info

In [10]:
#Note: extracting the 'Order' tags is not as straightforward as 'Identification'.
#make_dict_from_tree only stores .text for elements that have no children, so the
#wording of an element that also contains child tags is not captured.
#Load a single regulation to demonstrate the problem.
file = 'SOR-2001-377.xml'
doc = etree.parse(file)
doc_dict = make_dict_from_tree(doc.getroot())

In [11]:
#Note that the following doesn't give the same output as when you directly open the reg:
#only the child tags of <Text> (FootnoteRef, XRefExternal) appear, the sentence text itself is missing
doc_dict['Regulation']['Order']

{'Provision': {'Footnote': [{'Label': 'a', 'Text': 'S.C. 2001, c. 9, s. 183'},
   {'Label': 'b', 'Text': 'S.C. 1991, c. 46'}],
  'Text': {'FootnoteRef': ['a', 'b'],
   'XRefExternal': ['Bank Act',
    'Equity of a Bank or a Bank Holding Company Regulations']}}}

In [12]:
# Number of top-level keys under 'Order' in the parsed dict
len(doc_dict['Regulation']['Order'])

1

##### Order Extraction Test

In [13]:
#Method 2: serialize the entire XML document to a string, then keep everything between the 'Order' tags
import xml.etree.ElementTree as ET
tree = ET.parse(file)
root = tree.getroot()
xmlstr = ET.tostring(root, encoding='utf8', method='xml')
# On Python 3, tostring() with an explicit encoding returns bytes. Decode to
# text so the result can be searched with ordinary str markers such as '<Order>'.
if isinstance(xmlstr, bytes):
    xmlstr = xmlstr.decode('utf8')

In [14]:
#The order text is preserved using this method: the serialized XML keeps the
#text that sits between child tags, which the dict conversion does not record
xmlstr

'<?xml version=\'1.0\' encoding=\'utf8\'?>\n<Regulation regulation-type="SOR" xml:lang="en"><Identification><InstrumentNumber>SOR/2001-377</InstrumentNumber><RegistrationDate><Date><YYYY>2001</YYYY><MM>10</MM><DD>4</DD></Date></RegistrationDate><ConsolidationDate><Date><YYYY>2018</YYYY><MM>06</MM><DD>21</DD></Date></ConsolidationDate><LastModifiedDate><Date><YYYY>2005</YYYY><MM>9</MM><DD>22</DD></Date></LastModifiedDate><EnablingAuthority><XRefExternal reference-type="act">BANK ACT</XRefExternal></EnablingAuthority><LongTitle>Equity of a Bank or a Bank Holding Company Regulations</LongTitle><RegulationMakerOrder><RegulationMaker>P.C.</RegulationMaker><OrderNumber>2001-1748</OrderNumber><Date><YYYY>2001</YYYY><MM>10</MM><DD>4</DD></Date></RegulationMakerOrder></Identification><Order><Provision format-ref="indent-0-0" language-align="yes" list-item="no"><Text>Her Excellency the Governor General in Council, on the recommendation of the Minister of Finance, pursuant to section 978<Footnote

In [15]:
def find_between( s, first, last ):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    The search uses the first occurrence of ``first`` and the first
    occurrence of ``last`` after it. Neither delimiter is included in the
    result.

    Parameters
    ----------
    s : str or bytes
        Text to search. A bytes value combined with text delimiters is
        decoded as UTF-8 first, so the result is text in that case.
    first, last : str
        Opening and closing delimiters.

    Returns
    -------
    str
        The enclosed text, or "" when either delimiter is not found.
    """
    # ET.tostring(..., encoding='utf8') yields bytes on Python 3, and searching
    # bytes for a str delimiter raises TypeError (which the ValueError handler
    # below would not catch). Decode so both sides are text.
    if isinstance(s, bytes) and not isinstance(first, bytes):
        s = s.decode('utf-8')
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""

In [16]:
# Keep only what lies between the <Order> and </Order> tags of the serialized document
find_between(xmlstr, '<Order>', '</Order>')

'<Provision format-ref="indent-0-0" language-align="yes" list-item="no"><Text>Her Excellency the Governor General in Council, on the recommendation of the Minister of Finance, pursuant to section 978<FootnoteRef idref="footnotea_e">a</FootnoteRef> of the <XRefExternal reference-type="act">Bank Act</XRefExternal><FootnoteRef idref="footnoteb_e">b</FootnoteRef>, hereby makes the annexed <XRefExternal reference-type="regulation">Equity of a Bank or a Bank Holding Company Regulations</XRefExternal>.</Text><Footnote id="footnotea_e" placement="page" status="official"><Label>a</Label><Text>S.C. 2001, c. 9, s. 183</Text></Footnote><Footnote id="footnoteb_e" placement="page" status="official"><Label>b</Label><Text>S.C. 1991, c. 46</Text></Footnote></Provision>'

##### Extracting the Order Section from all Regulations

In [17]:
import xml.etree.ElementTree as ET

In [20]:
list_order = []
order_failures = []  # (file name, exception) for every file that could not be processed

for file in list_regs:
    try:
        tree = ET.parse(file)
        root = tree.getroot()
        xmlstr = ET.tostring(root, encoding='utf8', method='xml')
        # tostring() with an explicit encoding returns bytes on Python 3;
        # decode so the str markers below can be found in it.
        if isinstance(xmlstr, bytes):
            xmlstr = xmlstr.decode('utf8')
        order = find_between(xmlstr, '<Order>', '</Order>')
    except Exception as exc:
        # Best effort: keep one row per file so the column stays aligned with
        # df, but remember the failure instead of hiding it.
        order_failures.append((file, exc))
        order = ""
    list_order.append(order)

# Report failures once rather than silently writing empty Order cells
if order_failures:
    print("Order extraction failed for %d of %d files; first failure: %s (%r)"
          % (len(order_failures), len(list_regs),
             order_failures[0][0], order_failures[0][1]))

df["Order"] = list_order

# Export to Excel

In [25]:
#df.to_excel("AI_Reg_Flat_Table_V3_Part1.xlsx")

In [24]:
# Write the flat table to an Excel workbook. The context manager saves and
# closes the file on exit; the explicit ExcelWriter.save() call was removed
# in pandas 2.0.
with pd.ExcelWriter('AI_Reg_Flat_Table_V3_Part1.xlsx', engine='openpyxl') as writer:
    df.to_excel(writer)