# Document metadata

In [1]:
# Identification of the regulation being converted. The date parts and the
# regulation number are combined further down into XML ids and Lovdata URLs.
metadata_year, metadata_month, metadata_day = "2013", "11", "22"
metadata_regulation_number = "1404"
metadata_regulation_title = "Regulations of 22 November 2013 No. 1404 on fishing vessels of less than 15 meters in overall length"

# Initialize

In [2]:
!pip install nltk

You should consider upgrading via the 'c:\users\lhustvei\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [3]:
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit.

    An empty string contains no digit and therefore yields False.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False

hasNumbers('4444')

True

In [4]:
#
# Load the text we are working with: one list entry per line of the input
# file, each entry still carrying its trailing newline.
#

with open('./input - Regulation - 1404-2013-11-22 - Text from word document with correct formating.txt', 'r', encoding='utf-8') as source_file:
    forward_file_lines = list(source_file)

print(forward_file_lines)



# Pull out info from chapter text

In [17]:
from nltk.tokenize import word_tokenize

# Classify every input line by its leading tokens. Exactly one tuple
# (line index as string, sentence type, detected value) is appended per line,
# so this list stays aligned index-for-index with forward_file_lines.
forward_identified_sentence_type_list = []

for line_index_number, line_text in enumerate(forward_file_lines):

    tokens = word_tokenize(line_text, language="english")
    token_count = len(tokens)

    # Default classification when no structural marker is recognised.
    sentence_type, type_value = 'none', 'none'

    if token_count > 3 and tokens[0] == '(' and hasNumbers(tokens[1]) and tokens[2] == ')':
        # "(1) ..." -> numbered part; the value is the part number
        sentence_type, type_value = 'part', tokens[1]

    elif token_count > 3 and tokens[0] == 'Section' and hasNumbers(tokens[1]) and tokens[2] == '.':
        # "Section 3. ..." -> section headline; the value is the section number
        sentence_type, type_value = 'headline-section', tokens[1]

    elif token_count > 2 and tokens[1] == ')':
        # "a) ..." -> sub-part; the value is the token in front of ")"
        sentence_type, type_value = 'sub-part', tokens[0]

    elif token_count > 2 and hasNumbers(tokens[0]) and tokens[1] == "." and not line_text[0].isspace():
        # "2. ..." starting in the first column -> chapter headline
        sentence_type, type_value = 'headline-chapter', tokens[0]

    forward_identified_sentence_type_list.append((str(line_index_number), sentence_type, type_value))

#print(forward_identified_sentence_type_list)

# Create XML content

In [18]:
from xml.etree import ElementTree
from xml.dom import minidom

# xml.etree.cElementTree was removed in Python 3.9. Importing it
# unconditionally raises ImportError there, so fall back to the regular
# ElementTree module (which cElementTree was an alias of) to keep the name
# available on every Python version.
try:
    from xml.etree import cElementTree
except ImportError:
    cElementTree = ElementTree

# "<year>-<month>-<day>-<number>" suffix shared by the document id and URL.
regulation_key = "-".join(
    [metadata_year, metadata_month, metadata_day, metadata_regulation_number]
)

# Root <dokument> element carrying the id, type and url attributes.
root = ElementTree.Element('dokument')

new_id = "for-" + regulation_key
root.set("id", new_id)

root.set("type", "for")

new_url = "https://lovdata.no/forskrift/" + regulation_key
root.set("url", new_url)

# Empty <metadata> placeholder, followed by the <tekst> container that
# receives the regulation title and all content elements.
ElementTree.SubElement(root, 'metadata')

element_text = ElementTree.SubElement(root, 'tekst')

element_regulation_title = ElementTree.SubElement(element_text, 'tittel')
element_regulation_title.text = metadata_regulation_title

In [19]:
import re

# Build the XML body under <tekst>: kapittel (chapter) -> paragraf (section)
# -> ledd (part) -> liste (sub-part), driven by the per-line classification
# stored in forward_identified_sentence_type_list.

chapter_number = "undetected"
lovdata_chapter_url = "undetected"

section_number = "undetected"
part_number = "undetected"
sub_part_id = "undetected"

# Every "current parent" starts out as the <tekst> element. A line that shows
# up before its expected parent has been seen is then attached there instead
# of making SubElement raise TypeError on a placeholder string.
element_kapittel = element_text
element_section = element_text
element_part = element_text
element_sub_part = ""

# Footnote lines start with digits followed by a tab. Raw string avoids the
# invalid "\d" escape warning; compiled once because it is loop-invariant.
footnote_pattern = re.compile(r"^\d+\t")

regulation_url = (
    "https://lovdata.no/forskrift/"
    + metadata_year + "-" + metadata_month + "-" + metadata_day
    + "-" + metadata_regulation_number
)

for line_index_number, line_text in enumerate(forward_file_lines):

    # get metadata about sentence
    identified_sentence_type_as_tuple = forward_identified_sentence_type_list[line_index_number]
    identified_sentence_type = identified_sentence_type_as_tuple[1]
    identified_sentence_value = identified_sentence_type_as_tuple[2]

    # Element text never includes the line's newline characters.
    clean_text = line_text.replace("\n", "")

    if identified_sentence_type == "headline-chapter":
        chapter_number = identified_sentence_value

        element_kapittel = ElementTree.SubElement(element_text, 'kapittel', type="kapittel")

        attribute_id = "/kapittel/" + chapter_number
        element_kapittel.set('id', attribute_id)

        lovdata_chapter_url = regulation_url + "/k" + chapter_number
        element_kapittel.set('url', lovdata_chapter_url)

        element_ktittel = ElementTree.SubElement(element_kapittel, "ktittel")
        element_ktittel.text = clean_text

        # A new chapter ends the previous section and part; without this
        # reset, text following the chapter headline would be attached to
        # the last section of the previous chapter.
        section_number = "undetected"
        part_number = "undetected"
        element_section = element_kapittel
        element_part = element_kapittel

    elif identified_sentence_type == "headline-section":
        section_number = identified_sentence_value

        element_section = ElementTree.SubElement(element_kapittel, "paragraf")

        new_id = f"/kapittel/{chapter_number}/paragraf/{section_number}"
        element_section.set("id", new_id)

        lovdata_section_url = lovdata_chapter_url + "/p" + section_number
        element_section.set('url', lovdata_section_url)

        element_ptittel = ElementTree.SubElement(element_section, "ptittel")
        element_ptittel.text = clean_text

        # A new section ends the previous part, so a sub-part that directly
        # follows the headline is not attached to the previous section's part.
        part_number = "undetected"
        element_part = element_section

    elif identified_sentence_type == "part":
        part_number = identified_sentence_value

        element_part = ElementTree.SubElement(element_section, "ledd")
        element_part.text = clean_text

        new_id = f"/kapittel/{chapter_number}/paragraf/{section_number}/ledd/{part_number}"
        element_part.set("id", new_id)

    elif identified_sentence_type == "sub-part":
        sub_part_id = identified_sentence_value

        element_sub_part = ElementTree.SubElement(element_part, "liste")

        new_id = (
            f"/kapittel/{chapter_number}/paragraf/{section_number}"
            f"/ledd/{part_number}/liste/{sub_part_id}"
        )
        element_sub_part.set("id", new_id)

        element_sub_part_lverdi = ElementTree.SubElement(element_sub_part, "lverdi")
        element_sub_part_lverdi.text = identified_sentence_value

        element_sub_part_avsnitt = ElementTree.SubElement(element_sub_part, "avsnitt")
        element_sub_part_avsnitt.text = clean_text

        new_id = new_id + "/avsnitt/1"
        element_sub_part_avsnitt.set("id", new_id)

    elif identified_sentence_type == "none" and line_text.strip():
        # Blank and whitespace-only lines are skipped; they carry no content.

        if footnote_pattern.match(line_text):
            element_footnote = ElementTree.SubElement(element_section, "fotnote") # Norwegian name on the tag so fotnote is correct spelling
            element_footnote.text = clean_text

        else:
            # Unnumbered paragraph of text inside the current section.
            # NOTE: the id has no running number, so several unnumbered
            # <ledd> elements in one section share the same id.
            element_part = ElementTree.SubElement(element_section, "ledd")
            element_part.text = clean_text

            new_id = f"/kapittel/{chapter_number}/paragraf/{section_number}/ledd"
            element_part.set("id", new_id)

# Create tree to XML file

In [20]:
# Wrap the root in an ElementTree instance. The plain ElementTree module is
# used because xml.etree.cElementTree no longer exists on Python 3.9+.
tree = ElementTree.ElementTree(root)

# Since ElementTree write() has no pretty printing support, minidom is used to
# beautify the XML: serialise the tree, let minidom indent it, then parse the
# indented text back so ElementTree can write it with an XML declaration.
t = minidom.parseString(ElementTree.tostring(root)).toprettyxml()
tree1 = ElementTree.ElementTree(ElementTree.fromstring(t))

tree1.write("Regulation - 1404-2013-11-22 - All chapters.xml", encoding='utf-8', xml_declaration=True)