# Convert html to xml

In [1]:
# Open html document
document = open('Data/Clay.html', mode = 'r', encoding='utf-8')
document

<_io.TextIOWrapper name='Data/Clay.html' mode='r' encoding='utf-8'>

In [2]:
# Read html document
document.read()

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n\n\n<div class="chapter">\n\n<h2><a name="chap10"></a>CLAY</h2>\n\n<p>\nThe matron had given her leave to go out as soon as the women&rsquo;s tea was\nover and Maria looked forward to her evening out. The kitchen was spick and\nspan: the cook said you could see yourself in the big copper boilers. The fire\nwas nice and bright and on one of the side-tables were four very big\nbarmbracks. These barmbracks seemed uncut; but if you went closer you would see\nthat they had been cut into long thick even slices and were ready to be handed\nround at tea. Maria had cut them herself.\n</p>\n\n<p>\nMaria was a very, very small person indeed but she had a very long nose and a\nvery long chin. She talked a little through her nose, always soothingly:\n<i>&ldquo;Yes, my dear,&rdquo;</i> and <i>&ldquo;No, my dear.&rdquo;</

In [3]:
# Installing lxml, if it hasn't been installed
# lxml is used for parsing and manipulating XML and HTML documents
!pip install lxml



In [4]:
# Parse html document
from lxml import html
document = open('Data/Clay.html', mode = 'r', encoding='utf-8')
htmldoc = html.fromstring(document.read())
htmldoc

<Element html at 0x137081ed900>

In [5]:
# Open a output.xml file and write the element/document to an encoded string representation of its XML tree
from lxml import etree
open("Data/Clay.xml", 'wb').write(etree.tostring(htmldoc)) # "wb" for binary mode for xml files

14591

In [6]:
# Load the xml file
xml_document = open('Data/Clay.xml', mode = 'r')
xml_document.read()

'<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" xml:lang="en">\n\n\n<body><div class="chapter">\n\n<h2><a name="chap10" id="chap10"></a>CLAY</h2>\n\n<p>\nThe matron had given her leave to go out as soon as the women&#8217;s tea was\nover and Maria looked forward to her evening out. The kitchen was spick and\nspan: the cook said you could see yourself in the big copper boilers. The fire\nwas nice and bright and on one of the side-tables were four very big\nbarmbracks. These barmbracks seemed uncut; but if you went closer you would see\nthat they had been cut into long thick even slices and were ready to be handed\nround at tea. Maria had cut them herself.\n</p>\n\n<p>\nMaria was a very, very small person indeed but she had a very long nose and a\nvery long chin. She talked a little through her nose, always soothingly:\n<i>&#8220;Yes, my dear,&#8221;</i> and <i>&#8220;No, my dear.&#8221;</i> She\nwas always sent for when the women 

In [7]:
# Use BeautifulSoup to parse the xml
from bs4 import BeautifulSoup
xml_document = open('Data/Clay.xml', mode = 'r')
soup = BeautifulSoup(xml_document, 'lxml')
soup

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<body><div class="chapter">
<h2><a id="chap10" name="chap10"></a>CLAY</h2>
<p>
The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
</p>
<p>
Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
<i>“Yes, my dear,”</i> and <i>“No, my dear.”</i> She
was always sent for when the women quarrelled over their tubs and always
succeeded in making peace. One day the matron had said to her:
</p>
<p

In [8]:
# Prettify the parsed xml
print(soup.prettify())

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <body>
  <div class="chapter">
   <h2>
    <a id="chap10" name="chap10">
    </a>
    CLAY
   </h2>
   <p>
    The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
   </p>
   <p>
    Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
    <i>
     “Yes, my dear,”
    </i>
    and
    <i>
     “No, my dear.”
    </i>
    She
was always sent for when the women quarrelled over their tubs 

# Converting to TEI Following the TEI guidelines

In [9]:
# Update root element to align with TEI guidelines (Chapter 2)
root = soup.find()
root.name = 'TEI'
del root['lang']
del root['xml:lang']
root['xmlns'] = 'http://www.tei-c.org/ns/1.0'
root

<TEI xmlns="http://www.tei-c.org/ns/1.0">
<body><div class="chapter">
<h2><a id="chap10" name="chap10"></a>CLAY</h2>
<p>
The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
</p>
<p>
Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
<i>“Yes, my dear,”</i> and <i>“No, my dear.”</i> She
was always sent for when the women quarrelled over their tubs and always
succeeded in making peace. One day the matron had said to her:
</p>
<p>
“Maria, you are a verita

In [10]:
# Update the div element to align with TEI guidelines (Chapter 3.1)
body = soup.find('body')
div = body.find('div')
del div['class']
div['xml:id'] = 'DUB10'
div['n'] = 10
div['type'] = 'chapter'
div

<div n="10" type="chapter" xml:id="DUB10">
<h2><a id="chap10" name="chap10"></a>CLAY</h2>
<p>
The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
</p>
<p>
Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
<i>“Yes, my dear,”</i> and <i>“No, my dear.”</i> She
was always sent for when the women quarrelled over their tubs and always
succeeded in making peace. One day the matron had said to her:
</p>
<p>
“Maria, you are a veritable peace-maker!”
</p>
<p>


In [11]:
# Update the header of the chapter to align with TEI guidelines (Chapter 3.2)
head = div.find('h2')
head.name = 'head'
a = head.find('a')
a.extract()
div

<div n="10" type="chapter" xml:id="DUB10">
<head>CLAY</head>
<p>
The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
</p>
<p>
Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
<i>“Yes, my dear,”</i> and <i>“No, my dear.”</i> She
was always sent for when the women quarrelled over their tubs and always
succeeded in making peace. One day the matron had said to her:
</p>
<p>
“Maria, you are a veritable peace-maker!”
</p>
<p>
And the sub-matron and two of

In [12]:
# Update the poem paragraph to align with TEI guidelines (Chapter 3.3.1)
poem_paragraph = div.find('p', 'poem') # find the poem paragraph
del poem_paragraph['class']
poem_paragraph.name = 'lg'
poem_paragraph['rhyme'] = 'ABAB'
poem_text = poem_paragraph.find('i')
poem_text.unwrap() # Move the contents of <i> directly under <lg>
poem_paragraph

<lg rhyme="ABAB">
I dreamt that I dwelt in marble halls<br/>
    With vassals and serfs at my side<br/>
And of all who assembled within those walls<br/>
    That I was the hope and the pride.<br/>
I had riches too great to count, could boast<br/>
    Of a high ancestral name,<br/>
But I also dreamt, which pleased me most,<br/>
    That you loved me still the same.
</lg>

In [13]:
# Split by lines and get the context
poem_text = poem_paragraph.stripped_strings
for line in poem_text:
    print(line)

I dreamt that I dwelt in marble halls
With vassals and serfs at my side
And of all who assembled within those walls
That I was the hope and the pride.
I had riches too great to count, could boast
Of a high ancestral name,
But I also dreamt, which pleased me most,
That you loved me still the same.


In [14]:
# Update each line of the poem paragraph and replace the original poem_paragraph with new_poem_paragraph
poem_text = poem_paragraph.stripped_strings
new_poem_paragraph = soup.new_tag('lg', rhyme=poem_paragraph['rhyme']) # create a new_poem_paragraph

count = 0 # this is used to calculate odds and even lines (for rhyme patterns)
for line in poem_text:
    new_tag = soup.new_tag('l') # create a new <l> tag
    new_tag.string = ' '.join(line.split()[:-1]) + ' ' # set the new tag to be the line without the rhyme word
    if count %2 == 0: # if 1,3,5,7 lines
        rhyme_tag = soup.new_tag('rhyme', label='A') # rhyme label is "A"
        if line.split()[-1][-1].isalpha(): # if last character of the last element after splitting is a character
            rhyme_tag.string = line.split()[-1] # the last element after splitting is the rhyme word
            new_tag.append(rhyme_tag)
        else: # if last character of the last element after splitting is a punctuation
            rhyme_tag.string = line.split()[-1][:-1] # the last element without the last character is the rhyme word
            new_tag.append(rhyme_tag)
            new_tag.append(line.split()[-1][-1]) # add the punctuation after the rhyme word ends
    else: # if 2,4,6,8 lines
        rhyme_tag = soup.new_tag('rhyme', label='B') # rhyme label is "B"
        if line.split()[-1][-1].isalpha(): # if last character of the last element after splitting is a character
            rhyme_tag.string = line.split()[-1] # the last element after splitting is the rhyme word
            new_tag.append(rhyme_tag)
        else: # if last character of the last element after splitting is a punctuation
            rhyme_tag.string = line.split()[-1][:-1] # the last element without the last character is the rhyme word
            new_tag.append(rhyme_tag)
            new_tag.append(line.split()[-1][-1]) # add the punctuation after the rhyme word ends
    new_poem_paragraph.append(new_tag)
    count += 1
new_poem_paragraph

<lg rhyme="ABAB"><l>I dreamt that I dwelt in marble <rhyme label="A">halls</rhyme></l><l>With vassals and serfs at my <rhyme label="B">side</rhyme></l><l>And of all who assembled within those <rhyme label="A">walls</rhyme></l><l>That I was the hope and the <rhyme label="B">pride</rhyme>.</l><l>I had riches too great to count, could <rhyme label="A">boast</rhyme></l><l>Of a high ancestral <rhyme label="B">name</rhyme>,</l><l>But I also dreamt, which pleased me <rhyme label="A">most</rhyme>,</l><l>That you loved me still the <rhyme label="B">same</rhyme>.</l></lg>

In [15]:
# Replace the poem paragraph with the newly created paragraph
poem_paragraph.replace_with(new_poem_paragraph)
div

<div n="10" type="chapter" xml:id="DUB10">
<head>CLAY</head>
<p>
The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
</p>
<p>
Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
<i>“Yes, my dear,”</i> and <i>“No, my dear.”</i> She
was always sent for when the women quarrelled over their tubs and always
succeeded in making peace. One day the matron had said to her:
</p>
<p>
“Maria, you are a veritable peace-maker!”
</p>
<p>
And the sub-matron and two of

In [16]:
# Update quotations to align with TEI guidelines (Chapter 3.5.2)
paragraphs = div.find_all('p')
for paragraph in paragraphs:
    quotations = paragraph.find_all('i') # quote and quotations are in italic
    for quotation in quotations:
        if quotation.text[0] == '“': # if it is a direct quote
            quotation.name = 'q' # we use quoted: "contains material which is distinguished from the surrounding text using
                                 # quotation marks or a similar method, for any one of a variety of reasons including, but not limited to: 
                                 # direct speech or thought, technical terms or jargon, authorial distance, quotations from elsewhere, 
                                 # and passages that are mentioned but not used
            print(quotation) 
        else: # if it is not a direct quote
            quotation.name = 'q' # we use quotation: "(quotation) contains a phrase or passage attributed
                                 # by the narrator or author to some agency external to the text."
            print(quotation)

<q>“Yes, my dear,”</q>
<q>“No, my dear.”</q>
<q>A Present from Belfast</q>
<q>Dublin
by Lamplight</q>
<q>“O, here’s Maria!”</q>
<q>O, I know all about it!</q>
<q>“Do, please, Maria!”</q>
<q>“Now,
Maria!”</q>
<q>I Dreamt that I Dwelt</q>


In [17]:
print(soup.prettify())

<TEI xmlns="http://www.tei-c.org/ns/1.0">
 <body>
  <div n="10" type="chapter" xml:id="DUB10">
   <head>
    CLAY
   </head>
   <p>
    The matron had given her leave to go out as soon as the women’s tea was
over and Maria looked forward to her evening out. The kitchen was spick and
span: the cook said you could see yourself in the big copper boilers. The fire
was nice and bright and on one of the side-tables were four very big
barmbracks. These barmbracks seemed uncut; but if you went closer you would see
that they had been cut into long thick even slices and were ready to be handed
round at tea. Maria had cut them herself.
   </p>
   <p>
    Maria was a very, very small person indeed but she had a very long nose and a
very long chin. She talked a little through her nose, always soothingly:
    <q>
     “Yes, my dear,”
    </q>
    and
    <q>
     “No, my dear.”
    </q>
    She
was always sent for when the women quarrelled over their tubs and always
succeeded in making peace. One da

# Tasks

Please use the new HTML document "A_Little_Cloud_except.html" and complete the following tasks:<br>
(1) Convert the provided HTML document to an XML document, and parse the xml document using Beautiful Soup (you may copy the codes above and simply modify the file directory accordingly). (0.5 points);<br>
(2) Update the root element, the div element, and the header to align with the TEI guidelines (you may copy the codes above and change the "div['xml:id']" and "div['n']"). (0.5 points);<br>
(3) Update the poem paragraphs and each line of the poem paragraphs, and replace the poem paragraphs with the new paragraphs elements (you may copy the codes above. Since there are multiple poem paragraphs in the document, consider implementing a for loop to update each of them, and create new paragraphs elements to replace each poem paragraphs). (0.5 points);<br>
(4) There is no \<i> element in this document. Is there a way to identify the quotations and quotes in the document? Write a short paragraph on how you would solve this issue (you have to propose a solution, or at least outline some ideas to tackle this issue. Actual code implementation is not required). (0.5 points);<br>
(5) Reflect on the process of converting HTML to TEI. Share your insights and takeaways, focusing on various aspects. Consider discussing: (a) What are the challenges encountered during the conversion process? (b) Whether aligning with TEI guidelines is useful? Why or why not? Feel free to provide any other perspectives you have gained from this task. (0.5 points)

In [18]:
import re
new_document = open('Data/A_Little_Cloud_excerpt.html', mode = 'r', encoding='utf-8')

htmldoc = html.fromstring(new_document.read())
open("Data/A_Little_Cloud_excerpt.xml", 'wb').write(etree.tostring(htmldoc))
xml_document = open('Data/A_Little_Cloud_excerpt.xml', mode = 'r')
soup = BeautifulSoup(xml_document, 'lxml')

root = soup.find()
root.name = 'TEI'
del root['lang']
del root['xml:lang']
root['xmlns'] = 'http://www.tei-c.org/ns/1.0'

body = soup.find('body')
div = body.find('div')
del div['class']
div['xml:id'] = 'DUB08'
div['n'] = 8
div['type'] = 'chapter'

head = div.find('h2')
head.name = 'head'
a = head.find('a')
a.extract()

poem_paragraphs = div.find_all('p', 'poem')
for poem_paragraph in poem_paragraphs:
    del poem_paragraph['class']
    poem_paragraph.name = 'lg'
    poem_paragraph['rhyme'] = 'ABAB'
    poem_text = poem_paragraph.find('i')
    poem_text.unwrap()
    
for poem_paragraph in poem_paragraphs:
    poem_text = poem_paragraph.stripped_strings
    new_poem_paragraph = soup.new_tag('lg', rhyme=poem_paragraph['rhyme'])
    
    count = 0
    for line in poem_text:
        new_tag = soup.new_tag('l')
        new_tag.string = ' '.join(line.split()[:-1]) + ' '
        if count %2 == 0:
            rhyme_tag = soup.new_tag('rhyme', label='A')
            matches = re.findall(r'(\w+|[^\w]+)', line.split()[-1])
            if len(matches)==1:
                rhyme_tag.string = matches[0]
                new_tag.append(rhyme_tag)
            else:
                rhyme_tag.string = matches[0]
                new_tag.append(rhyme_tag)
                new_tag.append(matches[1])
        else:
            rhyme_tag = soup.new_tag('rhyme', label='B')
            matches = re.findall(r'(\w+|[^\w]+)', line.split()[-1])
            if len(matches)==1:
                rhyme_tag.string = matches[0]
                new_tag.append(rhyme_tag)
            else:
                rhyme_tag.string = matches[0]
                new_tag.append(rhyme_tag)
                new_tag.append(matches[1])
        new_poem_paragraph.append(new_tag)
        count += 1
    poem_paragraph.replace_with(new_poem_paragraph)

print(soup.prettify())

<TEI xmlns="http://www.tei-c.org/ns/1.0">
 <body>
  <div n="8" type="chapter" xml:id="DUB08">
   <head>
    A LITTLE CLOUD
   </head>
   <p>
    Little Chandler sat in the room off the hall, holding a child in his arms. To
save money they kept no servant but Annie’s young sister Monica came for
an hour or so in the morning and an hour or so in the evening to help. But
Monica had gone home long ago. It was a quarter to nine. Little Chandler had
come home late for tea and, moreover, he had forgotten to bring Annie home the
parcel of coffee from Bewley’s. Of course she was in a bad humour and
gave him short answers. She said she would do without any tea but when it came
near the time at which the shop at the corner closed she decided to go out
herself for a quarter of a pound of tea and two pounds of sugar. She put the
sleeping child deftly in his arms and said:
   </p>
   <p>
    “Here. Don’t waken him.”
   </p>
   <p>
    A little lamp with a white china shade stood upon the table and i