In [46]:
from datetime import datetime
from dateutil.parser import parse

import xml.dom.minidom
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup as bs

#### Reading and formatting the files

In [4]:
xmlfile1 = "./data/rezeptdatenbank.WordPress.2019-07-29_Rezepte.xml"
xmlfile2 = "./data/rezeptdatenbank.WordPress.2019-07-29_Rezeptev2.xml"

In [5]:
with open(xmlfile1, "r") as f:
    content = f.read()

In [6]:
content = content.replace('\t','')
content = content.replace('<content:encoded>\n','<content:encoded>')
content = content.replace('<excerpt:encoded>\n','<excerpt:encoded>')
content = content.replace('<title>\n','<title>')
content = content.replace('</item>','</item>\n')
content = content.replace('<wp:author>','<wp:author>\n')
content = content.replace('</wp:author>','</wp:author>\n')
content = content.replace('</wp:author_id>','</wp:author_id>\n')
content = content.replace('</wp:author_login>','</wp:author_login>\n')
content = content.replace('</wp:author_email>','</wp:author_email>\n')
content = content.replace('</wp:author_display_name>','</wp:author_display_name>\n')
content = content.replace('</wp:author_first_name>','</wp:author_first_name>\n')
content = content.replace('</wp:author_last_name>','</wp:author_last_name>\n')

#### Saving a formatted version of the file with all recipes

In [7]:
with open(xmlfile2, "w") as f:
    f.write(content)

#### Finding each individual recipe and saving on a list

In [8]:
def find_recipe(text):
    pos1 = text.find('<item>')
    pos2 = text.find('</item>')+7
    return pos1, pos2

In [9]:
pos1 = 0
text = content
recipes = []
while pos1 != -1:
    pos1, pos2 = find_recipe(text)
    recipes.append(text[pos1:pos2])
    text =  text[pos2+1:]

In [10]:
len(recipes)

7037

#### Discarding recipes without content

In [11]:
recipes_notnull = [r for r in recipes if '<content:encoded><![CDATA[]]></content:encoded>' not in r]

In [12]:
len(recipes_notnull)

6893

In [17]:
print(recipes[0])

<item>
<title>König Suppen Zumachen</title>
<link>http://gastrosophie.sbg.ac.at/kbforschung/r-datenbank/?rdb_rezepte=koenig-suppen-zumachen</link>
<pubDate>Tue, 22 Apr 2014 07:48:11 +0000</pubDate>
<dc:creator><![CDATA[mernst]]></dc:creator>
<guid isPermaLink="false">http://gastrosophie.sbg.ac.at/kbforschung/r-datenbank/?post_type=rdb_rezepte&#038;p=163</guid>
<description></description>
<content:encoded><![CDATA[Nimbe ein Guetten Coppaun*, siede* / ihme marb*, nimb die Prust von dem / Coppaun, hackh sye gar khlein, Nimb / das March* vnnd die Nirn auch eiterl*, / vnnd Wasche es Sauber aus, khere* sye in / Mell vmb, Pache* sye in butter, das / sye Zimblich Resch* Werden, thue daß / Gehackte in ein Zinnene* schisßl*, Gies* / ein Coppauner* oder henner brie* daran, / seze es Auf ein Gluett*, lasse es siden*,. / Wan du es Anrichten Wilst, so Giesse / ein Dotter Suppen* daran, Nimb / lemony*, trucke den Safft darein, Nimb / Pistazy*, Ziechs ab*, thue es auf die / Suppen, Ziere den schisßl r

#### Converting files to TEI-XML format

In [14]:
template = '''
<?xml version="1.0" encoding="UTF-8" ?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>{0}</title>
            </titleStmt>
            <publicationStmt>
                <publisher>
                    <orgName>Des Zentrums für Gastrosophie</orgName>
                    <ptr target="http://gastrosophie.sbg.ac.at/"/>
                </publisher>
                <date when="{1}"/>
            </publicationStmt>
            <sourceDesc>
                <biblStruct>
                    <monogr>
                        <title>{0}</title>
                        <title type="alt">{2}</title>
                        <note>{3}</note>
                        <author>
                            <persName xml:id="person1">
                                <forename>Marlene</forename>
                                <surname>Ernst</surname>
                                <email>marlene.ernst@sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person32">
                                <forename>Christine</forename>
                                <surname>Überei</surname>
                                <email>christine.ueberei@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person33">
                                <forename>Juliane</forename>
                                <surname>Wiemerslage</surname>
                                <email>Jwiemerslage@web.de</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person30">
                                <forename>Irene</forename>
                                <surname>Tripp</surname>
                                <email>irenetripp@hotmail.com</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person19">
                                <forename>Lukas</forename>
                                <surname>Fallwickl</surname>
                                <email>Lukas_Andreas.Fallwickl@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person20">
                                <forename>Magdalena</forename>
                                <surname>Bogenhuber</surname>
                                <email>magdalena.bogenhuber@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person29">
                                <forename>Andrea</forename>
                                <surname>Sobieszek</surname>
                                <email>A.Sobieszek@utanet.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person18">
                                <forename>Mae</forename>
                                <surname>Lene</surname>
                                <email>marlene.ernst@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person28">
                                <forename>Sabine</forename>
                                <surname>Schwaiger</surname>
                                <email>sabine.schwaiger@a1.net</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person36">
                                <forename>Lotte</forename>
                                <surname>Caglar</surname>
                                <email>lotte.caglar@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person38">
                                <forename>Wolfram</forename>
                                <surname>Kracker</surname>
                                <email>wolfram.kracker@aon.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person39">
                                <forename>Waltraud</forename>
                                <surname>Ernst</surname>
                                <email>wernst@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person35">
                                <forename>Marlies</forename>
                                <surname>Berger</surname>
                                <email>marlies.berger@sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person26">
                                <forename>Helga</forename>
                                <surname>Kraihamer</surname>
                                <email>h.kraihamer@gmail.com</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person37">
                                <forename>Margit</forename>
                                <surname>Gull</surname>
                                <email>m.gull@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person41">
                                <forename>Maximilian</forename>
                                <surname>Schmidauer</surname>
                                <email>maximilian.schmidauer@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <textLang mainLang="deu"/>
                        <imprint>
                            <biblScope unit="volume">{4}</biblScope>
                            <biblScope unit="chapter">{5}</biblScope>
                            <biblScope unit="entry">{6}</biblScope>
                        </imprint>
                    </monogr>
                </biblStruct>
                <msDesc>
                    <msIdentifier>
                        <repository>Rezeptdatenbank</repository>
                        <idno>{7}</idno>
                    </msIdentifier>
                </msDesc>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
        {8}
        {9}
        </body>
    </text>
</TEI>
'''

In [143]:
soup = bs(recipes_notnull[number], "lxml")

title = soup.find('title').text #0

declared_date = parse(soup.find('pubdate').text)
tei_formatted_date = f'{declared_date.year}-{declared_date.month:02d}-{declared_date.day}' #1

trans_pos1 = recipes_notnull[number].find('<wp:meta_key><![CDATA[translation]]></wp:meta_key>\n<wp:meta_value><![CDATA[')
if trans_pos1 != -1:
    trans_pos2 = recipes_notnull[number].find(']]></wp:meta_value>', trans_pos1)
    translation = recipes_notnull[number][trans_pos1+75:trans_pos2] #2

note_pos1 = recipes_notnull[number].find('<wp:meta_key><![CDATA[anmerkungen]]></wp:meta_key>\n<wp:meta_value><![CDATA[')
if note_pos1 != -1:
    note_pos2 = recipes_notnull[number].find(']]></wp:meta_value>', note_pos1)
    anmerkungen = recipes_notnull[number][note_pos1+75:note_pos2] #3

idno = soup.find('guid').text[soup.find('guid').text.find('p=')+2:] #7



In [158]:
soup = bs(recipes_notnull[0], "lxml")
idno = soup.find('guid').text[soup.find('guid').text.find('p=')+2:]
print(idno)

163


In [148]:
len('<wp:meta_key><![CDATA[translation]]></wp:meta_key>\n<wp:meta_value><![CDATA[')

75

In [None]:
recipe = template.format(title,
                         tei_formatted_date,
                         translation,
                         anmerkungen,
                         volume,
                         chapter,
                         entry,
                         idno,
                         ingredients,
                         recipe_text
                        )

#### Creating separate files using template

In [15]:
for idn, r in enumerate(recipes_notnull):
    with open(f'./outputs/recipe_{idn}.xml', "w") as f:
        f.write(header + r + footer)

#### Alternative ways to navigate the contents

In [None]:
soup = bs(content, "lxml")
#result = soup.find_all('content:encoded')
result = soup.find('item')
print(len(list(result)))

In [None]:
doc = xml.dom.minidom.parse(xmlfile)
tree = ET.parse(xmlfile)

In [None]:
recipes = doc.getElementsByTagName("item")
len(recipes)

In [None]:
root = tree.getroot()
print(root)

In [None]:
for child in root:
    print(child.tag, child.attrib)

In [None]:
set([elem.tag for elem in root.iter()])

In [None]:
for event, elem in ET.iterparse(xmlfile):
    if elem.tag == "item":
        print(repr(elem.findtext("link")))
        elem.clear() # won't need this again
        break

In [None]:
for event, elem in ET.iterparse(xmlfile):
    if elem.tag == "title":
        print repr(elem.text)
        break # we're done