In [186]:
import re
from datetime import datetime
from dateutil.parser import parse

import xml.dom.minidom
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup as bs

#### Reading and formatting the files

In [4]:
original_file = "./data/rezeptdatenbank.WordPress.2019-07-29_Rezepte.xml"
cleaned_file = "./data/rezeptdatenbank.WordPress.2019-07-29_Rezeptev2.xml"

In [207]:
with open(original_file, "r") as f:
    content = f.read()

In [208]:
content = content.replace('\t','')
content = content.replace('<content:encoded>\n','<content:encoded>')
content = content.replace('<excerpt:encoded>\n','<excerpt:encoded>')
content = content.replace('<title>\n','<title>')
content = content.replace('</item>','</item>\n')
content = content.replace('<wp:author>','<wp:author>\n')
content = content.replace('</wp:author>','</wp:author>\n')
content = content.replace('</wp:author_id>','</wp:author_id>\n')
content = content.replace('</wp:author_login>','</wp:author_login>\n')
content = content.replace('</wp:author_email>','</wp:author_email>\n')
content = content.replace('</wp:author_display_name>','</wp:author_display_name>\n')
content = content.replace('</wp:author_first_name>','</wp:author_first_name>\n')
content = content.replace('</wp:author_last_name>','</wp:author_last_name>\n')
content = re.sub(r'(<!\[CDATA\[)(.*)(\]\]>)','\\2', content)

#### Saving a formatted version of the file with all recipes

In [7]:
with open(cleaned_file, "w") as f:
    f.write(content)

#### Finding each individual recipe and saving on a list

In [8]:
def find_recipe(text):
    pos1 = text.find('<item>')
    pos2 = text.find('</item>')+7
    return pos1, pos2

In [9]:
pos1 = 0
text = content
recipes = []
while pos1 != -1:
    pos1, pos2 = find_recipe(text)
    recipes.append(text[pos1:pos2])
    text =  text[pos2+1:]

In [10]:
len(recipes)

7037

#### Discarding recipes without content

In [11]:
recipes_notnull = [r for r in recipes if '<content:encoded><![CDATA[]]></content:encoded>' not in r]

In [12]:
len(recipes_notnull)

6893

#### Converting files to TEI-XML format

In [14]:
template = '''
<?xml version="1.0" encoding="UTF-8" ?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>{0}</title>
            </titleStmt>
            <publicationStmt>
                <publisher>
                    <orgName>Des Zentrums für Gastrosophie</orgName>
                    <ptr target="http://gastrosophie.sbg.ac.at/"/>
                </publisher>
                <date when="{1}"/>
            </publicationStmt>
            <sourceDesc>
                <biblStruct>
                    <monogr>
                        <title>{0}</title>
                        <title type="alt">{2}</title>
                        <note>{3}</note>
                        <author>
                            <persName xml:id="person1">
                                <forename>Marlene</forename>
                                <surname>Ernst</surname>
                                <email>marlene.ernst@sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person32">
                                <forename>Christine</forename>
                                <surname>Überei</surname>
                                <email>christine.ueberei@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person33">
                                <forename>Juliane</forename>
                                <surname>Wiemerslage</surname>
                                <email>Jwiemerslage@web.de</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person30">
                                <forename>Irene</forename>
                                <surname>Tripp</surname>
                                <email>irenetripp@hotmail.com</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person19">
                                <forename>Lukas</forename>
                                <surname>Fallwickl</surname>
                                <email>Lukas_Andreas.Fallwickl@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person20">
                                <forename>Magdalena</forename>
                                <surname>Bogenhuber</surname>
                                <email>magdalena.bogenhuber@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person29">
                                <forename>Andrea</forename>
                                <surname>Sobieszek</surname>
                                <email>A.Sobieszek@utanet.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person18">
                                <forename>Mae</forename>
                                <surname>Lene</surname>
                                <email>marlene.ernst@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person28">
                                <forename>Sabine</forename>
                                <surname>Schwaiger</surname>
                                <email>sabine.schwaiger@a1.net</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person36">
                                <forename>Lotte</forename>
                                <surname>Caglar</surname>
                                <email>lotte.caglar@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person38">
                                <forename>Wolfram</forename>
                                <surname>Kracker</surname>
                                <email>wolfram.kracker@aon.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person39">
                                <forename>Waltraud</forename>
                                <surname>Ernst</surname>
                                <email>wernst@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person35">
                                <forename>Marlies</forename>
                                <surname>Berger</surname>
                                <email>marlies.berger@sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person26">
                                <forename>Helga</forename>
                                <surname>Kraihamer</surname>
                                <email>h.kraihamer@gmail.com</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person37">
                                <forename>Margit</forename>
                                <surname>Gull</surname>
                                <email>m.gull@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person41">
                                <forename>Maximilian</forename>
                                <surname>Schmidauer</surname>
                                <email>maximilian.schmidauer@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <textLang mainLang="deu"/>
                        <imprint>
                            <biblScope unit="volume">{4}</biblScope>
                            <biblScope unit="chapter">{5}</biblScope>
                            <biblScope unit="entry">{6}</biblScope>
                        </imprint>
                    </monogr>
                </biblStruct>
                <msDesc>
                    <msIdentifier>
                        <repository>Rezeptdatenbank</repository>
                        <idno>{7}</idno>
                    </msIdentifier>
                </msDesc>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <div type="ingredients">
                <listObject>
                    {8}
                 </listObject>
            </div>
             <div type="utensils">
                <listObject>
                    {9}
                </listObject>
            </div>
        <div type="recipe">
        <p>{10}</p>
            </div>
        </body>
    </text>
</TEI>
'''

In [164]:
number = 0

In [165]:
soup = bs(recipes_notnull[number], "lxml")

#0
title = soup.find('title').text

#1
declared_date = parse(soup.find('pubdate').text)
tei_formatted_date = f'{declared_date.year}-{declared_date.month:02d}-{declared_date.day}' 

#2
trans_pos1 = recipes_notnull[number].find('<wp:meta_key><![CDATA[translation]]></wp:meta_key>\n<wp:meta_value><![CDATA[')
if trans_pos1 != -1:
    trans_pos2 = recipes_notnull[number].find(']]></wp:meta_value>', trans_pos1)
    translation = recipes_notnull[number][trans_pos1+75:trans_pos2]

#3
note_pos1 = recipes_notnull[number].find('<wp:meta_key><![CDATA[anmerkungen]]></wp:meta_key>\n<wp:meta_value><![CDATA[')
if note_pos1 != -1:
    note_pos2 = recipes_notnull[number].find(']]></wp:meta_value>', note_pos1)
    anmerkungen = recipes_notnull[number][note_pos1+75:note_pos2]

#4

#5

#6

#7
idno = soup.find('guid').text[soup.find('guid').text.find('p=')+2:]

#8 and 9

#10
recipe_pos1 = recipes_notnull[number].find('<content:encoded><![CDATA[')
if recipe_pos1 != -1:
    recipe_pos2 = recipes_notnull[number].find(']]></content:encoded>', recipe_pos1)
    recipe_text = recipes_notnull[number][recipe_pos1+26:recipe_pos2]

In [185]:
number = 0
soup = bs(recipes_notnull[0], "lxml")
ingredients = soup.findAll('category')
for ing in ingredients:
    if ing['domain'] in ['Zutaten','MaterielleKultur', 'Diaetetik']:
        print(ing['domain'], ing['nicename'], ing.text)

Zutaten austern 
Zutaten eier 
Zutaten kapaun 
Zutaten pistazien 
Zutaten zimt 
MaterielleKultur zinnschuessel 
Zutaten zitronensaft 


In [169]:



recipe_pos1 = recipes_notnull[number].find('<content:encoded><![CDATA[')
if recipe_pos1 != -1:
    recipe_pos2 = recipes_notnull[number].find(']]></content:encoded>', recipe_pos1)
    recipe_text = recipes_notnull[number][recipe_pos1+26:recipe_pos2] #8
print(recipe_text)

&nbsp;

MAch ein gutes Gehäck von gesottner Lungen / saltz / gewürtz / und röste es mit geriebenem Brod im Butter / schlag Eyer daran / mit wenig fetter Fleischbrühe / rührs ab / wie einen andern Knödlein= Taig / mach die Strützlein von Nudeln= oder mürben Taig / über einen Koch= Löffel= Stiel / benetz den halben Stengel mit abgeschlagenen Eyern / mach die Maurachen vom Gehäck daran / eins nach dem andern / und schiebs in Ofen / oder bache es im Schmaltz.


In [161]:
len('<content:encoded><![CDATA[')

26

In [None]:
recipe = template.format(title,
                         tei_formatted_date,
                         translation,
                         anmerkungen,
                         volume,
                         chapter,
                         entry,
                         idno,
                         ingredients,
                         recipe_text
                        )

#### Creating separate files using template

In [15]:
for idn, r in enumerate(recipes_notnull):
    with open(f'./outputs/recipe_{idn}.xml', "w") as f:
        f.write(header + r + footer)

#### Alternative ways to navigate the contents

In [None]:
soup = bs(content, "lxml")
#result = soup.find_all('content:encoded')
result = soup.find('item')
print(len(list(result)))

In [None]:
doc = xml.dom.minidom.parse(xmlfile)
tree = ET.parse(xmlfile)

In [None]:
recipes = doc.getElementsByTagName("item")
len(recipes)

In [None]:
root = tree.getroot()
print(root)

In [None]:
for child in root:
    print(child.tag, child.attrib)

In [None]:
set([elem.tag for elem in root.iter()])

In [None]:
for event, elem in ET.iterparse(xmlfile):
    if elem.tag == "item":
        print(repr(elem.findtext("link")))
        elem.clear() # won't need this again
        break

In [None]:
for event, elem in ET.iterparse(xmlfile):
    if elem.tag == "title":
        print repr(elem.text)
        break # we're done