In [1]:
import re
from datetime import datetime
from dateutil.parser import parse

import xml.dom.minidom
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup as bs

#### Reading and formatting the files

In [2]:
original_file = "./data/rezeptdatenbank.WordPress.2019-07-29_Rezepte.xml"
cleaned_file = "./data/rezeptdatenbank.WordPress.2019-07-29_Rezeptev2.xml"

In [3]:
with open(original_file, "r") as f:
    content = f.read()

In [4]:
content = content.replace('\t','')
content = content.replace('<content:encoded>\n','<content:encoded>')
content = content.replace('<excerpt:encoded>\n','<excerpt:encoded>')
content = content.replace('<title>\n','<title>')
content = content.replace('</item>','</item>\n')
content = content.replace('<wp:author>','<wp:author>\n')
content = content.replace('</wp:author>','</wp:author>\n')
content = content.replace('</wp:author_id>','</wp:author_id>\n')
content = content.replace('</wp:author_login>','</wp:author_login>\n')
content = content.replace('</wp:author_email>','</wp:author_email>\n')
content = content.replace('</wp:author_display_name>','</wp:author_display_name>\n')
content = content.replace('</wp:author_first_name>','</wp:author_first_name>\n')
content = content.replace('</wp:author_last_name>','</wp:author_last_name>\n')
content = content.replace('&nbsp;\n\n','')
content = re.sub(r'(<!\[CDATA\[)(.*)(\]\]>)','\\2', content)

#### Saving a formatted version of the file with all recipes

In [5]:
with open(cleaned_file, "w") as f:
    f.write(content)

#### Finding each individual recipe and saving on a list

In [6]:
def find_recipe(text):
    pos1 = text.find('<item>')
    pos2 = text.find('</item>')+7
    return pos1, pos2

In [7]:
pos1 = 0
text = content
recipes = []
while pos1 != -1:
    pos1, pos2 = find_recipe(text)
    recipes.append(text[pos1:pos2])
    text =  text[pos2+1:]

In [8]:
len(recipes)

7037

#### Discarding recipes without content

In [9]:
recipes_notnull = [r for r in recipes if '<content:encoded></content:encoded>' not in r]

In [10]:
len(recipes_notnull)

6893

#### Converting files to TEI-XML format

In [11]:
template = '''<?xml version="1.0" encoding="UTF-8" ?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>{0}</title>
            </titleStmt>
            <publicationStmt>
                <publisher>
                    <orgName>Des Zentrums für Gastrosophie</orgName>
                    <ptr target="http://gastrosophie.sbg.ac.at/"/>
                </publisher>
                <date when="{1}"/>
            </publicationStmt>
            <sourceDesc>
                <biblStruct>
                    <monogr>
                        <title>{0}</title>
                        <title type="alt">{2}</title>
                        <note>{3}</note>
                        <author>
                            <persName xml:id="person1">
                                <forename>Marlene</forename>
                                <surname>Ernst</surname>
                                <email>marlene.ernst@sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person32">
                                <forename>Christine</forename>
                                <surname>Überei</surname>
                                <email>christine.ueberei@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person33">
                                <forename>Juliane</forename>
                                <surname>Wiemerslage</surname>
                                <email>Jwiemerslage@web.de</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person30">
                                <forename>Irene</forename>
                                <surname>Tripp</surname>
                                <email>irenetripp@hotmail.com</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person19">
                                <forename>Lukas</forename>
                                <surname>Fallwickl</surname>
                                <email>Lukas_Andreas.Fallwickl@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person20">
                                <forename>Magdalena</forename>
                                <surname>Bogenhuber</surname>
                                <email>magdalena.bogenhuber@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person29">
                                <forename>Andrea</forename>
                                <surname>Sobieszek</surname>
                                <email>A.Sobieszek@utanet.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person18">
                                <forename>Mae</forename>
                                <surname>Lene</surname>
                                <email>marlene.ernst@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person28">
                                <forename>Sabine</forename>
                                <surname>Schwaiger</surname>
                                <email>sabine.schwaiger@a1.net</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person36">
                                <forename>Lotte</forename>
                                <surname>Caglar</surname>
                                <email>lotte.caglar@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person38">
                                <forename>Wolfram</forename>
                                <surname>Kracker</surname>
                                <email>wolfram.kracker@aon.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person39">
                                <forename>Waltraud</forename>
                                <surname>Ernst</surname>
                                <email>wernst@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person35">
                                <forename>Marlies</forename>
                                <surname>Berger</surname>
                                <email>marlies.berger@sbg.ac.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person26">
                                <forename>Helga</forename>
                                <surname>Kraihamer</surname>
                                <email>h.kraihamer@gmail.com</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person37">
                                <forename>Margit</forename>
                                <surname>Gull</surname>
                                <email>m.gull@gmx.at</email>
                            </persName>
                        </author>
                        <author>
                            <persName xml:id="person41">
                                <forename>Maximilian</forename>
                                <surname>Schmidauer</surname>
                                <email>maximilian.schmidauer@stud.sbg.ac.at</email>
                            </persName>
                        </author>
                        <textLang mainLang="deu"/>
                        <imprint>
                            <biblScope unit="volume">{4}</biblScope>
                            <biblScope unit="chapter">{5}</biblScope>
                            <biblScope unit="entry">{6}</biblScope>
                        </imprint>
                    </monogr>
                </biblStruct>
                <msDesc>
                    <msIdentifier>
                        <repository>Rezeptdatenbank</repository>
                        <idno>{7}</idno>
                    </msIdentifier>
                </msDesc>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <div type="ingredients">
                <listObject>
\t\t\t{8}
                 </listObject>
            </div>
             <div type="utensils">
                <listObject>
\t\t\t{9}
                </listObject>
            </div>
            <div type="dietetic">
                <listObject>
\t\t\t{10}
                </listObject>
            </div>
        <div type="recipe">
        <p>{11}</p>
            </div>
        </body>
    </text>
</TEI>
'''

In [12]:
def extract_features(raw):
    soup = bs(raw, "lxml")
    #0 title
    title=''
    title = soup.find('title').text
    #print(title)

    #1 tei_formatted_date
    tei_formatted_date = ''
    declared_date = parse(soup.find('pubdate').text)
    tei_formatted_date = f'{declared_date.year}-{declared_date.month:02d}-{declared_date.day}' 

    #2 translation
    translation = ''
    trans_pos1 = raw.find('<wp:meta_key>translation</wp:meta_key>\n<wp:meta_value>')
    if trans_pos1 != -1:
        trans_pos2 = raw.find('</wp:meta_value>', trans_pos1)
        translation = raw[trans_pos1+54:trans_pos2]

    #3 anmerkungen
    anmerkungen = ''
    note_pos1 = raw.find('<wp:meta_key>anmerkungen</wp:meta_key>\n<wp:meta_value>')
    if note_pos1 != -1:
        note_pos2 = raw.find('</wp:meta_value>', note_pos1)
        anmerkungen = raw[note_pos1+54:note_pos2]
       

    #4 volume, #5 chapter and #6 entry
    number = ''
    number_pos1 = raw.find('<wp:meta_key>number</wp:meta_key>\n<wp:meta_value>')
    if number_pos1 != -1:
        number_pos2 = raw.find('</wp:meta_value>', number_pos1)
        number = raw[number_pos1+49:number_pos2]

    #7 idno
    idno = 0
    idno = soup.find('guid').text[soup.find('guid').text.find('p=')+2:]

    #8 Zutaten, #9 MaterielleKultur and #10 diaetetik
    zutaten = ''
    materielle = ''
    diaetetik = ''
    objects = soup.findAll('category')
    for obj in objects:
        if obj['domain'] == 'Zutaten':
            zutaten = zutaten + '<object type="Zutaten" xml:id="{}">{}</object>\n\t\t\t'.format(obj['nicename'], obj.text)
        elif obj['domain'] == 'MaterielleKultur':
            materielle = materielle + '<object type="MaterielleKultur" xml:id="{}">{}</object>\n\t\t\t'.format(obj['nicename'], obj.text)
        elif obj['domain'] == 'Diaetetik':
            diaetetik = diaetetik + '<object type="Diaetetik" xml:id="{}">{}</object>\n\t\t\t'.format(obj['nicename'], obj.text)
    if zutaten:
        zutaten = zutaten[:-4]
    if materielle:
        materielle = materielle[:-4]
    if diaetetik:
        diaetetik = diaetetik[:-4]
            
    #11 recipe_text
    recipe_text = ''
    recipe_text = soup.find('content:encoded').text
    if recipe_text:
        recipe_text = recipe_text.replace('/','<lb/>') 

    
    return (title,
            tei_formatted_date,
            translation,
            anmerkungen,
            number, #volume,  #still need to parse bibliographic information
            number, #chapter, #still need to parse bibliographic information 
            number, #entry,   #still need to parse bibliographic information
            idno,
            zutaten,
            materielle,
            diaetetik,
            recipe_text
            )

#### Creating separate files using template

In [13]:
for idn, r in enumerate(recipes_notnull[0:10]):
    with open(f'./outputs/recipe_{idn}.xml', "w") as f:
        features = extract_features(r)
        recipe = template.format(*features)
        f.write(recipe)