# XMLPOS

## append_text

In [3]:
def append_text(text, str):
    if not str or str == "":
        return text 
    
    str = str.strip()
    if tcext == "":
        return str 
    if text[-1] == '¬':
        return text[:-1] + str 
    return text + " " + str 

print(f'<<{append_text("abc", "   def ")}>>')
print(f'<<{append_text("unge¬", "legen")}>>')

<<abc def>>
<<ungelegen>>


## TreeExtractor

In [30]:
import xml.etree.ElementTree as ET 

class TextExtractor:
    def __init__(self, root):
        self.root = root 
        
    def extract(self):
        self.text = ""
        self.extract_rec(self.root)
        return self.text
        
    def extract_rec(self, node):
        self.text = append_text(self.text, node.text)
        for c in node: 
            self.extract_rec(c)
        #self.text = append_text(self.text, node.tail)
        
e = TextExtractor(ET.fromstring('<a><b>1<c>2<d/>3</c></b>4<lb/>5</a>'))
print(f'<<{e.extract()}>>')

<<1 2>>


## Alternative 1: gather_text

In [18]:
def gather_text(node, text):
    text = append_text(text, node.text)
    for c in  node:
        text = gather_text(c, text)
    return text
    #return append_text(text, node.tail)

root = ET.fromstring('<a><b>1<c>2<d/>3</c></b>4<lb/>5</a>')
text = gather_text(root, "")
print(f'<<{text}>>')

<<1 2>>


## Alternative 2: gather_text_alt

In [24]:
def gather_text_alt(node):
    return gather_text_rec(node, "")

def gather_text_rec(node, text):
    text = append_text(text, node.text)
    for c in  node:
        text = gather_text_rec(c, text)
    return append_text(text, node.tail)
    
root = ET.fromstring('<a><b>1<c>2<d/>3</c></b>4<lb/>5</a>')
text = gather_text_alt(root)
print(f'<<{text}>>')

<<1 2 3 4 5>>


## Download der XML-Dateien

In [38]:
import urllib.request
url = "https://www.deutschestextarchiv.de/book/download_xml/altmann_elementarorganismen_1890"
# url = "https://www.deutschestextarchiv.de/book/download_xml/brandes_naturlehre03_1832"

with urllib.request.urlopen(url) as f:
    root = ET.fromstring(f.read())
    
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
e = TextExtractor(root.find('tei:text', ns))
text = e.extract()
print(text[1002:1100])

von Mohl Diese Definition des Protoplasmas hat ihre Geltung im Max Schultze Max Schultze Brücke Er


c## Download des [Spacy](https://spacy.io) Modells

In [41]:
%%bash
python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0-py3-none-any.whl (19.3 MB)
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


## Importe

In [48]:
import sys
!{sys.executable} -m pip install --upgrade pip spacy
import spacy
!{sys.executable} -m pip install --upgrade pip nltk
from nltk import sent_tokenize
import xml.dom.minidom



## POS-Tagging und Aufbau des XML-Baums

In [50]:


nlp = spacy.load('de_core_news_sm')
out = ET.Element('doc')
sid = 1 # sentence id
wid = 1 # word id

sents = sent_tokenize(text)
for sent in sents:
    stag = ET.SubElement(out, 's')
    stag.attrib = {'id': f's-{sid:05d}'}
    sid += 1
    tokens = nlp(sent)
    for token in tokens:
        wtag = ET.SubElement(stag, 'w')
        wtag.text = token.text
        wtag.attrib = {'pos': token.pos_, 'id': f'w-{wid:05d}'}
        wid += 1
dom = xml.dom.minidom.parseString(ET.tostring(out))
print(dom.toprettyxml())

<?xml version="1.0" ?>
<doc>
	<s id="s-00001">
		<w id="w-00001" pos="DET">DIE</w>
		<w id="w-00002" pos="PROPN">ELEMENTARORGANISMEN</w>
		<w id="w-00003" pos="PROPN">BEZIEHUNGEN</w>
		<w id="w-00004" pos="ADP">ZU</w>
		<w id="w-00005" pos="PROPN">DEN</w>
		<w id="w-00006" pos="PROPN">ZELLEN</w>
		<w id="w-00007" pos="PROPN">VON</w>
		<w id="w-00008" pos="SPACE"> </w>
		<w id="w-00009" pos="PROPN">RICHARD</w>
		<w id="w-00010" pos="PROPN">ALTMANN</w>
		<w id="w-00011" pos="ADP">MIT</w>
		<w id="w-00012" pos="PROPN">ZWEI</w>
		<w id="w-00013" pos="PROPN">ABBILDUNGEN</w>
		<w id="w-00014" pos="ADP">IM</w>
		<w id="w-00015" pos="PROPN">TEXT</w>
		<w id="w-00016" pos="CCONJ">UND</w>
		<w id="w-00017" pos="PROPN">XXI</w>
		<w id="w-00018" pos="PROPN">TAFELN</w>
		<w id="w-00019" pos="PUNCT">.</w>
	</s>
	<s id="s-00002">
		<w id="w-00020" pos="PROPN">LEIPZIG</w>
		<w id="w-00021" pos="PUNCT">,</w>
		<w id="w-00022" pos="NOUN">VERLAG</w>
		<w id="w-00023" pos="PROPN">VON</w>
		<w id="w-00024"

* [F-Strings](https://cis.bentley.edu/sandbox/wp-content/uploads/Documentation-on-f-strings.pdf)
* [Spacy](https://spacy.io/)
* [DTA](https://www.deutschestextarchiv.de/)
* [TEI](https://github.com/TEIC/TEI)