In [6]:
import sys
!{sys.executable} -m pip install --upgrade pip nltk
!{sys.executable} -m pip install --upgrade pip treetaggerwrapper
import nltk
import treetaggerwrapper as TTW 
import xml.etree.ElementTree as ET 

import logging 
logging.basicConfig(level=logging.DEBUG)

ns = {'tc': "http://www.dspin.de/data/textcorpus"}
urls = [
    'https://www.deutschestextarchiv.de/book/download_fulltcf/16377',
    'https://www.deutschestextarchiv.de/book/download_fulltcf/16178',
    'https://www.deutschestextarchiv.de/book/download_fulltcf/25157',
    'https://www.deutschestextarchiv.de/book/download_fulltcf/16552',
    'https://www.deutschestextarchiv.de/book/download_fulltcf/16299',
]   




In [7]:
def tags_from_tcf(root):
    token_ids = {t.attrib["ID"]: t.text for t in root.find('tc:TextCorpus', ns).find('tc:tokens', ns)}
    pos_tags = {t.attrib["tokenIDs"]: t.text for t in root.find('tc:TextCorpus', ns).find('tc:POStags', ns)}
    return [[(token_ids[id], pos_tags[id]) for id in sent.attrib['tokenIDs'].split(" ")] for sent in root.find('tc:TextCorpus', ns).find('tc:sentences', ns)]

In [8]:
def read_xml_from_url(url):
    # Get the file name of the url.
    path = urllib.parse.urlparse(url)
    _, filename = os.path.split(path.path)
    filename += ".xml"
    # Check if sentence file is cached.
    if os.path.isfile(filename):
        logging.info(f"reading from cache: {filename}")
        with open(filename) as f:
            return f.read()
    # Download file.
    logging.debug(f"downloading from url: {url}")
    with urllib.request.urlopen(url) as f:
        contents = f.read()    
    logging.info(f"caching to file: {filename}")    
    with open(filename, 'wb') as f:
        f.write(contents)
    return contents
    


In [9]:
import urllib.request

def read_sentences_from_url(url):
    # Get the file name of the url.
    path = urllib.parse.urlparse(url)
    _, filename = os.path.split(path.path)
    filename += ".sents.txt"
    # Check if sentence file is cached.
    if os.path.isfile(filename):
        print(f"reading from cache: {filename}")
        with open(filename) as f:
            # Read sentences from the cached file.
            return [[nltk.str2tuple(t) for t in sent[:-1].split(" ")] for sent in f.readlines()]

    # File is not cached; download it and cache it.
    sents = tags_from_tcf(ET.fromstring(read_xml_from_url(url)))
    print(f"caching to file: {filename}")    
    with open(filename, 'w', encoding='utf-8') as f:
        for sent in sents:
            f.write(" ".join([t[0] + "/" + t[1] for t in sent]))
            f.write("\n")
    # Return sentences.
    return sents 

In [24]:
tagged_sents = [sent for url in urls for sent in read_sentences_from_url(url)]
print("number of sentences:", len(tagged_sents))

number of sentences: 42269


In [25]:
import random
random.seed(13)
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.9)
test_sents = tagged_sents[size:]
train_sents = tagged_sents[:size]
print("test set: ", len(test_sents))
print("train set:", len(train_sents))

test set:  4227
train set: 38042


In [26]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)

0.9381128823782853

In [27]:
class LogTagger(nltk.tag.sequential.SequentialBackoffTagger):
    def __init__(self, backoff=None):
        super().__init__(backoff)

    def choose_tag(self, tokens, index, history):
        print("tokens: ", tokens)
        print("index:  ", index)
        print("history:", history)
        return None 

for sent in test_sents:
    if len(sent) == 5:
        short_sent = sent 

tx = LogTagger(backoff=t0)
tx.evaluate([short_sent])

tokens:  ['Hierzu', 'noch', 'einige', 'Erläuterungen', '.']
index:   0
history: []
tokens:  ['Hierzu', 'noch', 'einige', 'Erläuterungen', '.']
index:   1
history: ['NN']
tokens:  ['Hierzu', 'noch', 'einige', 'Erläuterungen', '.']
index:   2
history: ['NN', 'NN']
tokens:  ['Hierzu', 'noch', 'einige', 'Erläuterungen', '.']
index:   3
history: ['NN', 'NN', 'NN']
tokens:  ['Hierzu', 'noch', 'einige', 'Erläuterungen', '.']
index:   4
history: ['NN', 'NN', 'NN', 'NN']


0.2

In [19]:
def tripple2tupple(str):
    tmp = str.split("\t")
    return (tmp[0], tmp[1])

class TreeTaggerX(nltk.tag.sequential.SequentialBackoffTagger):
    def __init__(self, language, directory, backoff=None):
        super().__init__(backoff)
        self.tagger = TTW.TreeTagger(TAGLANG=language, TAGDIR=directory)
        self.tags = None

    def choose_tag(self, tokens, index, history):
        if index == 0:
            self.tags = [tripple2tupple(tripple) for tripple in self.tagger.tag_text(" ".join(tokens))]
        if index < len(self.tags):
            return self.tags[index][1]
        return None 
       
tt = TreeTaggerX('de', '../08/tree-tagger', t0)
tt.evaluate(test_sents)

ERROR:TreeTagger:Time out for TreeTagger reply.


TreeTaggerError: Time out for TreeTagger reply, enable debug / see error logs

In [29]:
class TreeTagger(nltk.tag.sequential.SequentialBackoffTagger):
    def __init__(self, language, directory, backoff=None):
        super().__init__(backoff)
        self.tagger = TTW.TreeTagger(TAGLANG=language, TAGDIR=directory, TAGOPT='-token -sgml -quiet')
        self.tags = None

    def choose_tag(self, tokens, index, history):
        if index == 0:
            self.tags = [tripple2tupple(tripple) for tripple in self.tagger.tag_text(" ".join(tokens))]
        if index < len(self.tags):
            return self.tags[index][1]
        return None
tt = TreeTagger('de', '../08/tree-tagger', t0)
tt.evaluate(test_sents)



0.7903317535545024

In [31]:
tt = TreeTagger('de', '../08/tree-tagger', t2)
tt.evaluate(test_sents)



0.7903317535545024

In [33]:
tt = TreeTagger('de', '../08/tree-tagger', t0)
t2 = nltk.BigramTagger(train_sents, backoff=tt)
t2.evaluate(test_sents)



0.8316415338216286

In [34]:
url='https://www.deutschestextarchiv.de/book/download_fulltcf/34066'
tagged_sents = read_sentences_from_url(url)

t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(tagged_sents)

0.886370569214813

In [35]:
tt = TreeTagger('de', '../08/tree-tagger', t0)
tt.evaluate(tagged_sents)



0.8629125739735221

In [38]:
url = 'https://www.deutschestextarchiv.de/book/download_fulltcf/32274'
tagged_sents = read_sentences_from_url(url)
t2.evaluate(tagged_sents)

0.9168110918544194

In [39]:
tt.evaluate(tagged_sents)

0.9410745233968805