In [1]:
import re
import os
from glob import glob
import pywebanno
from importlib import reload
from lxml import etree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from NBAdvancementBar import log_progress
from idai_journals.nlp import tagDAI, tagDAINer, recognizeLang
from idai_journals.utils import reg_tok
from collections import namedtuple, OrderedDict
import requests

ModuleNotFoundError: No module named 'NBAdvancementBar'

In [2]:
#sklearn imports
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



# Introduction

In this notebook we illustrate the process to train, apply and test a CRF classifier for the . Our goal is to recognize first, then link to a knowledge base, a series of named-entities in a corpus of letters between archaeologists; the target NEs include:

* persons
* places
* institutions
* archaeological objects, monuments and artifacts
* dates and time expressions

# Pre-processing

## Collecting the training data

We get our training data (manually annotated and reviewed) from the Webanno instance running on [nlp.dainst.org](http://nlp.dainst.org:18080/webanno/).

We make use of he (experimental) Remote API of Webanno in order to extract the annotation. The annotated texts will then require a lot of pre-processing before they could be fed to the training functions defined below.

In Webanno, the project we're working on is the `Gelehrtekorrespondenz_Test`, which has the ID 3

In [2]:
proj_id = '3'

In [4]:
doc_ids = pywebanno.getDocIds(proj_id)

In [6]:
tsv = pywebanno.getAnnotation(proj_id, 461, "ctsv3")
tsv_2 = pywebanno.getAnnotation(proj_id, 373, "ctsv3")

In [15]:
#check the annotation status
a = pywebanno.listAnnotations(proj_id, 461, ("francesco", "sammathnaur"))
print(a[0]["state"])

FINISHED


Some of the documents have nested annotations at the same layer, which results in export errors from the API. Let's get a list of those files. We'll fix that later:

In [17]:
err = []
for k in log_progress( doc_ids.keys()):
    a = pywebanno.getAnnotation(proj_id, k, "ctsv3")
    if a.status_code == 500:
        err.append(k) 

In [65]:
print("The number of files with nested annotation is: {}".format(len(err)))

The number of files with nested annotation is: 59


Now we create a simple function to split the texts in lines

In [5]:
def splitInLines(tsv_in):
    lines = []
    for l in tsv_in.split("\n"):
        l = l.strip()
        l = l.split("\t")
        if len(l) > 1:
            lines.append(l)
    return lines

Let us now work with a sample file; we'll set up the main loop later

In [338]:
lines = splitInLines(tsv.text)

## Preprocessing Steps

### Add the POS feature

In [6]:
def addPos(lines, lang="de"):
    toks = tagDAI([t[2] for t in lines], lang="de")
    assert len(toks) == len(lines), "mismatch between lines of files and tokens"
    for l, t in zip(lines, toks):
        l.append(t[1])
        l.append(t[2])
    return lines

In [423]:
lines = splitInLines(tsv.text)

In [388]:
tagged_lines = addPos(lines)

In [389]:
tagged_lines[:10]

[['1-1', '0-3', '687', 'HEAD[1]', '_', '_', 'CARD', '687'],
 ['1-2', '4-9', 'Braun', 'HEAD[1]', '_', '_', 'NN', 'Braun'],
 ['1-3', '10-12', 'an', 'HEAD[1]', '_', '_', 'APPR', 'an'],
 ['1-4', '13-20', 'Gerhard', 'HEAD[1]', '_', '_', 'NE', 'Gerhard'],
 ['1-5', '21-24', 'Rom', 'HEAD[1]', '_', '_', 'NE', 'Rom'],
 ['1-6', '24-25', ',', 'HEAD[1]', '_', '_', '$,', ','],
 ['1-7', '26-28', '28', 'HEAD[1]', '_', '_', 'CARD', '28'],
 ['1-8', '28-29', '.', 'HEAD[1]', '_', '_', '$.', '.'],
 ['2-1', '30-37', 'Oktober', 'HEAD[1]', '_', '_', 'NN', 'Oktober'],
 ['2-2', '38-42', '1834', 'HEAD[1]', '_', '_', 'CARD', '@card@']]

### Fix the IOB

In [7]:
def tsv2iob(lines):
    """
    Quick and dirty function to convert WebAnno's TSV into IOB (2 columns, tab-separated).
    """
    
    def normalize_ne_tag(tag):
        """
        WebAnno's TSV format uses square brackets to represent entities
        made up of several tokens. This functions returns just the entity
        tag.
        """
        return tag.split("[")[0] if "[" in tag else tag
    
    Token  = namedtuple('Token',["sentence_n", "token_n", "offsets", "surface","lemma", "pos", "text_layer",
                                "entity_id", "ne"])
    
    
    
    tokens = []
    
    for line in lines:
        sentence_number = int(line[0].split("-")[0])
        token_number = int(line[0].split("-")[1])
        offsets = line[1]
        surface = line[2]
        pos = line[6]
        ne = line[5]
        lemma = line[7]
        text_layer = line[3]
        entity_id = line[4]
        ne = ne if ne!="_" else "O"
        tokens.append(Token(sentence_n=sentence_number
                            , token_n=token_number
                           , offsets=offsets
                           , surface=surface
                           , lemma = lemma
                           , pos =pos
                           , text_layer=text_layer
                           , entity_id = entity_id
                           , ne=ne))
        
    sentences = OrderedDict()
    for n,token in enumerate(tokens):

        # if sentence not yet added, add it
        if not token.sentence_n in sentences:
                sentences[token.sentence_n] = []

        prev_token = tokens[n-1]
        
        if token.ne!="O" and prev_token.ne=="O":
            ne = "B-%s" % normalize_ne_tag(token.ne)
        
        elif token.ne!="O" and prev_token.ne !="O" and token.ne != prev_token.ne:
            ne = "B-%s" % normalize_ne_tag(token.ne)
        
        elif token.ne!="O" and prev_token.ne !="O" and token.ne == prev_token.ne:
            ne = "I-%s" % normalize_ne_tag(token.ne)
        
        elif token.ne!="O" and token.ne == prev_token.ne:
            ne = "I-%s" % normalize_ne_tag(token.ne)
        
        else:
            ne = normalize_ne_tag(token.ne)
        
        sentences[token.sentence_n].append([token.surface, token.pos, token.lemma, token.text_layer, ne, token.entity_id])
    
    # transform the dictionary of sentences into a list of lists   
    iob = [sentences[key] for key in sentences.keys()]
    return iob
    

In [390]:
iob = tsv2iob(tagged_lines)

### Fix the Labels

In [8]:
lab_map = {'ANN': 'ANN',
 '_' : "_",
  "O" : "O",         
 'HEAD': 'HEAD',
 'LIT': 'LIT',
 'LOC': 'LOC',
 'MISC': 'MISC',
 'OBJ': 'OBJ',
 'ORG': 'ORG',
 'PER': 'PER',
 'TIME': 'TIME',
 'building': 'OBJbuilding',
 'date-answered': 'DATEanswered',
 'date-mentioned': 'DATEmentioned',
 'date-recieved': 'DATErecieved',
 'individual-object': 'OBJindividual',
 'letter-date': 'DATEletter',
 'multipart-monument': 'OBJmultipartmonument',
 'org-addressee': 'ORGaddressee',
 'org-author': 'ORGauthor',
 'org-mentioned': 'ORGmentioned',
 'part-of-building': 'OBJpartofbuild',
 'per-addressee': 'PERaddressee',
 'per-author': 'PERauthor',
 'per-mentioned': 'PERmentioned',
 'place-from': 'PLACEfrom',
 'place-mentioned': 'PLACEmentioned',
 'place-to': 'PLACEto',
 'post-stamp': 'DATEpoststamp',
 'topography': 'OBJtopography'
}

In [9]:
def fixLab(lab):
    reg = re.compile(r'^([IB])-(\w+(-\w*){,2})')
    m = reg.search(lab)
    if m:
        pre,tag = (m.group(1), m.group(2))
        s = "{}-{}".format(pre,reg.sub(lab_map[tag], lab))
        return s
    else:
        return lab

In [394]:
for sent in iob:
    for t in sent:
        t[4] = fixLab(t[4])

## Preprocessing: main loop

We now execute the main loop over all the documents (except for those 50 or some with overlapping errors).

We start by creating an output dir and a function to define an output file name

In [27]:
os.makedirs("data/IOB", exist_ok=True)

In [137]:
os.listdir()

['data',
 'Annotation Workflow.ipynb',
 'VIAFLookup.ipynb',
 '.ipynb_checkpoints',
 'Brat2Webanno.ipynb',
 'Prepare for Brat.ipynb',
 'Conditional Random Field Model for the Correspondance.ipynb']

In [10]:
def getOutName(in_fname, root_dir):
    base = '_'.join(in_fname.split(".")[:2]) + '.iob'
    return os.path.join(root_dir, base)

We get a list of finished docs and we stored in a variable

In [139]:
doc_ids = pywebanno.getDocIds(proj_id)
a = pywebanno.listAnnotations(proj_id, 461, ("francesco", "sammathnaur"))
print(a[0]["state"])

FINISHED


In [142]:
%%time
finished_docs = []
for d in doc_ids.keys():
    try:
        s = pywebanno.listAnnotations(proj_id, d, ("francesco", "sammathnaur"))[0]["state"]
    except IndexError:
        continue
    else:
        if s == "FINISHED":
            finished_docs.append(d)

CPU times: user 522 ms, sys: 64 ms, total: 586 ms
Wall time: 6.01 s


In [143]:
len(finished_docs)

202

In [144]:
#let us also review the list of docs and good docs...
#good_doocs = [k for k in doc_ids.keys() if k not in err]
listdir = os.listdir("data/IOB")

for d in log_progress(finished_docs):
    fname = doc_ids[d]
    outname = getOutName(fname, "data/IOB")
    #don't overwite, comment out if you want to do it, instead
    if os.path.basename(outname) in listdir:
        print("already in outdir")
        continue
    #get annotation
    tsv = pywebanno.getAnnotation(3, d, "ctsv3")
    if tsv.status_code == 404:
        tsv = pywebanno.getDocument(3, d, "ctsv3")
    if tsv.status_code == 500:
        continue
    #split lines
    lines = splitInLines(tsv.text)
    #add pos
    tagged_lines = addPos(lines)
    #fix IOB
    try:
        iob = tsv2iob(tagged_lines)
    except ValueError:
        print(d,fname, "dot in token num")
        continue
    #fix labels while writing the conll
    with open(outname, "w") as out:
        for sent in iob:
            for t in sent:
                t[4] = fixLab(t[4])
                #final touches
                ##we also get rid of the index in the text_annotation layer
                t[3] = t[3].split("[")[0]
                
                #write conll
                out.write("\t".join(t) + "\n")
            out.write("\n")

already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in outdir
already in

In [55]:
l = os.listdir("data/IOB")

In [56]:
len(l)

186

In [87]:
iob = tsv2iob(tagged_lines)

# Postprocessing 

I'd like to script a way to get rid of the false sentence ends. A false sentence break is one that separates two named entities. There *should* be an easy way to do it. If there's a B-NamedEntity followed by a sentence break and then a I-NamedEntity (of the same type as the B-NE, but that is redundant information), then *that* is a phoney sentence end!

Let us try to implement this

In [146]:
with open("data/IOB/1_Braun_an_Gerhard1832-35_page001.iob") as f:
    iob = f.readlines()

In [11]:
def delFalseSBreaks(filepath):
    with open(filepath) as f:
        iob = f.readlines()
    to_del = []
    for i,line in enumerate(iob):
        t = line.split("\t")
        #reg = re.compile(r'^[IB]-(\w+)$')
        if len(t) == 6 and i + 2 < len(iob):
            #m = reg.search(t[3])
            if t[0] == "." and t[4] != 'O':
                if iob[i+1] == "\n":
                    if iob[i+2].split("\t")[3] == t[3].replace("B-", "I-"):
                        to_del.append(i+1)
    return to_del

In [182]:
iobs = glob("data/IOB/*.iob")
len(iobs)

186

In [183]:
delFalseSBreaks("data/IOB/1_Braun_an_Gerhard1832-35_page062.iob")

[]

In [184]:
for iob in iobs:
    to_del = delFalseSBreaks(iob)
    #assert all(x == '\n' for x in to_del), "there is something wrong in file: {}".format(i)
    with open(iob) as f:
        lines = f.readlines()
    assert all(x == '\n' for x in [lines[i] for i in to_del]), "not all the lines are sentence breaks"
    with open(iob, "w") as out:
        for i,l in enumerate(lines):
            if i not in to_del:
                out.write(l)

In [185]:
with open(iob) as f:
    lines = f.readlines()
[lines[i] for i in to_del]

[]

# Reading the corpus

## An ad hoc CorpuReader

We will basically rely on NLTK's ConllCorpusReader, but we'll introduce a couple of modifications in order to account for the special column types and to get the full list of features in CoNLL style

In [12]:
from nltk.corpus.reader import ConllCorpusReader
from nltk.util import LazyMap, LazyConcatenation

class KorrIOBCorpusReader(ConllCorpusReader):
    
    def __init__(self, root, fileids, columntypes):
        super().__init__(root, fileids, [c for c in columntypes if c not in ["textlayer", "entityid", "lemma"]])
        self.TEXTLAYER = "textlayer"
        self.ENTITYID="entityid"
        self.LEMMA="lemma"
        self.COLUMN_TYPES = (self.WORDS, self.LEMMA, self.POS, self.TREE, self.CHUNK, self.NE, self.SRL, 
                             self.IGNORE, self.TEXTLAYER, self.ENTITYID)
        self._colmap = dict((c,i) for (i,c) in enumerate(columntypes))
        
    def full_tagged_words(self, fileids=None, tagset=None):
        #self._require(self.WORDS, self.POS, self.TEXTLAYER, self.CHUNK, self.LEMMA)#, self.ENTITYID)
        def get_tagged_words(grid):
            return self._get_full_tagged_words(grid, tagset)
        return LazyConcatenation(LazyMap(get_tagged_words,
                                         self._grids(fileids)))

    def full_tagged_sents(self, fileids=None, tagset=None):
        #self._require(self.WORDS, self.POS)
        def get_tagged_words(grid):
            return self._get_full_tagged_words(grid, tagset)
        return LazyMap(get_tagged_words, self._grids(fileids))

    def _get_full_tagged_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(
                self._get_column(grid, self._colmap['words']),
                pos_tags,
                self._get_column(grid, self._colmap['lemma']),
                self._get_column(grid, self._colmap['textlayer']),
                self._get_column(grid, self._colmap['chunk'])#,
#                self._get_column(grid, self._colmap['entityid'])
                ))
                

In [68]:
newcols = ["words", "pos", "lemma", "textlayer", "chunk", "entityid"]
newcorpus = KorrIOBCorpusReader("data/IOB", r".*\.iob", columntypes=newcols)

In [69]:
newcorpus.full_tagged_words()[:9]

[('(', '$(', '(', 'HEAD', 'O'),
 (',', '$,', ',', 'HEAD', 'O'),
 ('Braun', 'NE', 'Braun', 'HEAD', 'B-PERauthor'),
 ('an', 'APPR', 'an', 'HEAD', 'O'),
 ('Gerhard', 'NE', 'Gerhard', 'HEAD', 'B-PERaddressee'),
 ('Dresden', 'NE', 'Dresden', 'HEAD', 'B-PLACEfrom'),
 (',', '$,', ',', 'HEAD', 'O'),
 ('10', 'CARD', '10', 'HEAD', 'B-DATEletter'),
 ('.', '$.', '.', 'HEAD', 'I-DATEletter')]

In [70]:
newcorpus.full_tagged_sents()[0]

[('(', '$(', '(', 'HEAD', 'O'),
 (',', '$,', ',', 'HEAD', 'O'),
 ('Braun', 'NE', 'Braun', 'HEAD', 'B-PERauthor'),
 ('an', 'APPR', 'an', 'HEAD', 'O'),
 ('Gerhard', 'NE', 'Gerhard', 'HEAD', 'B-PERaddressee'),
 ('Dresden', 'NE', 'Dresden', 'HEAD', 'B-PLACEfrom'),
 (',', '$,', ',', 'HEAD', 'O'),
 ('10', 'CARD', '10', 'HEAD', 'B-DATEletter'),
 ('.', '$.', '.', 'HEAD', 'I-DATEletter')]

In [71]:
len(newcorpus.words())

38086

In [67]:
!grep -rnw 'data/IOB/' -e 'AdI 1833'

data/IOB/1_Braun_an_Gerhard1832-35_page143.iob:211:3te	ADJA	<unknown>	_	B-LIT	AdI 1833[9]
data/IOB/1_Braun_an_Gerhard1832-35_page143.iob:212:Heft	NN	Heft	_	I-LIT	AdI 1833[9]
data/IOB/1_Braun_an_Gerhard1832-35_page143.iob:213:der	ART	die	_	I-LIT	AdI 1833[9]
data/IOB/1_Braun_an_Gerhard1832-35_page143.iob:214:Annalen	NN	Annalen	_	I-LIT	AdI 1833[9]
data/IOB/1_Braun_an_Gerhard1832-35_page143.iob:215:33	CARD	33	_	I-LIT	AdI 1833[9]


## Training-Test split

In [13]:
def splitTrainTest(iob_corpus, perc_test=0.2):
    import random
    import numpy as np
    
    assert 0 < perc_test < 1, "The test percentage must be a value between 0 and 1"
    c = 0
    max_test_len = int(np.ceil(len(iob_corpus.words()) * perc_test))
    corpus_train, corpus_test, feature_train, feature_test = ([],[],[],[])
    sents = list(iob_corpus.full_tagged_sents())
    random.shuffle(sents)
    for sent in sents:
        if c < max_test_len:
            corpus_test.append(sent)
            c = c + len(sent)
        else:
            corpus_train.append(sent)
    return corpus_train, corpus_test#, feature_train, feature_test

In [14]:
def countSentenceTokens(sentence_list):
    return len([t for s in sentence_list for t in s])

In [74]:
train, test = splitTrainTest(newcorpus)

In [75]:
countSentenceTokens(train)

30464

# CRF model

## Feature selection 

Let's start with a list of features that we may include:
* word

In [15]:
def getPostagChain(sent, i, direction="forward"):
    fwd = [i,i+1,i+2]
    bck = [i-2,i-1,i]
    li = fwd if direction == "forward" else bck
    tagchain = ''
    for l in li:
        if l < 0:
            tagchain = tagchain + "-0"
        else:
            try:
                tagchain = tagchain + "-{}".format(sent[l][1])
            except IndexError:
                tagchain = tagchain + "-0"
    return tagchain.lstrip("-")

In [16]:
def endsWithDigit(s):
    import re
    return True if re.search(r'\d+$', s) else False

In [17]:
def hasDigit(s):
    return any(i.isdigit() for i in s)

In [18]:
def isInDic(li, v):
    return v in li

In [19]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    lemma = sent[i][2]
    with open("data/dictionaries/persons.txt") as f:
        persons_dic = f.read().split("\n")
    with open("data/dictionaries/places.txt") as f:
        places_dic = f.read().split("\n")
    
    features = {
        #'bias': 1.0, #?
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        "word.hasdigit()" :  hasDigit(word),
        #suffix
        'word[-4:]': word[-4:],
        'word[-2:]': word[-2:],
        #prefix
        'word[:5]' : word[:5],
        'word[:3]' : word[:3],
        'word.endswithdigit()' : endsWithDigit(word),
        'word.isInPersonDic' : isInDic(persons_dic, word),
        'word.isInPlacesDic' : isInDic(places_dic, word),
        'word.isHeader' : False if sent[i][3] == '_' else True,
        'postag': postag,
        'lemma' : lemma,
        #'postag[:2]': postag[:2],
        'postag_chain_fwd' : getPostagChain(sent, i),
        'postag_chain_bck' : getPostagChain(sent, i, "backward")
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        lemma1 = sent[i-1][2]
        #features['postag_chain'] = postag1 + "-" + features['postag_chain']
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word[-4:]': word1[-4:],
            '-1:word[-2:]': word1[-2:],            
            '-1:word.isInPersonDic' : isInDic(persons_dic, word1),
            '-1:word.isInPlacesDic' : isInDic(places_dic, word1),
            '-1:postag': postag1,
            '-1:lemma' : lemma1,
            'isPrevBigramInPlacesDic': isInDic(places_dic, " ".join([word1, word]))
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        lemma1 = sent[i+1][2]
        #features['postag_chain'] = features['postag_chain'] + "-" + postag1
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word[-4:]': word1[-4:],
            '+1:word[-2:]': word1[-2:],
            '+1:word.isInPersonDic' : isInDic(persons_dic, word1),
            '+1:word.isInPlacesDic' : isInDic(places_dic, word1),
            '+1:postag': postag1,
            '+1:lemma' : lemma1,
            'isFollwBigramInPlacesDic': isInDic(places_dic, " ".join([word, word1]))
            #'+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

In [20]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, lemma, text_layer, label in sent]

def sent2tokens(sent):
    return [token for token, postag, lemma, text_layer, label in sent]

In [86]:
%%time
sent2features(train[0])[17]
#sent2tokens(train[0])[0]

CPU times: user 51 ms, sys: 216 µs, total: 51.3 ms
Wall time: 49.9 ms


{'+1:lemma': 'mehr',
 '+1:postag': 'ADV',
 '+1:word.isInPersonDic': False,
 '+1:word.isInPlacesDic': False,
 '+1:word.isdigit()': False,
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'mehr',
 '+1:word[-2:]': 'hr',
 '+1:word[-4:]': 'mehr',
 '-1:lemma': 'ich',
 '-1:postag': 'PPER',
 '-1:word.isInPersonDic': False,
 '-1:word.isInPlacesDic': False,
 '-1:word.isdigit()': False,
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.lower()': 'ich',
 '-1:word[-2:]': 'ch',
 '-1:word[-4:]': 'ich',
 'isFollwBigramInPlacesDic': False,
 'isPrevBigramInPlacesDic': False,
 'lemma': 'Sie|sie',
 'postag': 'PPER',
 'postag_chain_bck': 'VVFIN-PPER-PPER',
 'postag_chain_fwd': 'PPER-ADV-KOKOM',
 'word.endswithdigit()': False,
 'word.hasdigit()': False,
 'word.isHeader': False,
 'word.isInPersonDic': False,
 'word.isInPlacesDic': False,
 'word.isdigit()': False,
 'word.istitle()': True,
 'word.isupper()': False,
 'word.lower()': 'sie',
 'word[-2:]': 'ie',
 'wor

## Fitting the Model

In [99]:
%%time
X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in test]
y_test = [sent2labels(s) for s in test]

CPU times: user 29.6 s, sys: 835 ms, total: 30.4 s
Wall time: 30.7 s


In [100]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [101]:
%%time
crf.fit(X_train, y_train)

CPU times: user 42.2 s, sys: 5.81 ms, total: 42.2 s
Wall time: 42.3 s


CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

## Model evaluation

In [103]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-PERmentioned',
 'B-ORGmentioned',
 'B-PLACEmentioned',
 'B-LIT',
 'B-OBJindividual',
 'I-OBJindividual',
 'I-PERmentioned',
 'I-LIT',
 'I-DATEmentioned',
 'B-PERauthor',
 'B-PERaddressee',
 'B-PLACEfrom',
 'B-DATEletter',
 'I-DATEletter',
 'B-MISC',
 'I-MISC',
 'B-OBJmultipartmonument',
 'B-DATEmentioned',
 'I-ORGmentioned',
 'I-PERaddressee',
 'B-OBJtopography',
 'I-OBJtopography',
 'I-PERauthor',
 'I-PLACEmentioned',
 'I-DATEpoststamp',
 'B-DATEpoststamp',
 'I-PLACEfrom',
 'B-*',
 'I-*',
 'B-OBJbuilding',
 'I-OBJmultipartmonument',
 'I-OBJbuilding',
 'B-OBJpartofbuild']

In [104]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.71028713112264841

Inspect the classes

In [105]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                        precision    recall  f1-score   support

                   B-*      1.000     0.833     0.909         6
                   I-*      1.000     0.500     0.667         2
          B-DATEletter      1.000     1.000     1.000        49
          I-DATEletter      0.984     1.000     0.992       126
       B-DATEmentioned      0.750     0.214     0.333        14
       I-DATEmentioned      0.600     0.300     0.400        30
       B-DATEpoststamp      0.000     0.000     0.000         0
       I-DATEpoststamp      0.000     0.000     0.000         0
                 B-LIT      0.643     0.209     0.316        43
                 I-LIT      0.480     0.203     0.286        59
                B-MISC      1.000     0.167     0.286         6
                I-MISC      0.000     0.000     0.000         0
         B-OBJbuilding      0.000     0.000     0.000         0
         I-OBJbuilding      0.000     0.000     0.000         0
       B-OBJindividual      0.812     0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Hyperparameter optimization

In [106]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'reca

CPU times: user 1min 18s, sys: 2.31 s, total: 1min 21s
Wall time: 6min 5s


In [107]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.05839310106111339, 'c2': 0.008184205122822818}
best CV score: 0.6701704845609529
model size: 0.81M


In [109]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                        precision    recall  f1-score   support

                   B-*      1.000     0.833     0.909         6
                   I-*      1.000     0.500     0.667         2
          B-DATEletter      1.000     1.000     1.000        49
          I-DATEletter      0.969     1.000     0.984       126
       B-DATEmentioned      0.800     0.286     0.421        14
       I-DATEmentioned      0.625     0.333     0.435        30
       B-DATEpoststamp      0.000     0.000     0.000         0
       I-DATEpoststamp      0.000     0.000     0.000         0
                 B-LIT      0.636     0.326     0.431        43
                 I-LIT      0.500     0.305     0.379        59
                B-MISC      1.000     0.167     0.286         6
                I-MISC      0.000     0.000     0.000         0
         B-OBJbuilding      0.000     0.000     0.000         0
         I-OBJbuilding      0.000     0.000     0.000         0
       B-OBJindividual      0.867     0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### What the classifier learned

In [110]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
I-DATEmentioned -> I-DATEmentioned 7.009593
B-OBJtopography -> I-OBJtopography 6.890013
I-OBJtopography -> I-OBJtopography 6.774018
I-PLACEmentioned -> I-PLACEmentioned 6.730179
O      -> O       6.563014
I-MISC -> I-MISC  6.347162
I-OBJmultipartmonument -> I-OBJmultipartmonument 6.320063
I-ORGmentioned -> I-ORGmentioned 6.223036
I-LIT  -> I-LIT   6.187674
B-DATEletter -> I-DATEletter 6.109272
I-OBJindividual -> I-OBJindividual 5.728987
B-OBJindividual -> I-OBJindividual 5.720990
I-DATEpoststamp -> I-DATEpoststamp 5.659648
B-DATEmentioned -> I-DATEmentioned 5.496451
I-DATEletter -> I-DATEletter 5.490149
B-ORGmentioned -> I-ORGmentioned 5.485261
B-PLACEmentioned -> I-PLACEmentioned 4.986394
I-PERmentioned -> I-PERmentioned 4.983721
B-LIT  -> I-LIT   4.706322
B-PERmentioned -> I-PERmentioned 4.653803

Top unlikely transitions:
B-DATEletter -> O       -1.529994
O      -> I-PLACEmentioned -1.557958
B-PERauthor -> B-PERauthor -1.623163
B-PERmentioned -> I-ORGmentione

In [111]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
6.613460 B-OBJindividual word[-4:]:vase
6.456555 B-*      -1:postag:_
6.391896 B-ORGmentioned postag_chain_bck:VVINF-KON-NN
5.905394 B-PLACEmentioned word.isInPlacesDic
5.609687 O        postag:_
5.148770 O        BOS
4.410498 B-PERmentioned word[:3]:Bel
4.369584 B-ORGmentioned lemma:Institut
4.348198 I-OBJindividual -1:word.lower():nr
4.281120 B-ORGmentioned word[:3]:Gov
4.174359 B-PERmentioned word[:5]:Silli
4.129707 O        postag_chain_fwd:NE-VAFIN-ADV
3.722150 B-PLACEfrom word.isInPlacesDic
3.640489 B-ORGmentioned word[-2:]:ät
3.608367 B-*      lemma:NE
3.607903 O        postag:PPER
3.594653 O        postag_chain_fwd:NE-$,-XY
3.583790 B-PLACEmentioned word[:5]:Capit
3.545907 B-PERmentioned word[-2:]:ws
3.522069 B-DATEletter postag_chain_fwd:VVFIN-CARD-$.
3.478186 B-LIT    -1:word.lower():müllers
3.435822 O        postag_chain_bck:APPRART-NN-$.
3.433746 B-OBJindividual word[-4:]:eler
3.412532 O        postag_chain_fwd:NN-ADV-NN
3.395586 B-LIT    word[-4:]:hnis
3.3773

# Scrapbook

In [100]:
i = 2
l = ['a','b','c']
l[i]

'c'

In [103]:
" ".join(l[i:i+2])

'c'

In [63]:
x = pywebanno.getXMI(3,260,"francesco")

In [65]:
from lxml import etree

In [61]:
r = changeAnnotation(226)

Braun in the HEAD
Gerhard in the HEAD
Rom or Dresden in the HEAD


In [62]:
r.text

'Text of annotation document does not match text of source document at offset [33]. Expected [ März 1835 - 4 - bed] but found [\nMärz 1835 - 4 - bed].'

In [40]:
l = tsv.text.split("\n")[9].split("\t")
l

['1-4', '13-20', 'Gerhard', 'HEAD[1]', '_', '_', '']

In [42]:
print("\n".join(["puppa", "", "tu ma'"]))

puppa

tu ma'


In [5]:
doc_ids

{225: '1.Braun_an_Gerhard1832-35_page189.iob-doc-1.ann.xmi',
 226: '1.Braun_an_Gerhard1832-35_page190.iob-doc-1.ann.xmi',
 227: '1.Braun_an_Gerhard1832-35_page191.iob-doc-1.ann.xmi',
 228: '1.Braun_an_Gerhard1832-35_page192.iob-doc-1.ann.xmi',
 229: '1.Braun_an_Gerhard1832-35_page193.iob-doc-1.ann.xmi',
 230: '1.Braun_an_Gerhard1832-35_page194.iob-doc-1.ann.xmi',
 231: '1.Braun_an_Gerhard1832-35_page195.iob-doc-1.ann.xmi',
 232: '1.Braun_an_Gerhard1832-35_page196.iob-doc-1.ann.xmi',
 233: '1.Braun_an_Gerhard1832-35_page197.iob-doc-1.ann.xmi',
 234: '1.Braun_an_Gerhard1832-35_page198.iob-doc-1.ann.xmi',
 235: '1.Braun_an_Gerhard1832-35_page199.iob-doc-1.ann.xmi',
 236: '1.Braun_an_Gerhard1832-35_page200.iob-doc-1.ann.xmi',
 237: '1.Braun_an_Gerhard1832-35_page201.iob-doc-1.ann.xmi',
 238: '1.Braun_an_Gerhard1832-35_page202.iob-doc-1.ann.xmi',
 239: '1.Braun_an_Gerhard1832-35_page203.iob-doc-1.ann.xmi',
 240: '1.Braun_an_Gerhard1832-35_page204.iob-doc-1.ann.xmi',
 241: '1.Braun_an_Gerhar

In [96]:
sents[0][1]

('Braun', 'NN', 'HEAD', 'O', '_')

In [34]:
int(np.ceil(2.))

2

In [35]:
int(np.ceil(len(newcorpus.words()) * 0.2))

6114

In [24]:
0 < 0.2 < 1

True

In [136]:
tag = t[3]
tag = "B-PERmentioned"

In [137]:
tag.replace("B-", "I-")

'I-PERmentioned'

In [91]:
a = -1
2 + a

1

In [126]:
reg = re.compile(r'^[IB]-(\w+)$')

In [128]:
t = iob[90].split("\t")
t

['.', '$.', '_', 'I-PERmentioned', '118717030[7]\n']

In [123]:
iob[123 + 2]

'\n'

In [29]:
e = doc_ids[good_doocs[0]]
e

'1.Braun_an_Gerhard1832-35_page189.iob-doc-1.ann.xmi'

In [44]:
getOutName(e, "data/IOB")

'data/IOB/1_Braun_an_Gerhard1832-35_page189.iob'

In [45]:
iob[1]

[['März', 'NN', 'HEAD[1]', 'O', '_'],
 ['183', 'ADJA', 'HEAD[1]', 'O', '_'],
 ['~', 'XY', 'HEAD[1]', 'O', '_'],
 ['Dresden', 'NE', 'HEAD[1]', 'B-PLACEfrom', '2044633'],
 ['10', 'CARD', 'HEAD[1]', 'B-DATEletter', '*[2]'],
 ['März', 'NN', 'HEAD[1]', 'I-DATEletter', '*[2]'],
 ['1832', 'CARD', 'HEAD[1]', 'I-DATEletter', '*[2]'],
 ['Mein', 'PPOSAT', '_', 'O', '_'],
 ['freundlichster', 'ADJA', '_', 'O', '_'],
 ['Herr', 'NN', '_', 'B-PERaddressee', '118717030[3]'],
 ['Professor', 'NN', '_', 'I-PERaddressee', '118717030[3]'],
 ['Die', 'ART', '_', 'O', '_'],
 ['Wohlthat', 'NE', '_', 'O', '_'],
 ['und', 'KON', '_', 'O', '_'],
 ['Annehmlichkeit', 'NN', '_', 'O', '_'],
 ['in', 'APPR', '_', 'O', '_'],
 ['Ihrer', 'PPOSAT', '_', 'O', '_'],
 ['nächsten', 'ADJA', '_', 'O', '_'],
 ['Nähe', 'NN', '_', 'O', '_'],
 ['zu', 'APPR', '_', 'O', '_'],
 ['logiren', 'NN', '_', 'O', '_'],
 ['ist', 'VAFIN', '_', 'O', '_'],
 ['mir', 'PPER', '_', 'O', '_'],
 ['zu', 'PTKA', '_', 'O', '_'],
 ['werth', 'ADJD', '_', 'O', 

In [46]:
lines[1]

['1-2', '1-2', ',', 'HEAD[1]', '_', '_', '']

In [90]:
for t in iob[1]:
    m = reg.search(t[3])
    if m:
        pre,tag = (m.group(1), m.group(2))
        s = "{}-{}".format(pre,reg.sub(lab_map[tag], t[3]))
        print(s)
    else:
        print(t[3])

O
O
O
B-PLACEfrom
B-DATEletter
I-DATEletter
I-DATEletter
O
O
B-PERaddressee
I-PERaddressee
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O


In [86]:
iob[1]

[['März', 'NN', 'HEAD[1]', 'O', '_'],
 ['183', 'ADJA', 'HEAD[1]', 'O', '_'],
 ['~', 'XY', 'HEAD[1]', 'O', '_'],
 ['Dresden', 'NE', 'HEAD[1]', 'B-place-from', '2044633'],
 ['10', 'CARD', 'HEAD[1]', 'B-letter-date', '*[2]'],
 ['März', 'NN', 'HEAD[1]', 'I-letter-date', '*[2]'],
 ['1832', 'CARD', 'HEAD[1]', 'I-letter-date', '*[2]'],
 ['Mein', 'PPOSAT', '_', 'O', '_'],
 ['freundlichster', 'ADJA', '_', 'O', '_'],
 ['Herr', 'NN', '_', 'B-per-addressee', '118717030[3]'],
 ['Professor', 'NN', '_', 'I-per-addressee', '118717030[3]'],
 ['Die', 'ART', '_', 'O', '_'],
 ['Wohlthat', 'NE', '_', 'O', '_'],
 ['und', 'KON', '_', 'O', '_'],
 ['Annehmlichkeit', 'NN', '_', 'O', '_'],
 ['in', 'APPR', '_', 'O', '_'],
 ['Ihrer', 'PPOSAT', '_', 'O', '_'],
 ['nächsten', 'ADJA', '_', 'O', '_'],
 ['Nähe', 'NN', '_', 'O', '_'],
 ['zu', 'APPR', '_', 'O', '_'],
 ['logiren', 'NN', '_', 'O', '_'],
 ['ist', 'VAFIN', '_', 'O', '_'],
 ['mir', 'PPER', '_', 'O', '_'],
 ['zu', 'PTKA', '_', 'O', '_'],
 ['werth', 'ADJD', '_',

In [74]:
m=reg.search("B-per-mentioned")
m

<_sre.SRE_Match object; span=(0, 15), match='B-per-mentioned'>

In [80]:
m.group(2)

'-mentioned'

In [17]:
lines = splitInLines(tsv.text)

In [18]:
lines[:10]

[['1-1', '0-1', '(', 'HEAD[1]', '_', '_', ''],
 ['1-2', '1-2', ',', 'HEAD[1]', '_', '_', ''],
 ['1-3', '3-8', 'Braun', 'HEAD[1]', '_', '_', ''],
 ['1-4', '9-11', 'an', 'HEAD[1]', '_', '_', ''],
 ['1-5', '12-19', 'Gerhard', 'HEAD[1]', '_', '_', ''],
 ['1-6', '20-27', 'Dresden', 'HEAD[1]', '_', '_', ''],
 ['1-7', '27-28', ',', 'HEAD[1]', '_', '_', ''],
 ['1-8', '29-31', '10', 'HEAD[1]', '_', '_', ''],
 ['1-9', '31-32', '.', 'HEAD[1]', '_', '_', ''],
 ['2-1', '33-37', 'März', 'HEAD[1]', '_', '_', '']]

In [19]:
i = tsv2iob(lines)

In [22]:
i[1]

[['März', '', 'HEAD[1]', 'O', '_'],
 ['183', '', 'HEAD[1]', 'O', '_'],
 ['~', '', 'HEAD[1]', 'O', '_'],
 ['Dresden', '', 'HEAD[1]', 'B-place-from', '2044633'],
 ['10', '', 'HEAD[1]', 'B-letter-date', '*[2]'],
 ['März', '', 'HEAD[1]', 'I-letter-date', '*[2]'],
 ['1832', '', 'HEAD[1]', 'I-letter-date', '*[2]'],
 ['Mein', '', '_', 'O', '_'],
 ['freundlichster', '', '_', 'O', '_'],
 ['Herr', '', '_', 'B-per-addressee', '118717030[3]'],
 ['Professor', '', '_', 'I-per-addressee', '118717030[3]'],
 ['Die', '', '_', 'O', '_'],
 ['Wohlthat', '', '_', 'O', '_'],
 ['und', '', '_', 'O', '_'],
 ['Annehmlichkeit', '', '_', 'O', '_'],
 ['in', '', '_', 'O', '_'],
 ['Ihrer', '', '_', 'O', '_'],
 ['nächsten', '', '_', 'O', '_'],
 ['Nähe', '', '_', 'O', '_'],
 ['zu', '', '_', 'O', '_'],
 ['logiren', '', '_', 'O', '_'],
 ['ist', '', '_', 'O', '_'],
 ['mir', '', '_', 'O', '_'],
 ['zu', '', '_', 'O', '_'],
 ['werth', '', '_', 'O', '_'],
 ['und', '', '_', 'O', '_'],
 ['lieb', '', '_', 'O', '_'],
 [',', '', '

In [58]:
tsv.text

'Annotation for user [francesco] on document [235] in project [3] not found.'

In [187]:
!grep -rnw 'data/IOB/' -e 'B-\*'

In [205]:
from nltk.corpus.reader.util import read_blankline_block

In [224]:
import io
stream = io.StringIO('''

.	$.	.	_	O	_
.	$.	.	_	O	_
<	$(	<	_	O	_
[	$(		_	O	_
i	VVFIN	<unknown>	_	O	_
.	$.	.	_	O	_
l	NN	l	_	O	_
,	$,	,	_	O	_
"	$(	"	_	O	_
12	CARD	12	HEAD[1]	O	_
Braun	NN	Braun	HEAD[1]	O	_
an	APPR	an	HEAD[1]	O	_
Gerhard	NE	Gerhard	HEAD[1]	O	_
Rom	NE	Rom	HEAD[1]	O	_
,	$,	,	HEAD[1]	O	_
8	CARD	8	HEAD[1]	O	_
.	$.	.	HEAD[1]	O	_

''')

In [216]:
grids = []
for block in read_blankline_block(stream):
    block = block.strip()
    if not block: continue

    grid = [line.split() for line in block.split('\n')]

    # If there's a docstart row, then discard. ([xx] eventually it
    # would be good to actually use it)

    # Check that the grid is consistent.
    for row in grid:
        if len(row) != len(grid[0]):
            raise ValueError('Inconsistent number of columns:\n%s'
                             % block)
    grids.append(grid)

ValueError: Inconsistent number of columns:
.	$.	.	_	O	_
.	$.	.	_	O	_
<	$(	<	_	O	_
[	$(		_	O	_
i	VVFIN	<unknown>	_	O	_
.	$.	.	_	O	_
l	NN	l	_	O	_
,	$,	,	_	O	_
"	$(	"	_	O	_
12	CARD	12	HEAD[1]	O	_
Braun	NN	Braun	HEAD[1]	O	_
an	APPR	an	HEAD[1]	O	_
Gerhard	NE	Gerhard	HEAD[1]	O	_
Rom	NE	Rom	HEAD[1]	O	_
,	$,	,	HEAD[1]	O	_
8	CARD	8	HEAD[1]	O	_
.	$.	.	HEAD[1]	O	_

In [225]:
r = read_blankline_block(stream)

In [234]:
st = [l[0] for l in grid[:-1]]
st

['.',
 '.',
 '<',
 '[',
 'i',
 '.',
 'l',
 ',',
 '"',
 '12',
 'Braun',
 'an',
 'Gerhard',
 'Rom',
 ',',
 '8',
 '.']

In [235]:
tagDAI(st, lang="de")

[['.', '$.', '.'],
 ['.', '$.', '.'],
 ['<', '$(', '<'],
 ['[', '$(', '['],
 ['i', 'VVFIN', '<unknown>'],
 ['.', '$.', '.'],
 ['l', 'NN', 'l'],
 [',', '$,', ','],
 ['"', '$(', '"'],
 ['12', 'CARD', '12'],
 ['Braun', 'NN', 'Braun'],
 ['an', 'APPR', 'an'],
 ['Gerhard', 'NE', 'Gerhard'],
 ['Rom', 'NE', 'Rom'],
 [',', '$,', ','],
 ['8', 'CARD', '8'],
 ['.', '$.', '.']]

In [232]:
grid = [line.split() for line in block.split('\n')]
for g in grid: print(len(g))

6
6
6
5
6
6
6
6
6
6
6
6
6
6
6
6
6
0


In [6]:
postags = ["NN", "DET", "PP", "S"]
i = 0

In [86]:
def isInDic(li, v):
    return v in li

In [88]:
isInDic([1,2,3,4], 2)

True

In [117]:
import json
with open("data/objekt.json") as data_file:    
    txt = data_file.readlines()

In [127]:
objekts = json.loads("".join(txt[9:]))

In [132]:
locations_fields = ["Fundort", "Fundstaat"]

In [130]:
fields = ["GattungAllgemein", 
        "KurzbeschreibungObjekt",
        "Materialbeschreibung",
        "Material",
        "TechnikDetails",
        "Technik",
        "funktionaleVerwendung",
        "Fundkontext"
       ]

In [135]:
terms = []
locations = []
for o in objekts:
    for f in fields:
        t = o[f]
        if t != '':
            terms.append(t)
    for l in locations_fields:
        loc = o[l]
        if loc != '':
            locations.append(loc)

In [141]:
refined_terms = []
for term in terms:
    for t in term.split(";"):
        refined_terms.append(t)

In [144]:
with open("data/dictionaries/objects.txt", "w") as out:
    for t in set(refined_terms):
        out.write(t + "\n")

In [134]:
objekts[163]

{'Adressat': '',
 'AntikerAufstellungsort': '',
 'ApplizierteElemente': '',
 'Arbeitsnotiz': '',
 'Auftraggeber': '',
 'BearbeiterObjekt': 'M.L. (06.12.1994)',
 'Bearbeitungen': '',
 'BearbeitungenAntik': '',
 'BearbeitungenModern': '',
 'BerlinDatenblatt': '',
 'BerlinInstitution': None,
 'BerlinInventar': '',
 'BerlinObjekt': '',
 'BerlinObjektID': None,
 'BerlinObjektstatus': None,
 'BerlinVerantwortlich': None,
 'BreiteGesamt': '',
 'DatensatzGruppeObjekt': 'Arachne',
 'DekorAllgemein': '',
 'DurchmesserGesamt': '',
 'Erhaltung': '',
 'Erhaltungszustand': '',
 'FS_CMSNR': None,
 'FS_GruppenID_bak': None,
 'FS_Objekt_Siegel_Key': '0',
 'FS_Plomben_ID': None,
 'FS_TopographieID': None,
 'Farbreste': '',
 'FreieBeschreibung': None,
 'FunddatumObjekt': '',
 'Fundkontext': '',
 'Fundort': 'Amrit',
 'Fundstaat': '',
 'Funktion': '',
 'GattungAllgemein': 'Rundplastik',
 'GewichtGesamt': '',
 'GipsInBonn': '',
 'HerkFundKommentar': '',
 'Herkunft': '',
 'HoeheGesamt': '',
 'InschriftCorpus

In [125]:
print("".join(txt[9:15]))

[{
    "PS_ObjektID": "1",
    "FS_GruppenID_bak": null,
    "FS_TopographieID": null,
    "DatensatzGruppeObjekt": "Arachne",
    "Adressat": "",

