# PROVIDEDH Collaborative platform
## Jupyter notebook

In [4]:
import re
import os
from functools import reduce
from lxml import etree as et
from lxml.etree import Element
import itertools
import tqdm
from pathlib import Path
from similarity.normalized_levenshtein import NormalizedLevenshtein

In [269]:
original_dep_folder = './Version 8.3.10 with original normalized depositions + marked persons/'
seed_dep_folder = './seed/'
processed_dep_folder = './depositions_subset'#'./Version 8.3.10 with original normalized depositions + marked persons + gazeette ner/'

## 1 File Processing

File procesing for retrieving a  list of non annotated text tokens along with the
information regarding the DOM context.

1. Preprocess the document
    1. Cleanup of embbeded characters:
        - __`&#13`__ -> __`<br/>`__
        - __`&gt`__ -> __`>`__
        - __`&lt`__ -> __`<`__

    2. Creation of XML DOM tree
2. Processing of the document
    3. Extraction of non empty text nodes
    4. Recursively assign parent tag types
    5. Filter out already annotated text nodes
    6. Split text nodes into word tokens

In [6]:
namespaces = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'xi': 'http://www.w3.org/2001/XInclude',
}

In [48]:
def setup_annotation_list(dep_raw):
    dep_tree = et.fromstring(dep_raw.encode())

    certainties = dep_tree.xpath('//tei:teiHeader'
                                 '//tei:classCode[@scheme="http://providedh.eu/uncertainty/ns/1.0"]',
                                 namespaces=namespaces)

    if not certainties:
        add_annotation_list(dep_tree)

    text = et.tounicode(dep_tree)

    return text

def add_annotation_list(dep_tree):
    default_namespace = namespaces['tei']
    default = "{%s}" % default_namespace

    ns_map = {
        None: default_namespace
    }

    profile_desc = dep_tree.xpath('//tei:teiHeader/tei:profileDesc', namespaces=namespaces)

    if not profile_desc:
        tei_header = dep_tree.xpath('//tei:teiHeader', namespaces=namespaces)
        profile_desc = et.Element(default + 'profileDesc', nsmap=ns_map)
        tei_header[0].append(profile_desc)

    text_class = dep_tree.xpath('//tei:teiHeader/tei:profileDesc/tei:textClass', namespaces=namespaces)

    if not text_class:
        profile_desc = dep_tree.xpath('//tei:teiHeader/tei:profileDesc', namespaces=namespaces)
        text_class = et.Element(default + 'textClass', nsmap=ns_map)
        profile_desc[0].append(text_class)

    class_code = dep_tree.xpath(
        '//tei:teiHeader/tei:profileDesc/tei:textClass/tei:classCode[@scheme="http://providedh.eu/uncertainty/ns/1.0"]',
        namespaces=namespaces)

    if not class_code:
        text_class = dep_tree.xpath('//tei:teiHeader/tei:profileDesc/tei:textClass', namespaces=namespaces)
        class_code = et.Element(default + 'classCode', scheme="http://providedh.eu/uncertainty/ns/1.0",
                                   nsmap=ns_map)
        text_class[0].append(class_code)

In [261]:
def add_annotator(dep_raw):
    dep_tree = et.fromstring(dep_raw.encode())

    list_person = dep_tree.xpath('//tei:teiHeader'
                             '//tei:listPerson[@type="PROVIDEDH Annotators"]', namespaces=namespaces)

    if not list_person:
        dep_tree = create_list_person(dep_tree)
        list_person = dep_tree.xpath('//tei:teiHeader'
                                 '//tei:listPerson[@type="PROVIDEDH Annotators"]', namespaces=namespaces)

    annotators = dep_tree.xpath('//tei:teiHeader'
                                '//tei:listPerson[@type="PROVIDEDH Annotators"]'
                                '/tei:person', namespaces=namespaces)

    xml_ids = []
    for annotator in annotators:
        prefix = '{%s}' % namespaces['xml']
        xml_id = annotator.get(prefix + 'id')

        xml_ids.append(xml_id)
        
    if 'automatic_gazeetter_ner' not in xml_ids:
        list_person[0].append(et.fromstring(
            u"""
                <person xml:id="automatic_gazeetter_ner">
                  <persName>
                    <forename>Gazeette based NER</forename>
                    <surname>none</surname>
                    <email>none</email>
                  </persName>
                  <link>none</link>
                </person>
            """
        ))

    text = et.tounicode(dep_tree)

    return text

def create_list_person(dep_tree):
    prefix = "{%s}" % namespaces['tei']

    ns_map = {
        None: namespaces['tei']
    }

    profile_desc = dep_tree.xpath('//tei:teiHeader/tei:profileDesc', namespaces=namespaces)

    if not profile_desc:
        tei_header = dep_tree.xpath('//tei:teiHeader', namespaces=namespaces)
        profile_desc = et.Element(prefix + 'profileDesc', nsmap=ns_map)
        tei_header[0].append(profile_desc)

    partic_desc = dep_tree.xpath('//tei:teiHeader/tei:profileDesc/tei:particDesc', namespaces=namespaces)

    if not partic_desc:
        profile_desc = dep_tree.xpath('//tei:teiHeader/tei:profileDesc', namespaces=namespaces)
        partic_desc = et.Element(prefix + 'particDesc', nsmap=ns_map)
        profile_desc[0].append(partic_desc)

    list_person = dep_tree.xpath(
        '//tei:teiHeader/tei:profileDesc/tei:particDesc/tei:listPerson[@type="PROVIDEDH Annotators"]',
        namespaces=namespaces)

    if not list_person:
        partic_desc = dep_tree.xpath('//tei:teiHeader/tei:profileDesc/tei:particDesc',
                                 namespaces=namespaces)
        list_person = et.Element(prefix + 'listPerson', type="PROVIDEDH Annotators", nsmap=ns_map)
        partic_desc[0].append(list_person)

    return dep_tree

In [224]:
def character_cleanup(file_text):
    clean_carriege_return = lambda s: s.replace('&#13;','<br/>')
    clean_carriege_return2 = lambda s: s.replace('#xd;','<br/>')
    clean_greater_than = lambda s: s.replace('&gt;','\>')
    clean_lesser_than = lambda s: s.replace('&lt;','\<')
    
    preprocessing = [
        clean_carriege_return, 
        clean_carriege_return2,
        #clean_greater_than, 
        #clean_lesser_than,
    ]
    
    return reduce(lambda x,f: f(x), preprocessing, file_text)

def create_lxml_tree(file_text):
    return et.fromstring(file_text.encode())

def preprocess(file_text):
    preprocessing = (
        character_cleanup,
        add_annotator,
        setup_annotation_list,
        create_lxml_tree
    )

    return reduce(lambda x,f: f(x), preprocessing, file_text)

In [9]:
# Namescpaces should be set in a previous cell
# namespaces = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}
def get_body_text_nodes(tree_root, namespaces=namespaces):
    """Retrieve all text nodes in the body of lxml tree."""

    body = tree_root.find('.//tei:body', namespaces=namespaces)
    return body.xpath('.//text()')

def filter_out_empty_text_nodes(text_nodes):
    not_empty = lambda node: node.strip() != ''
    return filter(not_empty, text_nodes)

def remove_extra_spaces(text_nodes):
    inner_spaces = re.compile('[ \n\r]{2,}')
    tree_root = None

    for node in text_nodes:
        if tree_root is None:
            tree_root = node.getparent().getroottree()

        trimmed_text= inner_spaces.sub(' ', node).strip()

        if node.is_text:
            node.getparent().text = trimmed_text
        elif node.is_tail:
            node.getparent().tail = trimmed_text

    if tree_root is not None:
        text_nodes = get_body_text_nodes(tree_root)
      
    return text_nodes

def assign_types(text_nodes):
    def _get_ancestors(node):
        parent = node.getparent()
        parent_ancestors = tuple(parent.iterancestors())

        ancestors = (parent,)+parent_ancestors if node.is_text else parent_ancestors
        return ancestors

    namespace_regex = re.compile('\{.*\}')
    remove_namespace = lambda namespace: namespace_regex.sub('', namespace)

    get_types = lambda node: tuple(remove_namespace(anc.tag) for anc in _get_ancestors(node))

    return ((node, get_types(node)) for node in text_nodes)

def remove_irrelevant_tags(text_nodes, irrelevant_tags):
    return ((node, tuple(tag for tag in tags if tag not in irrelevant_tags)) for node,tags in text_nodes)

def remove_irrelevant_tei_tags(text_nodes): 
    return remove_irrelevant_tags(text_nodes, 
            ('unclear', 'TEI', 'body', 'text', 'damage', 'add', 'supplied', 'div', 'span', 'p', 'del', 'note'))

def assign_entity_2_fragments(text_nodes, tei_2_entity):
    data = []
    for fragment in text_nodes:
        tags = fragment[1]
        entity = 'text'
        for tag in tags:
            if tag in tei_2_entity:
                entity = tei_2_entity[tag]
                break
    
        data.append((fragment[0], entity))

    return data

def apply_tokenizer(text_nodes, tokenizer):
    tokens = []
    tags = []
    for fragment in text_nodes:
        fragment_tokens = tokenizer(fragment[0])
        tokens.extend(fragment_tokens)
        tags.extend(tuple(fragment[1] for _ in range(len(fragment_tokens))))

    return tokens, tags

def apply_str_tokenizer(text_nodes):
    return apply_tokenizer(text_nodes, str)

def filter_out_annotated(text_nodes):
        return filter(lambda text_node: len(text_node[1]) == 0, text_nodes)
    
def filter_out_non_annotated(text_nodes):
        return filter(lambda text_node: len(text_node[1]) != 0, text_nodes)
    
def clean_spaces(text):
    sanityzed = text.replace('\n', ' ')
    sanityzed = sanityzed.replace('\r', ' ')
    return re.sub('[ ]+', ' ', sanityzed)

def extract_text_nodes(xml_tree):
    processing = (
        get_body_text_nodes, 
        filter_out_empty_text_nodes,
        assign_types,
        remove_irrelevant_tei_tags,
        filter_out_annotated,
        lambda text_nodes: map(lambda node: node[0], text_nodes),
        list
    )
    
    return reduce(lambda x,f: f(x), processing, xml_tree)

def extract_annotations(xml_tree):
    processing = (
        get_body_text_nodes, 
        filter_out_empty_text_nodes,
        assign_types,
        remove_irrelevant_tei_tags,
        filter_out_non_annotated,
        tuple
    )
    
    return reduce(lambda x,f: f(x), processing, xml_tree)

In [10]:
def process(file_text):
    return extract_annotations(preprocess(file_text))

In [218]:
def create_gazeette(processed_dep_folder):
    processed_depositions = os.listdir(processed_dep_folder)
    processed = ()
    for dep_name in tqdm.tqdm(processed_depositions):
        with open(os.path.join(processed_dep_folder, dep_name)) as f:
            new_terms = tuple(map(lambda x: (*x,dep_name), process(f.read())))
            processed += new_terms
    zipped = filter(lambda node: True or len(node[0]) > 0, processed)
    gazeette = {clean_spaces(text):{'tag':tag,'origin':dep} 
                    for text, tag, dep 
                    in ((x[0], x[1][0], x[2]) 
                    for x in zipped)}
    return gazeette

In [264]:
gazeette = create_gazeette(seed_dep_folder)
sorted(gazeette.items(), key=lambda a: -len(a[0].split()))

100%|██████████| 16/16 [00:00<00:00, 268.39it/s]

dep_812092r111_tei_(original_normalized_depositions_marked_persons).xml
dep_810153r202_tei_(original_normalized_depositions_marked_persons).xml
dep_810134r188_tei_(original_normalized_depositions_marked_persons).xml
dep_814242r151_tei_(original_normalized_depositions_marked_persons).xml
dep_810039r058_tei_(original_normalized_depositions_marked_persons).xml
dep_835191r231_tei_(original_normalized_depositions_marked_persons).xml
dep_809329r212_tei_(original_normalized_depositions_marked_persons).xml
dep_822013r014_tei_(original_normalized_depositions_marked_persons).xml
dep_810308r324_tei_(original_normalized_depositions_marked_persons).xml
dep_838052r072_tei_(original_normalized_depositions_marked_persons).xml
dep_811203r142_tei_(original_normalized_depositions_marked_persons).xml
dep_838078r145_tei_(original_normalized_depositions_marked_persons).xml
dep_837006r005_tei_(original_normalized_depositions_marked_persons).xml
dep_810357r364A_tei_(original_normalized_depositions_marked_pers




[('past 1641 he this',
  {'tag': 'date',
   'origin': 'dep_810134r188_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('Januar 1652 before Justice',
  {'tag': 'date',
   'origin': 'dep_811203r142_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('towne of the ffurrowes',
  {'tag': 'placeName',
   'origin': 'dep_810357r364A_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('town of the furroughs',
  {'tag': 'placeName',
   'origin': 'dep_810357r364A_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('Murogh Doogh O Neill',
  {'tag': 'person',
   'origin': 'dep_838125r189_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('6 or 7 weekes',
  {'tag': 'time',
   'origin': 'dep_834017r015_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('County of Catherlogh',
  {'tag': 'placeName',
   'origin': 'dep_812092r111_tei_(original_normalized_depositions_marked_persons).xml'}),
 ('James O Knae ',
  {'tag': 'person',
   'o

## 2 Text Annotation

In [253]:
def sanityze_text_content(text):
    sanityzed = text.replace('<', '_')
    sanityzed = sanityzed.replace('>', '_')
    return sanityzed

In [256]:
def create_match_certainty_tag(target, match, gazeette_term, origin):
    
    similarity_sum = reduce(lambda ac,dc: ac+dc[0] if dc[1] else ac, 
                            zip(match['similarity'],match['matches']), 
                            0)
    cert = similarity_sum/match['number_matches']
    
    desc = f'Automatically annotated based on the initial annotation \'{sanityze_text_content(gazeette_term)}\' ' \
            +f'from the document \'{origin}\'.\n' \
            +f'{match["number_matches"]} out of {len(gazeette_term.split())} highly coincident terms.\n' \
            +f'Normalized Levenshtein similarity between terms: {match["similarity"]}.' 
        
    text = f'<certainty category="incompletness" locus="name" degree="{cert}" cert="unknown" resp="#automatic_gazeetter_ner" target="#{target}" desc="{desc}"/>'
    return et.fromstring(text)

e = create_match_certainty_tag('place0001', match, 'Edward Piggot', 'own.xml')
print(et.tounicode(e))

<certainty category="incompletness" locus="name" degree="0.75" cert="unknown" resp="#automatic_gazeetter_ner" target="#place0001" desc="Automatically annotated based on the initial annotation 'Edward Piggot' from the document 'own.xml'. 1 out of 2 highly coincident terms. Normalized Levenshtein similarity between terms: (0.7142857142857143, 0.75)."/>


In [243]:
def create_time_certainty_tag(target, gazeette_term):    
    desc = f'Automatically annotated based on high confidence in date match ({sanityze_text_content(gazeette_term)}). Exact month match.' 
        
    text = f'<certainty category="incompletness" locus="name" degree="0.8" cert="unknown" resp="#automatic_gazeetter_ner" target="#{target}" desc="{desc}"/>'
    return et.fromstring(text)

e = create_time_certainty_tag('date0001', 'march')
print(et.tounicode(e))

<certainty category="incompletness" locus="name" degree="0.8" cert="unknown" resp="#automatic_gazeetter_ner" target="#date0001" desc="Automatically annotated based on high confidence in date match (march). Exact month match."/>


In [244]:
def create_year_certainty_tag(target, gazeette_term):    
    desc = f'Automatically annotated based on low confidence in date match ({sanityze_text_content(gazeette_term)}). High similarity with short length annotations.' 
        
    text = f'<certainty category="incompletness" locus="name" degree="0.3" cert="unknown" resp="#automatic_gazeetter_ner" target="#{target}" desc="{desc}"/>'
    return et.fromstring(text)

e = create_year_certainty_tag('date0001', 'march')
print(et.tounicode(e))

<certainty category="incompletness" locus="name" degree="0.3" cert="unknown" resp="#automatic_gazeetter_ner" target="#date0001" desc="Automatically annotated based on low confidence in date match (march). High similarity with short length annotations."/>


In [95]:
match = {'match': True,
         'similarity': (0.7142857142857143, 0.75),
         'matches': (False, True),
         'number_matches': 1}

In [49]:
def add_certainty(dep_tree, certainty):
    certainties = dep_tree.xpath('//tei:teiHeader'
                                 '//tei:classCode[@scheme="http://providedh.eu/uncertainty/ns/1.0"]',
                                 namespaces=namespaces)

    certainties[0].append(certainty)

In [159]:
class TagIds:
    def __init__(self):
        self._counters = {}
    def next_id(self, tag):
        if tag in self._counters:
            return f'm_{tag}{next(self._counters[tag]):04}'
        else:
            self._counters[tag] = itertools.count()
            return f'm_{tag}{0:04}'
tag_ids = TagIds()

In [146]:
def wrap_text_in_tag(tag_id, text,substring,tag_name):
    text_node = text.getparent()
    parent = text_node.getparent()
    unprocessed_leaf_nodes: tuple
    partitions = text.partition(substring)
    
    new_element = et.fromstring(f'<{tag_name} xml:id="{tag_id}"></{tag_name}>')
    new_element.text = partitions[1]
    new_element.tail = partitions[2]
    
    new_element_tail = new_element.xpath("//text()")[1]
        
    if text.is_text:
        text_node.text = partitions[0]
        text_node.insert(0, new_element)
        
        parent_text = text_node.xpath('.//text()')[0]     
        unprocessed_leaf_nodes = (parent_text, new_element_tail)
        
    elif text.is_tail:
        index = parent.index(text_node) + 1
        text_node.tail = partitions[0]
        parent.insert(index,new_element)
        
        parent_tail = None
        for x in parent.getparent().xpath('.//text()'):
            if x.getparent() == parent and x.is_tail == True:
                parent_tail = x
        unprocessed_leaf_nodes = (parent_tail, new_element_tail)
        
    return unprocessed_leaf_nodes

In [148]:
a = et.fromstring('''
<span>
<p>
    <person>
    Alex
    </person>
    of Dublin Doctor in Divinity being duely
</p>
</span>
''')
a_ = a.xpath('//text()')

b = et.fromstring('''
<span>
<p>
    He,
    <person>
    Alex
    </person>
    of Dublin Doctor in Divinity being duely
</p>
</span>
''')
b_ = b.xpath('//text()')

c = et.fromstring('''
<span>
<p>
    of Dublin Doctor
    <person>
    Alex
    </person>
    in Divinity being duely
</p>
</span>
''')
c_ = c.xpath('//text()')

d = et.fromstring('''
<span>
<p>
    from
    <person>
    Alex
    </person>
    Dublin <date>Doctor</date> in Divinity being duely
</p>
</span>
''')
d_ = d.xpath('//text()')

e = et.fromstring('''
<span>
<p>
    <person>
    Alex
    </person>
    Doctor in Divinity being of Dublin
</p>
</span>
''')
e_ = e.xpath('//text()')

f = et.fromstring('''
<span>
<p>
    <person>
    Alex from Dublin
    </person>
    of in Divinity being duely
</p>
</span>
''')
f_ = f.xpath('//text()')

print(et.tostring(a))
leafs = wrap_text_in_tag('a0', a_[3], 'Dublin', 'place')
print(et.tostring(a))
print('Textual context of the new tag >',leafs)

print('\n-----------------------\n')

print(et.tostring(b))
leafs = wrap_text_in_tag('a0', b_[3], 'Dublin', 'place')
print(et.tostring(b))
print('Textual context of the new tag >',leafs)

print('\n-----------------------\n')

print(et.tostring(c))
leafs = wrap_text_in_tag('a0', c_[1], 'Dublin', 'place')
print(et.tostring(c))
print('Textual context of the new tag >',leafs)

print('\n-----------------------\n')

print(et.tostring(d))
leafs = wrap_text_in_tag('a0', d_[3], 'Dublin', 'place')
print(et.tostring(d))
print('Textual context of the new tag >',leafs)

print('\n-----------------------\n')

print(et.tostring(e))
leafs = wrap_text_in_tag('a0', e_[3], 'Dublin', 'place')
print(et.tostring(e))
print('Textual context of the new tag >',leafs)

print('\n-----------------------\n')

print(et.tostring(f))
leafs = wrap_text_in_tag('a0', f_[2], 'Dublin', 'place')
print(et.tostring(f))
print('Textual context of the new tag >',leafs)

b'<span>\n<p>\n    <person>\n    Alex\n    </person>\n    of Dublin Doctor in Divinity being duely\n</p>\n</span>'
b'<span>\n<p>\n    <person>\n    Alex\n    </person>\n    of <place xml:id="a0">Dublin</place> Doctor in Divinity being duely\n</p>\n</span>'
Textual context of the new tag > ('\n', ' Doctor in Divinity being duely\n')

-----------------------

b'<span>\n<p>\n    He,\n    <person>\n    Alex\n    </person>\n    of Dublin Doctor in Divinity being duely\n</p>\n</span>'
b'<span>\n<p>\n    He,\n    <person>\n    Alex\n    </person>\n    of <place xml:id="a0">Dublin</place> Doctor in Divinity being duely\n</p>\n</span>'
Textual context of the new tag > ('\n', ' Doctor in Divinity being duely\n')

-----------------------

b'<span>\n<p>\n    of Dublin Doctor\n    <person>\n    Alex\n    </person>\n    in Divinity being duely\n</p>\n</span>'
b'<span>\n<p>\n    of <place xml:id="a0">Dublin</place> Doctor\n    <person>\n    Alex\n    </person>\n    in Divinity being duely\n</p>\n</sp

### 3. String Matching for Gazeette entries

The distance of each word of an n-gram to the correspondent of the gazeette entry is meassured using the Normalized Levenshtein distance.

__The Normalized Levenshtein distance is not a metric__: Therefore the triangle inequality does not apply; which
allow us to sum the distances of each word and not take order into account.

__Given the high degree of variability in named entties and the low coverage of annotated text__, matching is based on the amount of matches for a given n-gram instead of relying solely on the distance of the complete text fragments.

In [28]:
normalized_levenshtein = NormalizedLevenshtein()

In [278]:
min_word_similarity = 0.75
min_n_gram_similarity = 2/3

def test_match_(text1, text2, min_word_similarity, min_n_gram_similarity):
    similarity = tuple(normalized_levenshtein.similarity(x.lower(), y.lower()) for x,y in zip(text1, text2))
    
    matching = tuple(map(lambda x: x>=min_word_similarity, similarity))
    match_n = reduce(lambda ac,dc: ac+1 if dc else ac, matching, 0)
    
    match = {
        'match': match_n/len(text1) >= min_n_gram_similarity,
        'similarity': similarity,
        'matches': matching,
        'number_matches': match_n
    }
    
    if len(text1) == 2 and (len(text1) < 4 or len(text2) < 4):
        match['match'] = False
    
    return match

def test_match(text1, text2):
    return test_match_(text1, text2, min_word_similarity, min_n_gram_similarity)

In [30]:
test_match(['County','of','Gilliegh'], ['Countie','of','Gilleigh'])

{'match': True,
 'similarity': (0.7142857142857143, 1.0, 0.75),
 'matches': (False, True, True)}

In [31]:
test_match('Patrick mc Gilliegh'.split(), 'Catherine mc Gilleigh'.split())

{'match': True,
 'similarity': (0.4444444444444444, 1.0, 0.75),
 'matches': (False, True, True)}

### 4. Heuristic rules for identifying dates

In [172]:
def pre_anotate_dates(dep_text_nodes):
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}
    months = [
        'january', 'jan', 'february', 'feb', 'march', 
        'mar', 'april', 'jpr', 'may', 'may', 'june', 
        'jun', 'july', 'jul', 'august', 'aug', 
        'september', 'sep', 'october', 'oct', 'november', 
        'nov', 'december', 'dec'
    ]
    
    while len(dep_text_nodes) > 0:
        node = dep_text_nodes.pop(0)
        words = node.split(' ')
             
        if len(words) < 3:
            pass
                                      
        text_len = 3 
                                      
        match = False
        for i in range(0, len(words) + 1 - text_len):
            matching_text = ' '.join(words[i : i+text_len]).strip()
            if words[i+1].lower() in months:
                tag_id = tag_ids.next_id('date')
                left_nodes = wrap_text_in_tag(tag_id, node, matching_text, 'date')
                e = create_time_certainty_tag(tag_id, words[i+1])
                add_certainty(node.getparent().getroottree(), e)
                dep_text_nodes.insert(0, left_nodes[1])
                dep_text_nodes.insert(0, left_nodes[0])
                match = True
                break

In [237]:
def apply_gazeette(dep_text_nodes, gazeette):   
    gazeette_sorted = sorted(gazeette.items(), key=lambda a: -len(a[0]))
    annotated = 0
    while len(dep_text_nodes) > 0:
        node = dep_text_nodes.pop(0)
        words = node.split(' ')
            
        for entry in gazeette_sorted:
            text_len = len(entry[0].split(' '))
                
            if len(words) < text_len:
                pass
            
            match = False
            for i in range(0, len(words) + 1 - text_len):
                if i >= len(words) or (i + text_len) >= len(words):
                    break
                    
                matching_text = (words[i],) if text_len == 1 else words[i : i+text_len]
                gazeette_text = (entry[0],) if text_len == 1 else entry[0].split()
                
                matching = test_match(matching_text, gazeette_text)
                
                if matching['match']:
                    tag_id = tag_ids.next_id(entry[1]['tag'])
                    left_nodes = wrap_text_in_tag(tag_id, node, ' '.join(matching_text), entry[1]['tag'])
                    e = create_match_certainty_tag(tag_id, matching, entry[0], entry[1]['origin'])
                    add_certainty(node.getparent().getroottree(), e)
                    
                    dep_text_nodes.insert(0, left_nodes[1])
                    dep_text_nodes.insert(0, left_nodes[0])
                    match = True
                    annotated += 1
                    break
                    
            if match:
                break

In [177]:
def post_annotation_date_processing(dep_text_nodes): 
    while len(dep_text_nodes) > 0:
        node = dep_text_nodes.pop(0)
        words = node.split(' ')
             
        if len(words) < 4:
            pass
                                      
        text_len = 4 # dates
                                      
        match = False
        for i in range(0, len(words) + 1 - text_len):
            matching_text = ' '.join(words[i : i+text_len]).strip()
                
            if re.match('[0-9]{4}', words[i+1].lower()):
                tag_id = tag_ids.next_id('date')
                left_nodes = wrap_text_in_tag(tag_id, node, matching_text, 'date')
                e = create_year_certainty_tag(tag_id, words[i+1])
                add_certainty(node.getparent().getroottree(), e)
                dep_text_nodes.insert(0, left_nodes[1])
                dep_text_nodes.insert(0, left_nodes[0])
                match = True
                break

In [35]:
def annotate(dep_folder, dep_name, gazeette):
    with open(os.path.join(dep_folder, dep_name)) as f:
        dep_raw = f.read()
        
    dep_tree = preprocess(dep_raw)
    
    pre_anotate_dates(extract_text_nodes(dep_tree))
    apply_gazeette(extract_text_nodes(dep_tree), gazeette)
    post_annotation_date_processing(extract_text_nodes(dep_tree))
    
    return dep_tree

## 3 Apply Gazeette to Depositions

In [262]:
tqdm.tqdm._instances.clear()

In [267]:
os.listdir(original_dep_folder)[2928]

'original_normalized_depositions_marked_persons.log'

In [270]:
last = 2928

In [279]:
tqdm.tqdm._instances.clear()

for dep_name in tqdm.tqdm(list(os.listdir(original_dep_folder))[last:last+10]):
    try:
        if Path(os.path.join(original_dep_folder, dep_name)).is_file():
            applied = annotate(original_dep_folder, dep_name, gazeette)
            
            with open(os.path.join(processed_dep_folder, dep_name), 'w') as f:
                f.write(et.tostring(applied).decode('UTF-8'))
    except Exception as e:
        #print(e)
        #break
        pass #print('!! Is empty')

100%|██████████| 10/10 [00:41<00:00,  4.12s/it]


12:20 el último revisado manualmente