# PROVIDEDH Collaborative platform
## Jupyter notebook

In [1]:
import re
import os
from functools import reduce
from lxml import etree as et
from lxml.etree import Element
import itertools

In [4]:
import nltk
import spacy
import en_core_web_md
# Load the model shipped in the en_core_web_md package
# (presumably spaCy's medium English pipeline - TODO confirm it is still needed).
sp_nlp_en = en_core_web_md.load()

## 1 File retrieval

In [2]:
# Prefix -> URI map for the TEI and XML namespaces.
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}

In [336]:
# Names of the entries in ./depositions_subset. os.listdir returns them in
# arbitrary order, so positional slices of this list are not stable across machines.
depositions = os.listdir('./depositions_subset')

## 2 File processing

In [4]:
def get_body_text_nodes(tree_root, namespaces={'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}):
    """Return every text node located under the TEI ``<body>`` of ``tree_root``.

    Args:
        tree_root: object exposing ``find`` (an lxml element or element tree).
        namespaces: prefix -> URI map used to resolve the ``tei:`` prefix.

    Returns:
        Whatever ``xpath('.//text()')`` yields on the body element.

    Raises:
        AttributeError: when ``find`` returns ``None`` because no ``tei:body``
            element was located.
    """
    tei_body = tree_root.find('.//tei:body', namespaces=namespaces)
    body_text_nodes = tei_body.xpath('.//text()')
    return body_text_nodes

def filter_out_empty_text_nodes(text_nodes):
    """Lazily drop the text nodes that are empty or contain only whitespace.

    Returns a ``filter`` iterator over the nodes whose stripped value is
    not the empty string; the nodes themselves are passed through unchanged.
    """
    def _has_content(node):
        return node.strip() != ''

    return filter(_has_content, text_nodes)

def remove_extra_spaces(text_nodes):
    """Collapse whitespace runs in every text node and write the result back to the tree.

    Each node must offer ``getparent()``, ``is_text`` and ``is_tail`` (as lxml
    XPath text results do). Runs of two or more spaces / newlines / carriage
    returns become a single space and the value is stripped, then stored as the
    ``text`` or ``tail`` of the owning element.

    Returns:
        The text nodes of the document body, fetched again after the edits so
        they reflect the cleaned values; when ``text_nodes`` is empty it is
        returned unchanged.
    """
    whitespace_run = re.compile('[ \n\r]{2,}')
    document = None

    for text_node in text_nodes:
        owner = text_node.getparent()
        if document is None:
            # Remember the tree once so the nodes can be re-queried afterwards.
            document = owner.getroottree()

        collapsed = whitespace_run.sub(' ', text_node).strip()

        if text_node.is_text:
            owner.text = collapsed
        elif text_node.is_tail:
            owner.tail = collapsed

    if document is None:
        return text_nodes
    return get_body_text_nodes(document)

def assign_types(text_nodes):
    """Pair every text node with the local tag names of the elements that enclose it.

    Args:
        text_nodes: iterable of text nodes offering ``getparent()`` and
            ``is_text`` (as lxml XPath text results do).

    Returns:
        A generator of ``(node, tags)`` tuples. ``tags`` lists the enclosing
        element names from the innermost outwards, with any ``{namespace}``
        prefix removed.
    """
    # Raw string: '\{' and '\}' are not valid escapes in a normal literal and
    # trigger an invalid-escape warning; the compiled pattern is unchanged.
    namespace_regex = re.compile(r'\{.*\}')

    def _get_ancestors(node):
        parent = node.getparent()
        parent_ancestors = tuple(parent.iterancestors())
        # Tail text comes *after* its parent element, so that element does not
        # enclose it; only element text counts the parent itself.
        return (parent,) + parent_ancestors if node.is_text else parent_ancestors

    def _get_types(node):
        return tuple(namespace_regex.sub('', anc.tag) for anc in _get_ancestors(node))

    return ((node, _get_types(node)) for node in text_nodes)

def remove_irrelevant_tags(text_nodes, irrelevant_tags):
    """Strip the tags listed in ``irrelevant_tags`` from every ``(node, tags)`` pair.

    Returns a generator of ``(node, tags)`` tuples in which ``tags`` keeps its
    original order minus the irrelevant entries.
    """
    def _keep_relevant(tags):
        return tuple(tag for tag in tags if tag not in irrelevant_tags)

    return ((node, _keep_relevant(tags)) for node, tags in text_nodes)

def remove_irrelevant_tei_tags(text_nodes):
    """Drop a fixed list of TEI tag names from every ``(node, tags)`` pair.

    Delegates to ``remove_irrelevant_tags`` with the tag names listed below.
    """
    ignored_tei_tags = ('TEI', 'body', 'text', 'add', 'div', 'span', 'p', 'del', 'note')
    return remove_irrelevant_tags(text_nodes, ignored_tei_tags)

def assign_entity_2_fragments(text_nodes, tei_2_entity):
    """Label each fragment with the entity mapped from its first known tag.

    Args:
        text_nodes: iterable of fragments where index 0 is the text and
            index 1 is the sequence of tags.
        tei_2_entity: mapping from tag name to entity label.

    Returns:
        A list of ``(text, entity)`` tuples; ``entity`` is ``'text'`` when none
        of the fragment's tags appears in ``tei_2_entity``.
    """
    def _entity_for(tags):
        known = (tei_2_entity[tag] for tag in tags if tag in tei_2_entity)
        return next(known, 'text')

    return [(fragment[0], _entity_for(fragment[1])) for fragment in text_nodes]

def apply_tokenizer(text_nodes, tokenizer):
    """Tokenize every fragment and repeat its label once per produced token.

    Args:
        text_nodes: iterable of fragments where index 0 is the text and
            index 1 is the label.
        tokenizer: callable applied to the text; its result must support
            ``len`` and iteration.

    Returns:
        ``(tokens, tags)`` - two parallel lists of equal length.
    """
    all_tokens = []
    all_tags = []
    for fragment in text_nodes:
        pieces = tokenizer(fragment[0])
        all_tokens.extend(pieces)
        all_tags.extend([fragment[1]] * len(pieces))

    return all_tokens, all_tags

def apply_str_tokenizer(text_nodes):
    """Run ``apply_tokenizer`` with ``str`` as the tokenizer.

    ``str`` returns the fragment text itself, so each character of the text
    ends up as its own token.
    """
    return apply_tokenizer(text_nodes, tokenizer=str)

def filter_annotated(text_nodes):
    """Lazily keep only the ``(node, tags)`` pairs whose tag sequence is empty.

    In other words, pairs that still carry at least one tag are filtered out.
    """
    def _has_no_tags(text_node):
        return len(text_node[1]) == 0

    return filter(_has_no_tags, text_nodes)

def process(xml_tree):
    """Run the text-extraction pipeline on ``xml_tree``.

    The steps are applied in order, each receiving the previous step's result:
    collect body text nodes, normalise whitespace, drop empty nodes, attach the
    enclosing tag names, drop the irrelevant TEI tags, and materialise the
    result as a tuple of ``(node, tags)`` pairs.
    """
    pipeline = [
        get_body_text_nodes,
        remove_extra_spaces,
        filter_out_empty_text_nodes,
        assign_types,
        remove_irrelevant_tei_tags,
        # apply_str_tokenizer is intentionally not part of the pipeline.
        tuple,
    ]

    result = xml_tree
    for step in pipeline:
        result = step(result)
    return result

In [339]:
def create_gazeette(processed_folder='./depositions_subset/processed'):
    """Build a ``text -> tag`` gazetteer from the already annotated depositions.

    Every regular file in ``processed_folder`` is parsed and run through
    ``process``; each text node that still carries at least one tag contributes
    an entry mapping its text to its first (innermost) tag.

    Args:
        processed_folder: directory holding the annotated XML files. Defaults
            to the folder the notebook writes its results to.

    Returns:
        dict mapping annotated text to tag name. When the same text occurs with
        different tags, the occurrence processed last wins; files are visited
        in sorted name order so the outcome is reproducible.

    Raises:
        AttributeError: propagated from ``process`` when a file has no TEI body.
    """
    tagged_nodes = []
    # os.listdir gives no ordering guarantee; sort so duplicate keys always
    # resolve the same way.
    for dep_name in sorted(os.listdir(processed_folder)):
        dep_path = os.path.join(processed_folder, dep_name)
        if not os.path.isfile(dep_path):
            # Sub-directories (e.g. notebook checkpoints) cannot be parsed.
            continue

        # Read bytes and let the XML parser honour the file's own encoding
        # instead of decoding with the locale default and re-encoding.
        with open(dep_path, 'rb') as dep_file:
            dep_tree = et.fromstring(dep_file.read())

        # list.extend avoids rebuilding an ever-growing tuple for every file.
        tagged_nodes.extend(process(dep_tree))

    return {text: tags[0] for text, tags in tagged_nodes if len(tags) > 0}

In [390]:
# Build the gazetteer and display its (text, tag) entries, longest text first.
gazeette = create_gazeette()
sorted(gazeette.items(), key=lambda a: -len(a[0]))

[('towne of Ballelahartye', 'placeName'),
 ('Katherine mc Gilleigh', 'person'),
 ('Catherine mc Gilleigh', 'person'),
 ('Patricke mc Gilliegh', 'person'),
 ('town of Ballelaharty', 'placeName'),
 ('William Wolferston', 'person'),
 ('County of Monaghan', 'placeName'),
 ('Cormacke mc Guyre', 'person'),
 ('William Hitchcock', 'name'),
 ('Patrick O Dorogan', 'person'),
 ('Patrick mc Enalye', 'person'),
 ('Saterday morninge', 'date'),
 ('first of december', 'date'),
 ('County of Ardmagh', 'placeName'),
 ('Kath: mc Gillegh', 'person'),
 ('Kath mc Gilleigh', 'person'),
 ('north of Ireland', 'placeName'),
 ('Lawrence Beddell', 'person'),
 ('Cittie of Dublin', 'placeName'),
 ('County of Armagh', 'placeName'),
 ('George Blundell', 'name'),
 ('County of Louth', 'placeName'),
 ('Laurence Bedlow', 'person'),
 ('Sir Same Mayart', 'person'),
 ('Citty of Dublin', 'placeName'),
 ('Will: Hitchcock', 'person'),
 ('Jur 7 febr 1641', 'date'),
 ('William Aldrich', 'name'),
 ('Mathew Talbott', 'person'),
 ('

## 3 Text tagging

In [297]:
# Small hand-written documents used to exercise wrap_text_in_tag.
# For each document X, X_ holds all of its text nodes in document order.

# a: 'Dublin' is in the tail text following <person>; <p> has no leading words.
a = et.fromstring('''
<span>
<p>
    <person>
    Alex
    </person>
    of Dublin Doctor in Divinity being duely
</p>
</span>
''')
a_ = a.xpath('//text()')

# b: like a, but <p> starts with the words 'He,' before <person>.
b = et.fromstring('''
<span>
<p>
    He,
    <person>
    Alex
    </person>
    of Dublin Doctor in Divinity being duely
</p>
</span>
''')
b_ = b.xpath('//text()')

# c: 'Dublin' is in the leading text of <p>, before <person>.
c = et.fromstring('''
<span>
<p>
    of Dublin Doctor
    <person>
    Alex
    </person>
    in Divinity being duely
</p>
</span>
''')
c_ = c.xpath('//text()')

# d: 'Dublin' is in the tail of <person> and is followed by a <date> element.
d = et.fromstring('''
<span>
<p>
    from
    <person>
    Alex
    </person>
    Dublin <date>Doctor</date> in Divinity being duely
</p>
</span>
''')
d_ = d.xpath('//text()')

# e: 'Dublin' is the last word of the tail following <person>.
e = et.fromstring('''
<span>
<p>
    <person>
    Alex
    </person>
    Doctor in Divinity being of Dublin
</p>
</span>
''')
e_ = e.xpath('//text()')

# f: 'Dublin' is inside the text of <person> itself.
f = et.fromstring('''
<span>
<p>
    <person>
    Alex from Dublin
    </person>
    of in Divinity being duely
</p>
</span>
''')
f_ = f.xpath('//text()')

# For every fixture: show the tree, wrap 'Dublin' in <place>, show the tree
# again and print the two leftover text pieces returned by wrap_text_in_tag.
# NOTE: wrap_text_in_tag is defined in a later cell; run that cell first.
demo_cases = (
    (a, a_[3]),
    (b, b_[3]),
    (c, c_[1]),
    (d, d_[3]),
    (e, e_[3]),
    (f, f_[2]),
)

for case_number, (case_tree, case_text) in enumerate(demo_cases):
    if case_number > 0:
        # Separator between consecutive cases only (none after the last one).
        print('\n-----------------------\n')

    print(et.tostring(case_tree))
    leafs = wrap_text_in_tag(case_text, 'Dublin', 'place')
    print(et.tostring(case_tree))
    print(leafs)

b'<span>\n<p>\n    <person>\n    Alex\n    </person>\n    of Dublin Doctor in Divinity being duely\n</p>\n</span>'
b'<span>\n<p>\n    <person>\n    Alex\n    </person>\n    of <place>Dublin</place> Doctor in Divinity being duely\n</p>\n</span>'
(' of ', ' Doctor in Divinity being duely\n')

-----------------------

b'<span>\n<p>\n    He,\n    <person>\n    Alex\n    </person>\n    of Dublin Doctor in Divinity being duely\n</p>\n</span>'
b'<span>\n<p>\n    He,\n    <person>\n    Alex\n    </person>\n    of <place>Dublin</place> Doctor in Divinity being duely\n</p>\n</span>'
(' of ', ' Doctor in Divinity being duely\n')

-----------------------

b'<span>\n<p>\n    of Dublin Doctor\n    <person>\n    Alex\n    </person>\n    in Divinity being duely\n</p>\n</span>'
b'<span>\n<p>\n    of <place>Dublin</place> Doctor\n    <person>\n    Alex\n    </person>\n    in Divinity being duely\n</p>\n</span>'
('\n    of ', ' Doctor\n    ')

-----------------------

b'<span>\n<p>\n    from\n    <person

In [458]:
def wrap_text_in_tag(text, substring, tag_name):
    """Wrap the first occurrence of ``substring`` inside a text node in a new element.

    The owning tree is modified in place: the part of ``text`` before the match
    stays where ``text`` was, a new ``<tag_name>`` element holding the match is
    inserted right after it, and the part after the match becomes the tail of
    the new element.

    Args:
        text: an lxml XPath text result (must offer ``getparent()``,
            ``is_text`` and ``is_tail``).
        substring: the text to wrap; must occur in ``text``.
        tag_name: tag of the element to create.

    Returns:
        ``(before, after)`` - handles to the two text pieces left around the
        new element, so callers can keep scanning them. Each is the text node
        reported by lxml for that piece, or a plain ``''`` when lxml reports no
        node for it (nothing left on that side).

    Raises:
        ValueError: if ``substring`` is empty or absent from ``text``, or if
            ``text`` is neither element text nor tail text. The tree is left
            untouched in these cases.
    """
    def _text_handle(element):
        # Text node that is the ``.text`` of ``element``.
        for candidate in element.xpath('text()'):
            if candidate.is_text and candidate.getparent() == element:
                return candidate
        return ''

    def _tail_handle(element):
        # Text node that is the ``.tail`` of ``element``; tails are reported
        # among the direct text children of the element's parent.
        for candidate in element.getparent().xpath('text()'):
            if candidate.is_tail and candidate.getparent() == element:
                return candidate
        return ''

    before, matched, after = text.partition(substring)
    if not matched:
        raise ValueError('%r does not occur in the given text node' % (substring,))
    if not (text.is_text or text.is_tail):
        raise ValueError('only element text or tail text can be wrapped')

    text_node = text.getparent()

    new_element = et.Element(tag_name)
    new_element.text = matched
    new_element.tail = after

    if text.is_text:
        # The match lives in ``text_node.text``: the new element becomes the
        # first child, directly after the shortened text.
        text_node.text = before
        text_node.insert(0, new_element)
        leading = _text_handle(text_node)
    else:
        # The match lives in ``text_node.tail``: the new element becomes the
        # next sibling, directly after the shortened tail.
        parent = text_node.getparent()
        index = parent.index(text_node) + 1
        text_node.tail = before
        parent.insert(index, new_element)
        leading = _tail_handle(text_node)

    # Look the handles up only after insertion so they belong to the real tree.
    return (leading, _tail_handle(new_element))

def annotate(dep_folder, dep_name, gazeette):
    """Parse one deposition file and annotate it with the rules and the gazetteer.

    Args:
        dep_folder: directory containing the deposition.
        dep_name: file name of the deposition inside ``dep_folder``.
        gazeette: ``text -> tag`` mapping handed to ``apply_gazeette``.

    Returns:
        The parsed tree after ``apply_rules`` and then ``apply_gazeette`` have
        been applied to it.
    """
    # Read bytes so the XML parser decodes the file according to its own
    # declaration, not the locale's default text encoding.
    with open(os.path.join(dep_folder, dep_name), 'rb') as dep_file:
        dep_tree = et.fromstring(dep_file.read())

    return apply_gazeette(apply_rules(dep_tree), gazeette)

def apply_rules(dep_tree):
    """Annotate place names and dates in ``dep_tree`` using fixed textual rules.

    Two passes are made over the text nodes that carry no relevant tag yet.
    Each pass slides a three-word window over every node (words are split on
    single spaces) and wraps the first window that matches:

    * pass 1 - windows starting with "county of", "countie of", "town of",
      "towne of" or "barony of" become ``placeName``; otherwise windows whose
      middle word is a month name become ``date``;
    * pass 2 - windows whose first or middle word is a month name become
      ``date``.

    After a match the text left before and after the new element is queued
    again, so several matches per node are possible.

    Args:
        dep_tree: root element of the deposition; modified in place.

    Returns:
        The same ``dep_tree``.
    """
    months = (
        'january', 'jan', 'february', 'feb', 'march', 'mar',
        'april', 'apr',  # was misspelled 'jpr', so 'apr' never matched
        'may', 'june', 'jun', 'july', 'jul', 'august', 'aug',
        'september', 'sep', 'october', 'oct', 'november', 'nov',
        'december', 'dec',
    )
    place_prefixes = ('county of', 'countie of', 'town of', 'towne of', 'barony of')
    window = 3  # "county of x", "barony of x", "<day> <month> <year>"

    def _unannotated_text_nodes():
        # Body text nodes with content that have no relevant tag around them.
        pipeline = (
            get_body_text_nodes,
            filter_out_empty_text_nodes,
            assign_types,
            remove_irrelevant_tei_tags,
            filter_annotated,
            lambda text_nodes: map(lambda node: node[0], text_nodes),
            list,
        )
        return reduce(lambda value, step: step(value), pipeline, dep_tree)

    def _place_or_date(words, start, phrase):
        # Rule set of the first pass.
        if phrase.lower().startswith(place_prefixes):
            return 'placeName'
        if words[start + 1].lower() in months:
            return 'date'
        return None

    def _date_only(words, start, phrase):
        # Rule set of the second pass.
        if words[start].lower() in months or words[start + 1].lower() in months:
            return 'date'
        return None

    def _run_pass(classify):
        pending = _unannotated_text_nodes()
        while pending:
            node = pending.pop(0)
            words = node.split(' ')

            # The range is empty for nodes shorter than the window.
            for start in range(0, len(words) + 1 - window):
                phrase = ' '.join(words[start:start + window]).strip()
                tag_name = classify(words, start, phrase)
                if tag_name is None:
                    continue

                left_nodes = wrap_text_in_tag(node, phrase, tag_name)
                # Re-queue the surrounding text (before-part first); pieces
                # that are missing or empty cannot contain a further match.
                pending[:0] = [piece for piece in left_nodes[:2] if piece]
                break

    _run_pass(_place_or_date)
    _run_pass(_date_only)

    return dep_tree
    
def apply_gazeette(dep_tree, gazeette):
    """Annotate ``dep_tree`` with the entries of a ``text -> tag`` gazetteer.

    Every text node that carries no relevant tag yet is split on single spaces
    and compared against the gazetteer, longest entry first. An entry matches
    when it is a substring of some run of as many consecutive words as the
    entry has; note that this also matches entries embedded in a longer word.
    The first matching entry is wrapped in an element named after its tag and
    the text left before and after it is queued again.

    Args:
        dep_tree: root element of the deposition; modified in place.
        gazeette: mapping from text to tag name.

    Returns:
        The same ``dep_tree``.
    """
    pipeline = (
        get_body_text_nodes,
        filter_out_empty_text_nodes,
        assign_types,
        remove_irrelevant_tei_tags,
        filter_annotated,
        lambda text_nodes: map(lambda node: node[0], text_nodes),
        list,
    )
    pending = reduce(lambda value, step: step(value), pipeline, dep_tree)

    # Longest entries first so "County of Dublin" wins over "Dublin".
    entries = sorted(gazeette.items(), key=lambda item: -len(item[0]))

    while pending:
        node = pending.pop(0)
        words = node.split(' ')

        for surface, tag_name in entries:
            if not surface:
                # An empty key would match everything and cannot be wrapped.
                continue

            span = len(surface.split(' '))
            # Every window of `span` words, INCLUDING the last one: the former
            # `(i + text_len) >= len(words)` guard skipped it, so an entry at
            # the end of a text node was never annotated. The range is empty
            # when the node has fewer words than the entry.
            windows = (
                ' '.join(words[start:start + span]).strip()
                for start in range(0, len(words) + 1 - span)
            )
            if not any(surface in candidate for candidate in windows):
                continue

            left_nodes = wrap_text_in_tag(node, surface, tag_name)
            # Re-queue the surrounding text (before-part first); pieces that
            # are missing or empty cannot contain a further match.
            pending[:0] = [piece for piece in left_nodes[:2] if piece]
            break

    return dep_tree

In [463]:
# Annotate the depositions from index 24 onwards and store each result in the
# 'processed' sub-folder.
for dep_name in depositions[24:]:
    if not os.path.isfile(os.path.join('./depositions_subset', dep_name)):
        # os.listdir also reports sub-directories (such as 'processed'),
        # which cannot be opened as a deposition.
        continue

    print(dep_name)
    # Rebuilt on every iteration on purpose: the file written below lands in
    # the folder create_gazeette reads, so it feeds the next gazetteer.
    gazeette = create_gazeette()

    try:
        applied = annotate('./depositions_subset', dep_name, gazeette)
    except AttributeError:
        # No `as e` here: binding (and the implicit deletion of) `e` would
        # clobber the notebook-level fixture of the same name.
        # NOTE(review): AttributeError is treated as "no TEI body found", but
        # any other AttributeError raised while annotating is reported the
        # same way - confirm this is intended.
        print(dep_name, 'is empty')
        continue

    # `out_file` rather than `f`, to keep the notebook-level fixture `f` intact.
    with open(os.path.join('./depositions_subset', 'processed', dep_name), 'w') as out_file:
        out_file.write(et.tostring(applied).decode('UTF-8'))

dep_815199r275_tei_(original_normalized_depositions_marked_persons).xml
dep_815199r275_tei_(original_normalized_depositions_marked_persons).xml is empty


12:20 el último revisado manualmente

In [402]:
# Display the current contents of the gazetteer dict.
gazeette

{'Ja: Traill': 'name',
 'G Blundell': 'person',
 'Katherine mc Gilleigh': 'person',
 '09 June 1653': 'date',
 'Patricke mc Gilliegh': 'person',
 'Captain Wooll': 'person',
 'Cormacke mc Guyre': 'person',
 'sunday': 'date',
 'Kath: mc Gillegh': 'person',
 'George Blundell': 'name',
 'Ja Traill': 'name',
 'Catherine mc Gilleigh': 'person',
 'Patrick': 'person',
 'James Traill': 'name',
 'Kath mc Gilleigh': 'person',
 'Captain Wool': 'person',
 'William Hitchcock': 'name',
 'John Sterne': 'name',
 'Alice Hogg': 'person',
 'Murferstowne': 'placeName',
 'October': 'date',
 'William Wolferston': 'person',
 'Dublin': 'placeName',
 'Luke Toole': 'person',
 'Banabye Toole': 'person',
 'Mathew Talbott': 'person',
 'mark': 'supplied',
 '75': 'supplied',
 'Barony of Rathdowne': 'placeName',
 'County of Dublin': 'place',
 'County of Wicklow': 'placeName',
 'Banabas Toole': 'person',
 'Carrogroe': 'placeName',
 'Matthew Talbot': 'person',
 'March': 'date',
 '26th': 'date',
 '1642': 'date',
 'March 1