# PROVIDEDH Collaborative platform
## Jupyter notebook

In [1]:
from apps.files_management.models import File, FileVersion, Directory
from apps.projects.models import Project

In [2]:
from django.db.models import Q

In [3]:
from lxml import etree as et
from lxml.etree import Element
import re

In [4]:
import nltk
import spacy
import en_core_web_md
sp_nlp_en = en_core_web_md.load()

## 1 File retrieval

In [5]:
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}

In [6]:
project = Project.objects.get(title__exact='ner')
files_query = Q(project=project)

In [7]:
files = File.objects.filter(files_query)

In [10]:
v = files[0].versions.all()[1]

In [11]:
v

<FileVersion: FileVersion object (71)>

## 2 File processing

In [12]:
f = files[0]
f_download = f.download()
f_content = f_download.getvalue().decode('UTF-8')

In [325]:
f_et = et.XML(f_content)

body = f_et.xpath('.//tei:body', namespaces=namespaces)[0]

body_text = body.xpath('.//text()')

withtextnodes = lambda x: len(x.xpath('text()')) > 0
notemptyline = lambda text: len(text.strip()) > 0
withnotemptylines = lambda node: any(map(notemptyline, node.xpath('text()')))

nodes_filtered = filter(withnotemptylines, filter(withtextnodes, body.iter()))
text_nodes = list(map(lambda x: list(filter(notemptyline, x.xpath('text()'))), nodes_filtered))

body_text_filtered = ' '.join(map(lambda x: ' '.join(x), text_nodes))

## 3 Text tagging

### spacy en_core_web_md

In [326]:
body_sp = sp_nlp_en(str(body_text_filtered))

In [327]:
bulletproof = lambda text: '( |<.*>|)+'.join(text.strip().replace(' ',''))
assert len(re.findall(bulletproof('12th of'),r'12th <del rend="doublestrikethrough">of')) == 1
assert len(re.findall(bulletproof('12th of'),r'12th<del rend="doublestrikethrough">of')) == 1

In [328]:
body_sp.ents[2].text

'Dublin'

In [324]:
entities

[&   , xd;London.#xd;#xd, 1641, Moore James Ware]

In [329]:
entities = list([e for e in body_sp.ents]) 

for text_node in text_nodes:
    for fragment in text_node:
        for entity in entities:
            if entity.text in fragment:
                print(fragment)
                print(entity.text)
                print(entity.label_)
                print('...........................................')
                wrap_text_in_tag(fragment, entity.text, entity.label_)
                entities.pop(0)

261 to 268
261
CARDINAL
...........................................
 of Dublin Alderman taken the 12th of November 
Dublin
GPE
...........................................
 of Dublin Alderman taken the 12th of November 
12
CARDINAL
...........................................
 of Dublin Alderman taken the 12th of November 
Dublin
GPE
...........................................
 the clerk of the Councell by direction of the board
Councell
PERSON
...........................................
 The said examinate saith that about July Last he being then in London
                    
London
GPE
...........................................
 The said examinate saith that about July Last he being then in London
                    
London
GPE
...........................................
 The said examinate saith that about July Last he being then in London
                    
about July Last
DATE
...........................................
 an apothecary in
                    Fleetestreete where 

In [None]:
fragment = re.search(bulletproof(ent.text), text)[0]
    print(fragment)
    parts = text.partition(fragment)
    tagged += parts[0] + f'<{ent.label_} class="proposed" model="spacy_en_core_web_md" count="{i}">{parts[1]}</{ent.label_}>'
    text = parts[2]
tagged += text

In [None]:
def wrap_text_in_tag(text,substring,tag_name):
    text_node = text.getparent()
    partitions = text.partition(substring)
    text_node.tail = partitions[0]
    
    newElement = et.Element(tag_name)
    newElement.text = partitions[1]
    newElement.tail = partitions[2]
    
    index = text_node.getparent().index(text_node) + 1
    text_node.getparent().insert(index,newElement)

In [292]:
body_sp.ents[i].text in text_nodes[1][0]

True

In [70]:
et.fromstring(tagged)

XMLSyntaxError: Unescaped '<' not allowed in attributes values, line 11, column 36 (<string>, line 11)

In [32]:
body.getparent().replace(body, et.fromstring(tagged))

In [49]:
body_sp.ents[0].text

'Trinity College Dublin Dublin'

In [53]:
body_text_filtered

'Transcribed Version of Deposition\n                 Edited Edited Trinity College Dublin Dublin, Ireland Ireland Edward Lake Dublin Dublin Ireland Alderman Thomas Hicks Doctor Moore Ja: Ware 261 to 268 The examination of  Edward Lake  of Dublin Alderman taken the 12th of November  1641 : upon oath  before ministred by  the clerk of the Councell by direction of the board A\n                      The said examinate saith that about July Last he being then in London\n                     and supping at and usually resorting to  the house of  Thomas Hicks  an apothecary in\n                    Fleetestreete where one  Doctor Moore  usually  lodged the examinate\n                    & his wife one night being at Supper with the said  Thomas Hicks . the said  Doctor Moore  being\n                    then likewise at Supper with them asked him why he did not Live in England seing that he had given\n                    over  B  his trading. To which the examinate answered that he chose\n     

In [243]:
print(et.tostring(body).decode('UTF-8'))

<body xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude">
      <div type="deposition">
        <closer>
          <signed>
            <roleName type="Commissioner"/>
            <name>Ja: Ware</name>
          </signed>
        </closer>
      </div>
      <div type="original">
        <pb n="fol. 164r" pagenum="261"/>
        <p><lb/>261 to 268<lb/>The examination of <person sameAs="person809164r087-1">Edward Lake</person> of Dublin Alderman taken the 12th of November <del rend="doublestrikethrough">1641</del>: upon oath <del rend="strikethrough">before</del>
                    <add place="inline">ministred by</add> the clerk of the Councell by direction of the board<lb/><note type="marginalia">A
                    </note> The said examinate saith that about July Last he being then in London
                    <del rend="strikethrough"><add place="inline">and supping</add></del>
                    <del rend="strikethrough">at</del>
                   

In [254]:
node = text_nodes[1]

In [330]:
print(et.tostring(body).decode('UTF-8'))

<body xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude">
      <div type="deposition">
        <closer>
          <signed>
            <roleName type="Commissioner"/>
            <name>Ja: Ware</name>
          </signed>
        </closer>
      </div>
      <div type="original">
        <pb n="fol. 164r" pagenum="261"/>
        <p><lb/><CARDINAL>261</CARDINAL> to 268<lb/>The examination of <person sameAs="person809164r087-1">Edward Lake</person><LOC>Edward Lake</LOC><GPE>Dublin</GPE> Alderman taken the 12th of November <CARDINAL>12</CARDINAL>th of November <GPE>Dublin</GPE> Alderman taken the 12th of November <del rend="doublestrikethrough">1641</del><CARDINAL>1641</CARDINAL><del rend="strikethrough">before</del>
                    <add place="inline">ministred by</add> the clerk of the <PERSON>Councell</PERSON> by direction of the board<lb/><note type="marginalia">A
                    </note> The said examinate saith that <DATE>about July Last</DATE> he 

In [277]:
node[1][2]

' of Dublin Alderman taken the 12th of November '

In [281]:
wrap_text_in_tag(node[1][2],'Dublin','place')

In [280]:
def wrap_text_in_tag(text,substring,tag_name):
    text_node = text.getparent()
    partitions = text.partition(substring)
    text_node.tail = partitions[0]
    
    newElement = et.Element(tag_name)
    newElement.text = partitions[1]
    newElement.tail = partitions[2]
    
    index = text_node.getparent().index(text_node) + 1
    text_node.getparent().insert(index,newElement)

In [256]:
text = node[1][0]
text_node = node[1][0].getparent()
partitions = text.partition('1 ato 2')
print(partitions)
text_node.tail = partitions[0]

('26', '1 to 2', '68')


In [258]:
newElement = et.Element('a')
newElement.text = partitions[1]
newElement.tail = partitions[2]

0

In [266]:
index = text_node.getparent().index(text_node) + 1
text_node.getparent().insert(index,newElement)
node[1][1]

'The examination of '

In [187]:
newElement.tail = 'ewfdsx'

In [331]:
et.tostring(f_et)

b'<TEI xmlns:xi="http://www.w3.org/2001/XInclude" xmlns="http://www.tei-c.org/ns/1.0">\n  <!--<TEI>-->\n  <teiHeader>\n    <fileDesc>\n      <titleStmt>\n        <title>\n\n                </title>\n      </titleStmt>\n      <editionStmt>\n        <edition>Transcribed Version of Deposition\n                </edition>\n        <respStmt>\n          <resp xmlns="">Edited</resp>\n          <include xmlns="" href="responsiblePeople.xml" xpointer="RP2"/>\n        </respStmt>\n        <respStmt>\n          <resp xmlns="">Edited</resp>\n          <include xmlns="" href="responsiblePeople.xml" xpointer="RP4"/>\n        </respStmt>\n      </editionStmt>\n      <publicationStmt>\n        <publisher>Trinity College Dublin</publisher>\n        <pubPlace>Dublin, Ireland</pubPlace>\n      </publicationStmt>\n      <sourceDesc>\n        <include href="manuscriptDescription_tei.xml" xpointer="ms809"/>\n      </sourceDesc>\n    </fileDesc>\n    <encodingDesc>\n      <charDecl>\n            </charDecl>\