## Part of speech (POS) Tagger

This notebook tags the words in each witness text with their part of speech (POS) and highlights the verbs, nouns and adjectives that also occur inside an edit (`add`, `del` or `subst` element) of the XML transcription.

#### 1. Import text files to tag

In [196]:
# Read the plain-text transcription of each witness into a single string.
with open("datasets/clean-data/ms-aladin-witness1.txt", "r") as witness1_file:
    witness1text = witness1_file.read()

with open("datasets/clean-data/ms-aladin-witness2.txt", "r") as witness2_file:
    witness2text = witness2_file.read()

#### 2. Do POS tagging

In [197]:
import spacy
from spacy import displacy
from spacy_transformers import Transformer, TransformerModel
from spacy_transformers.annotation_setters import null_annotation_setter
from spacy_transformers.span_getters import get_doc_spans
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import json

# Parse the simplified XML transcription of the manuscript.
with open("datasets/clean-data/ms-aladin-simplified.xml", "r") as xml_file:
    soup = BeautifulSoup(xml_file, features="lxml-xml")

# Keep only the elements named 'add', 'del' or 'subst'; every other element
# is irrelevant for the edit lookup performed later in the notebook.
tags = soup.find_all()
tags_minus_irr = [
    element for element in tags
    if element.name in ['add', 'del', 'subst']
]

# Run the Dutch spaCy pipeline over both witness texts.
nlp = spacy.load("nl_core_news_sm")
doc_w1, doc_w2 = nlp(witness1text), nlp(witness2text)

  from IPython.core.display import display, HTML


#### 3. Display the result

In [198]:
# options = {"compact": True, "bg": "#ffffff",
#            "color": "black", "font": "Source Sans Pro"}
# displacy.render(doc_w1.sents, style="dep", options=options)

In [199]:
# options = {"compact": True, "bg": "#ffffff",
#            "color": "black", "font": "Source Sans Pro"}
# displacy.render(doc_w2.sents, style="dep", options=options)

In [200]:
def is_in_edit(word, edit_tags=None):
    """Report whether ``word`` occurs in the text of any edit element.

    The text of each element is split on whitespace and ``word`` has to equal
    one of the resulting items exactly, so the match is sensitive to case and
    to punctuation attached to a word.

    Parameters
    ----------
    word : str
        The word to look up.
    edit_tags : iterable, optional
        Objects exposing a ``.text`` string. When omitted, the notebook-level
        ``tags_minus_irr`` list is used.

    Returns
    -------
    bool
        ``True`` if at least one element contains ``word``, else ``False``.
    """
    if edit_tags is None:
        # The notebook-level list is only read, never rebound, so no
        # ``global`` declaration is required.
        edit_tags = tags_minus_irr
    return any(word in tag.text.split() for tag in edit_tags)

In [201]:
def transform_to_html(nlp_doc, color="#DFF9FF", is_edited=None):
    """Render a parsed document as an HTML page with edited words highlighted.

    Every sentence becomes its own ``<p>`` element. A token is wrapped in a
    coloured ``<span>`` when its part of speech is VERB, NOUN or ADJ *and*
    ``is_edited`` accepts its text; all other tokens are emitted as plain
    text. Each token is followed by a single space.

    Parameters
    ----------
    nlp_doc : object
        Anything exposing ``.sents``, an iterable of sentences, each an
        iterable of tokens with ``.text`` and ``.pos_`` attributes.
    color : str, optional
        CSS colour used as the page background (include the leading ``#``
        for hex values).
    is_edited : callable, optional
        Predicate taking a token's text and returning a truthy value when the
        token should be highlighted. Defaults to the notebook's
        ``is_in_edit``.

    Returns
    -------
    str
        The complete HTML document.
    """
    # Imported locally (and the accumulator is not called ``html``) so the
    # standard-library module is not shadowed.
    from html import escape

    if is_edited is None:
        is_edited = is_in_edit

    highlight_by_pos = {
        "VERB": "#33FFB5",
        "NOUN": "#FFD433",
        "ADJ": "#B495FF",
    }

    parts = [
        "<!DOCTYPE html>\n<html>",
        # The semicolon belongs inside the quoted style value.
        '<body style="background-color:' + color + ';">',
    ]
    for sentence in nlp_doc.sents:
        parts.append("<p>")
        for token in sentence:
            # Escape so tokens such as '<' or '&' cannot break the markup.
            text = escape(token.text)
            highlight = highlight_by_pos.get(token.pos_)
            # The predicate is only consulted for highlightable POS tags.
            if highlight is not None and is_edited(token.text):
                parts.append(
                    '<span style="background-color:' + highlight + '">'
                    + text + '</span> '
                )
            else:
                parts.append(text + " ")
        # Close the paragraph per sentence so <p> and </p> stay balanced.
        parts.append("</p>")
    parts.append("</body>")
    parts.append("</html>")
    return "".join(parts)
        

# Render each witness as an HTML page; witness 1 keeps the default background.
html_w1 = transform_to_html(doc_w1)
# CSS hex colours need the leading '#', otherwise the declaration is invalid.
html_w2 = transform_to_html(doc_w2, color="#E5FFEE")

In [202]:
# Save both rendered pages to the clean-data folder.
for page_path, page_markup in (
    ("datasets/clean-data/ms-aladin-witness1.html", html_w1),
    ("datasets/clean-data/ms-aladin-witness2.html", html_w2),
):
    with open(page_path, "w") as outfile:
        outfile.write(page_markup)

In [203]:
# display(HTML(html_w1))

In [204]:
# display(HTML(html_w2))