In [1]:
#import nltk.NgramTagger as ngramT
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger 
import spacy
import math
import en_core_web_sm
from spacy.tokenizer import Tokenizer
from sklearn.metrics import classification_report


nlp = en_core_web_sm.load()

# Prepare Training & Test Splits as 90%/10%


nltk.download('treebank')

total_size = len(treebank.tagged_sents())
train_indx = math.ceil(total_size * 0.9)
trn_data = treebank.tagged_sents(tagset='universal')[:train_indx]
tst_data = treebank.tagged_sents(tagset='universal')[train_indx:]

print("\033[1mTotal:\033[0m {}; \033[1mTrain:\033[0m {}; \033[1mTest:\033[0m {}".format(total_size, len(trn_data), len(tst_data)))

print("\033[1mTagging with NgramTagger\033[0m")

backoff = DefaultTagger('NN')
ngramTagger = nltk.NgramTagger(1,train=trn_data,cutoff=1,backoff=backoff)

def nlp_accuracy(reference, test):
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    return sum(x == y for x, y in zip(reference, test)) / len(test)

# tagging sentences in test set
for s in treebank.sents()[train_indx:]:
    print("\033[1mINPUT:\033[0m {}".format(s))
    print("\033[1mTAG:\033[0m {}".format(ngramTagger.tag(s)))
    break
    
# evaluation
accuracy = ngramTagger.accuracy(tst_data)

print("\033[1mAccuracy of NgramTagger:\033[0m {:6.4f}".format(accuracy))

print("\033[1mTagging with Spacy\033[0m")
nlp.tokenizer = Tokenizer(nlp.vocab)


for id_sent,sent in enumerate(treebank.sents()[train_indx:]):
    doc = nlp(" ".join(sent))
    break

test = [(t.text, t.pos_) for t in doc]

for s in treebank.sents()[train_indx:]:
    print("\033[1mINPUT:\033[0m {}".format(s))
    reference = ngramTagger.tag(s)
    break
    
print("\033[1mREFERENCE:\033[0m {}".format(reference))
print("\033[1mTAG:\033[0m {}".format(test))

mapping_spacy_to_NLTK = {
    "ADJ": "ADJ",
    "ADP": "ADP",
    "ADV": "ADV",
    "AUX": "VERB",
    "CCONJ": "CONJ",
    "DET": "DET",
    "INTJ": "X",
    "NOUN": "NOUN",
    "NUM": "NUM",
    "PART": "PRT",
    "PRON": "PRON",
    "PROPN": "NOUN",
    "PUNCT": ".",
    "SCONJ": "CONJ",
    "SYM": "X",
    "VERB": "VERB",
    "X": "X"
}

accuracy = nlp_accuracy(reference, test)
print("\033[1mAccuracy of Spacy:\033[0m {:6.4f}".format(accuracy))

# tokens
#print([t.text for t in doc])

# Fine grained POS-tags
#print([t.tag_ for t in doc])

# Coarse POS-tags (from Universal POS Tag set)
#print([t.pos_ for t in doc])

#print (classification_report(correct_labels, predicted_labels))

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\farih\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


[1mTotal:[0m 3914; [1mTrain:[0m 3523; [1mTest:[0m 391
[1mTagging with NgramTagger[0m
[1mINPUT:[0m ['First', 'of', 'America', 'said', '0', 'some', 'of', 'the', 'managers', 'will', 'take', 'other', 'jobs', 'with', 'First', 'of', 'America', '.']
[1mTAG:[0m [('First', 'NOUN'), ('of', 'ADP'), ('America', 'NOUN'), ('said', 'VERB'), ('0', 'X'), ('some', 'DET'), ('of', 'ADP'), ('the', 'DET'), ('managers', 'NOUN'), ('will', 'VERB'), ('take', 'VERB'), ('other', 'ADJ'), ('jobs', 'NOUN'), ('with', 'ADP'), ('First', 'NOUN'), ('of', 'ADP'), ('America', 'NOUN'), ('.', '.')]
[1mAccuracy of NgramTagger:[0m 0.8471
[1mTagging with Spacy[0m
[1mINPUT:[0m ['First', 'of', 'America', 'said', '0', 'some', 'of', 'the', 'managers', 'will', 'take', 'other', 'jobs', 'with', 'First', 'of', 'America', '.']
[1mREFERENCE:[0m [('First', 'NOUN'), ('of', 'ADP'), ('America', 'NOUN'), ('said', 'VERB'), ('0', 'X'), ('some', 'DET'), ('of', 'ADP'), ('the', 'DET'), ('managers', 'NOUN'), ('will', 'VERB'), ('