Understand the code and implement the evaluation metrics.


## Import Libraries

In [1]:
# Brown corpus:
from nltk.corpus import brown

# load a tagger models
from nltk.tag.perceptron import PerceptronTagger
# Naive Bayes MLE 
from nltk.tag.sequential import NgramTagger

# tagset mapping:
from nltk.tag.mapping import map_tag

# plotting:
from matplotlib import pyplot as plt

# you can compare your implementation with these
# evaluation metrics:
from sklearn.metrics import (
    f1_score as _f1_score, 
    precision_score as _precision_score, 
    recall_score as _recall_score,
    accuracy_score as _accuracy_score
)

import numpy as np

## Prepare the training and testing dataset

In [2]:
# Hold out the first `test_train_split` tagged Brown sentences for testing;
# every remaining sentence is used for training.
test_train_split = 500
test_set, train_set = (
    brown.tagged_sents()[:test_train_split],
    brown.tagged_sents()[test_train_split:],
)

## Load or train the classifiers

In [3]:
# load a pre-trained perceptron tagger:
perceptron_tagger = PerceptronTagger()

In [4]:
%%time
# Train count-based n-gram taggers on the training split.
# Unigram tagger: no backoff tagger is supplied.
unigram_tagger = NgramTagger(1, train=train_set)
# Bigram tagger with no backoff tagger, kept to compare against the backed-off variant.
bigram_tagger_nobackoff = NgramTagger(2, train=train_set)
# Bigram tagger that is given the unigram tagger as its backoff.
bigram_tagger = NgramTagger(2, train=train_set, backoff=unigram_tagger)
# Trigram tagger whose backoff is the bigram tagger above (which itself backs off to unigram).
trigram_tagger = NgramTagger(3, train=train_set, backoff=bigram_tagger)

CPU times: user 49.3 s, sys: 2.06 s, total: 51.3 s
Wall time: 1min 8s


In [5]:
# Taggers to evaluate, keyed by the display name used in the result tables.
models = {
    "Perceptron": perceptron_tagger, 
    "Unigram": unigram_tagger, 
    # "Bigram" and "Trigram" are the variants constructed with a backoff tagger.
    "Bigram": bigram_tagger, 
    "Trigram": trigram_tagger, 
    # NOTE(review): this key maps to the bigram tagger built WITHOUT a backoff
    # tagger; "Bigram-backoff" presumably reads as "bigram minus backoff" - confirm
    # the label is not misleading in the printed tables.
    "Bigram-backoff": bigram_tagger_nobackoff, 
}

## Evaluate the models

The test dataset is annotated with the Brown tagset (as are the n-gram taggers trained on it), while the pre-trained perceptron tagger predicts Penn Treebank tags. We don't need that fine degree of granularity, so we map every tag onto the universal tagset.

In [6]:
# Gold-standard labels: flatten the test sentences and map every Brown tag
# onto the universal tagset.
tags_true = [
    map_tag("en-brown", "universal", brown_tag)
    for sentence in test_set
    for _, brown_tag in sentence
]

# Token-only copies of the test sentences, used as input to the taggers.
test_set_sents = [[token for token, _ in sentence] for sentence in test_set]

# Sorted list of the distinct universal tags occurring in the gold standard.
tagset = sorted(set(tags_true))
print(tagset)

['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


### Generate predictions

In [7]:
# Predicted universal tags per model, flattened over all test sentences.
models_preds = {}
for model_name, model in models.items():
    # Only the perceptron's output is mapped from "en-ptb"; every other
    # tagger's output is mapped from "en-brown".
    source_tagset = "en-ptb" if model_name == "Perceptron" else "en-brown"
    tags_pred = [
        map_tag(source_tagset, "universal", predicted_tag)
        for sent in test_set_sents
        for _, predicted_tag in model.tag(sent)
    ]
    models_preds[model_name] = tags_pred

In [8]:
models_preds.keys()

dict_keys(['Perceptron', 'Unigram', 'Bigram', 'Trigram', 'Bigram-backoff'])

In [9]:
tags_pred = models_preds['Unigram']

### Working on formulas

$precision = \frac{\text{tp}}{\text{tp + fp}}$


$recall = \frac{\text{tp}}{\text{tp + fn}}$


$accuracy = \frac{\text{tp + tn}}{\text{tp + fp + fn + tn}}$

$f_1 = \frac{2PR}{P+R}$

In [10]:
y_true, y_pred = tags_true, tags_pred

In [11]:
# Square scaffold with one zero-filled row and one column per tag in `tagset`.
matrix = [[0] * len(tagset) for _ in tagset]

print(tagset)
matrix

['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [12]:
tagset.index('ADJ')

1

In [13]:
tagset

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

In [14]:
# Zero-filled square matrix: one independent row per tag, one column per tag.
matrix = [[0] * len(tagset) for _ in tagset]

# Lookup from tag to its row/column position in `matrix`.
indices = {tag: position for position, tag in enumerate(tagset)}

In [15]:
tagset.index('ADJ')

1

In [16]:
# for genre, doc in testing_data:
#     this_guess = guess(model, doc)

#     matrix[indices[tagset]][indices[this_guess]] += 1

# return(matrix)

In [17]:
def accuracy_score(y_true, y_pred):
    """
    Fraction of positions at which the prediction equals the ground truth.

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.

    Returns a float in [0, 1]. When both inputs are empty there is nothing to
    score and 0.0 is returned instead of dividing by zero.

    Raises ValueError when y_true and y_pred differ in length, because the
    samples can then no longer be paired up position by position.
    """
    total = len(y_true)
    if total != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {total} and {len(y_pred)}"
        )
    if total == 0:
        return 0.0
    number_correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return number_correct / total
    
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _label_counts(y_true, y_pred, labels=None):
    """
    Count true positives, false positives and false negatives per label.

    The data is scanned once. When `labels` is None, the sorted union of the
    labels found in y_true and y_pred is used. Labels must be hashable.

    Returns (labels, tp, fp, fn) where `labels` is a list and the other three
    are dicts keyed by label. Samples whose label is not in `labels` do not
    contribute to that label's counts.

    Raises ValueError when y_true and y_pred differ in length.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    if labels is None:
        labels = sorted(set(y_true) | set(y_pred))
    else:
        labels = list(labels)

    tp = dict.fromkeys(labels, 0)
    fp = dict.fromkeys(labels, 0)
    fn = dict.fromkeys(labels, 0)
    for truth, guess in zip(y_true, y_pred):
        if truth == guess:
            if truth in tp:
                tp[truth] += 1
            continue
        # A wrong prediction is a false positive for the predicted label
        # and a false negative for the true label.
        if guess in fp:
            fp[guess] += 1
        if truth in fn:
            fn[truth] += 1
    return labels, tp, fp, fn


def _precision_recall_f1(y_true, y_pred, labels=None, average=None):
    """
    Compute precision, recall and F1 together from one pass over the data.

    Returns a (precision, recall, f1) triple. Each element is a list with one
    score per label when `average` is None, and a single float for 'micro'
    or 'macro'. Any ratio with a zero denominator is reported as 0.0.

    Raises ValueError for an unsupported `average` or mismatched lengths.
    """
    if average not in (None, 'micro', 'macro'):
        raise ValueError(
            f"average must be None, 'micro' or 'macro', got {average!r}"
        )
    labels, tp, fp, fn = _label_counts(y_true, y_pred, labels)

    if average == 'micro':
        # Micro averaging pools the counts of all labels before dividing.
        tp_all = sum(tp.values())
        fp_all = sum(fp.values())
        fn_all = sum(fn.values())
        return (
            _safe_ratio(tp_all, tp_all + fp_all),
            _safe_ratio(tp_all, tp_all + fn_all),
            _safe_ratio(2 * tp_all, 2 * tp_all + fp_all + fn_all),
        )

    precisions = [_safe_ratio(tp[label], tp[label] + fp[label]) for label in labels]
    recalls = [_safe_ratio(tp[label], tp[label] + fn[label]) for label in labels]
    # 2*tp / (2*tp + fp + fn) equals 2PR / (P + R) wherever the latter is
    # defined, and yields 0.0 instead of 0/0 when P + R == 0.
    f1_scores = [
        _safe_ratio(2 * tp[label], 2 * tp[label] + fp[label] + fn[label])
        for label in labels
    ]

    if average == 'macro':
        # Macro averaging is the unweighted mean of the per-label scores.
        return tuple(
            _safe_ratio(sum(scores), len(scores))
            for scores in (precisions, recalls, f1_scores)
        )
    return precisions, recalls, f1_scores


def precision_score(y_true, y_pred, labels=None, average=None):
    """
    Precision, tp / (tp + fp).

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : list of labels to score, in the order the results are reported.
             Defaults to the sorted labels found in y_true and y_pred.
    average : None, 'micro' or 'macro'.
              None returns a list with one score per label;
              'micro' pools the counts of all labels into a single score;
              'macro' returns the unweighted mean of the per-label scores.
              When `labels` is also given, the average is taken over those labels.

    When true positive + false positive == 0 returns 0

    Raises ValueError for an unsupported `average` or when y_true and y_pred
    differ in length.
    """
    return _precision_recall_f1(y_true, y_pred, labels, average)[0]


def recall_score(y_true, y_pred, labels=None, average=None):
    """
    Recall, tp / (tp + fn).

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : list of labels to score, in the order the results are reported.
             Defaults to the sorted labels found in y_true and y_pred.
    average : None, 'micro' or 'macro'.
              None returns a list with one score per label;
              'micro' pools the counts of all labels into a single score;
              'macro' returns the unweighted mean of the per-label scores.
              When `labels` is also given, the average is taken over those labels.

    When true positive + false negative == 0 returns 0

    Raises ValueError for an unsupported `average` or when y_true and y_pred
    differ in length.
    """
    return _precision_recall_f1(y_true, y_pred, labels, average)[1]


def f1_score(y_true, y_pred, labels=None, average=None):
    """
    F1, the harmonic mean of precision and recall: 2PR / (P + R).

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : list of labels to score, in the order the results are reported.
             Defaults to the sorted labels found in y_true and y_pred.
    average : None, 'micro' or 'macro'.
              None returns a list with one score per label;
              'micro' computes F1 from the pooled counts of all labels;
              'macro' returns the unweighted mean of the per-label F1 scores.
              When `labels` is also given, the average is taken over those labels.

    When precision + recall == 0 returns 0

    Raises ValueError for an unsupported `average` or when y_true and y_pred
    differ in length.
    """
    return _precision_recall_f1(y_true, y_pred, labels, average)[2]
    

def all_metrics(y_true, y_pred, labels=None, average=None):
    """
    Bundle precision, recall, F1 and accuracy for one set of predictions.

    The scores come from the reference implementations imported under the
    underscore-prefixed aliases (_precision_score, _recall_score, _f1_score,
    _accuracy_score). `labels` and `average` are forwarded unchanged to the
    first three; accuracy only receives y_true and y_pred.

    Returns the 4-tuple (precision, recall, f1, accuracy).
    """
    # NOTE: to evaluate the hand-written metrics instead, swap the aliases
    # below for the local functions of the same name without the underscore.
    precision, recall, f1 = (
        metric(y_true, y_pred, labels=labels, average=average)
        for metric in (_precision_score, _recall_score, _f1_score)
    )
    accuracy = _accuracy_score(y_true, y_pred)
    return precision, recall, f1, accuracy
    

In [35]:
# Re-tag the test set with every model and print one summary row per model:
# accuracy plus macro- and micro-averaged precision / recall / F1 (in percent).
models_preds = {}
print(
    "              |       |         macro       |         micro",
    "  model name  |  acc  | preci  recal    f1  | preci  recal    f1",
    "-" * 58,
    sep="\n",
)
for model_name, model in models.items():
    # Only the perceptron's output is mapped from "en-ptb"; the rest from "en-brown".
    source_tagset = "en-ptb" if model_name == "Perceptron" else "en-brown"
    tags_pred = [
        map_tag(source_tagset, "universal", predicted_tag)
        for sent in test_set_sents
        for _, predicted_tag in model.tag(sent)
    ]
    models_preds[model_name] = tags_pred

    # Accuracy is the same under both averaging modes, so the second copy is discarded.
    precision_macro, recall_macro, f1score_macro, accuracy = all_metrics(tags_true, tags_pred, average='macro')
    precision_micro, recall_micro, f1score_micro, _ = all_metrics(tags_true, tags_pred, average='micro')
    print(f"{model_name:14}| {100*accuracy:5.2f} | {100*precision_macro:5.2f}  {100*recall_macro:5.2f}  {100*f1score_macro:5.2f} | {100*precision_micro:5.2f}  {100*recall_micro:5.2f}  {100*f1score_micro:5.2f}")
    
    

              |       |         macro       |         micro
  model name  |  acc  | preci  recal    f1  | preci  recal    f1
----------------------------------------------------------
Perceptron    | 93.66 | 86.17  88.79  87.23 | 93.66  93.66  93.66
Unigram       | 93.30 | 86.11  94.57  86.09 | 93.30  93.30  93.30
Bigram        | 94.40 | 87.36  94.11  86.71 | 94.40  94.40  94.40
Trigram       | 94.54 | 87.53  94.48  86.97 | 94.54  94.54  94.54
Bigram-backoff| 24.63 | 87.72  31.96  36.74 | 24.63  24.63  24.63


In [37]:
prec = precision_score(y_true, y_pred, labels=tagset)

In [43]:
recs = recall_score(y_true, y_pred, labels=tagset)

In [44]:
import numpy as np

In [45]:
p = np.array(prec)
r = np.array(recs)

In [46]:
# Element-wise F1 = 2PR / (P + R) for each tag; despite the name, `macro` holds
# the per-tag F1 values, and only their mean (next cell) is the macro-averaged F1.
# NOTE(review): a tag with P + R == 0 would presumably yield nan here - confirm.
macro = p * r * 2 / (p + r)

In [47]:
np.mean(macro)

0.8608748366962331

In [20]:
# Per-model report: one precision / recall / F1 row (in percent) per tag.
for model_name, tags_pred in models_preds.items():
    # Banner: rule, model name, blank line.
    print('=' * 50, model_name, '', sep='\n')
    precisions, recalls, f1scores, _ = all_metrics(tags_true, tags_pred, labels=tagset)
    print("tag\tprecision\trecall\tf1-score", "-" * 50, sep='\n')
    for tag, precision, recall, f1score in zip(tagset, precisions, recalls, f1scores):
        print(f"{tag}\t{100*precision:9.2f}\t{100*recall:6.2f}\t{100*f1score:8.2f}")
    print('=' * 50)


Perceptron

tag	precision	recall	f1-score
--------------------------------------------------
.	   100.00	 99.76	   99.88
ADJ	    83.18	 80.25	   81.69
ADP	    98.26	 90.60	   94.28
ADV	    89.88	 89.88	   89.88
CONJ	    99.61	 99.61	   99.61
DET	    95.98	 92.34	   94.12
NOUN	    94.26	 97.28	   95.74
NUM	    89.18	 98.10	   93.42
PRON	    72.41	 97.15	   82.98
PRT	    63.31	 77.26	   69.59
VERB	    98.00	 93.24	   95.56
X	    50.00	 50.00	   50.00
Unigram

tag	precision	recall	f1-score
--------------------------------------------------
.	   100.00	100.00	  100.00
ADJ	    90.91	 86.79	   88.80
ADP	    97.30	 91.13	   94.11
ADV	    84.68	 87.20	   85.92
CONJ	    99.61	100.00	   99.80
DET	    99.78	 99.56	   99.67
NOUN	    96.41	 90.59	   93.41
NUM	    99.49	 92.86	   96.06
PRON	    99.64	 97.15	   98.38
PRT	    67.77	 96.39	   79.58
VERB	    96.45	 93.14	   94.76
X	     1.29	100.00	    2.54
Bigram

tag	precision	recall	f1-score
--------------------------------------------------
.	   100

In [None]:
# NOTE(review): this cell repeats the per-tag report loop of the previous code
# cell verbatim and has no execution count; consider deleting one of the two.
# Prints, for every model, one precision / recall / F1 row (in percent) per tag.
for model_name, tags_pred in models_preds.items():
    print('='*50)
    print(model_name)
    print('')
    # Accuracy (fourth element) is not needed for the per-tag table.
    precisions, recalls, f1scores, _ = all_metrics(tags_true, tags_pred, labels=tagset)
    print("tag\tprecision\trecall\tf1-score")
    print("-"*50)
    for tag, precision, recall, f1score in zip(tagset, precisions, recalls, f1scores):
        print(f"{tag}\t{100*precision:9.2f}\t{100*recall:6.2f}\t{100*f1score:8.2f}")
    print('='*50)
