Understand the code and implement the evaluation metrics.


## Import Libraries

In [None]:
# Brown corpus:
from nltk.corpus import brown

# load the tagger models
from nltk.tag.perceptron import PerceptronTagger
# Naive Bayes MLE 
from nltk.tag.sequential import NgramTagger

# tagset mapping:
from nltk.tag.mapping import map_tag

# plotting:
from matplotlib import pyplot as plt

# you can compare your implementation with these
# evaluation metrics:
from sklearn.metrics import (
    f1_score as _f1_score, 
    precision_score as _precision_score, 
    recall_score as _recall_score,
    accuracy_score as _accuracy_score
)

import numpy as np

## Prepare the training and testing dataset

In [None]:
# Hold out the first `test_train_split` tagged sentences of the Brown corpus
# for testing; every remaining sentence is used for training.
test_train_split = 500
brown_tagged_sents = brown.tagged_sents()
test_set = brown_tagged_sents[:test_train_split]
train_set = brown_tagged_sents[test_train_split:]

## Load or train the classifiers

In [None]:
# load a pre-trained perceptron tagger:
# (constructed with default arguments, so train_set is not used for this model)
perceptron_tagger = PerceptronTagger()

In [None]:
%%time
# train Naive Bayes / count-based ngram taggers:
# Each tagger is trained on train_set. The taggers form a backoff chain:
#   trigram_tagger -> bigram_tagger -> unigram_tagger
# bigram_tagger_nobackoff is the same order-2 tagger but without any backoff,
# so it can be compared against bigram_tagger.
unigram_tagger = NgramTagger(1, train=train_set)
bigram_tagger_nobackoff = NgramTagger(2, train=train_set)
bigram_tagger = NgramTagger(2, train=train_set, backoff=unigram_tagger)
trigram_tagger = NgramTagger(3, train=train_set, backoff=bigram_tagger)

In [None]:
# Display name -> tagger instance, used for the evaluation tables.
# "Bigram" and "Trigram" are the taggers WITH backoff.
# NOTE(review): "Bigram-backoff" refers to bigram_tagger_nobackoff, i.e. it
# presumably means "bigram minus backoff"; the label can be misread as
# "bigram with backoff" -- confirm and consider renaming.
models = {
    "Perceptron": perceptron_tagger, 
    "Unigram": unigram_tagger, 
    "Bigram": bigram_tagger, 
    "Trigram": trigram_tagger, 
    "Bigram-backoff": bigram_tagger_nobackoff, 
}

## Evaluate the models

The test dataset is annotated with the Brown tagset, and the predictions of the pre-trained perceptron tagger are treated as English Penn TreeBank tags. However, we don't need that fine degree of granularity. Therefore, we map each tag onto the universal tagset.

In [None]:
# Gold labels: every token's tag in the test set, flattened into one list and
# mapped from the "en-brown" tagset onto the universal tagset.
tags_true = [
    map_tag("en-brown", "universal", gold_tag)
    for sentence in test_set
    for _token, gold_tag in sentence
]

# Untagged test sentences (words only), used as input for the taggers.
test_set_sents = [
    [token for token, _gold_tag in sentence]
    for sentence in test_set
]

# Sorted list of the distinct universal tags that occur in the gold labels.
tagset = sorted(set(tags_true))
print(tagset)

$precision = \frac{\text{tp}}{\text{tp + fp}}$


$recall = \frac{\text{tp}}{\text{tp + fn}}$


$accuracy = \frac{\text{tp + tn}}{\text{tp + fp + fn + tn}}$

In [None]:
def accuracy_score(y_true, y_pred):
    """
    Return the fraction of positions where the prediction equals the truth.

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.

    Returns a float in [0, 1]. Empty input returns 0.0 instead of dividing
    by zero.

    Raises ValueError when y_true and y_pred differ in length.
    """
    total = len(y_true)
    if total != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {total} and {len(y_pred)})"
        )
    if total == 0:
        # Nothing to score; avoid ZeroDivisionError.
        return 0.0
    number_correct = sum(
        1 for truth, guess in zip(y_true, y_pred) if truth == guess
    )
    return number_correct / total
    
def precision_score(y_true, y_pred, labels=None, average=None):
    """
    Compute precision = tp / (tp + fp), per label or averaged over labels.

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : list of unique labels. When given, a list with one precision
        per label is returned, in the order of `labels`, and `average` is
        not consulted.
    average : None, 'micro' or 'macro'; used only when `labels` is None.
        The label set is then every label occurring in y_true or y_pred,
        in sorted order.
        None    -> list with one precision per label
        'micro' -> sum(tp) / (sum(tp) + sum(fp)) over all labels
        'macro' -> unweighted mean of the per-label precisions

    When true positive + false positive == 0 (the label is never predicted)
    the precision of that label is 0. Averages over empty input are 0 too.

    Raises ValueError when y_true and y_pred differ in length, or when
    `labels` is None and `average` is not one of None, 'micro', 'macro'.
    """
    from collections import Counter

    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )
    if labels is None and average not in (None, 'micro', 'macro'):
        raise ValueError(f"unsupported average: {average!r}")

    # Use the caller's labels when given; otherwise derive them from the
    # data instead of relying on a module-level variable.
    if labels is not None:
        label_list = list(labels)
    else:
        label_list = sorted(set(y_true) | set(y_pred))

    # One pass over the data: how often each label was predicted, and how
    # often that prediction was correct.
    predicted_counts = Counter(y_pred)
    tp_counts = Counter(
        guess for truth, guess in zip(y_true, y_pred) if truth == guess
    )

    tps = [tp_counts[label] for label in label_list]
    fps = [predicted_counts[label] - tp_counts[label] for label in label_list]
    precisions = [
        tp / (tp + fp) if tp + fp else 0.0
        for tp, fp in zip(tps, fps)
    ]

    if labels is not None or average is None:
        return precisions
    if average == 'micro':
        predicted_total = sum(tps) + sum(fps)
        return sum(tps) / predicted_total if predicted_total else 0.0
    # average == 'macro'
    return sum(precisions) / len(precisions) if precisions else 0.0

def recall_score(y_true, y_pred, labels=None, average=None):
    """
    Compute recall = tp / (tp + fn), per label or averaged over labels.

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : list of unique labels. When given, a list with one recall per
        label is returned, in the order of `labels`, and `average` is not
        consulted.
    average : None, 'micro' or 'macro'; used only when `labels` is None.
        The label set is then every label occurring in y_true or y_pred,
        in sorted order.
        None    -> list with one recall per label
        'micro' -> sum(tp) / (sum(tp) + sum(fn)) over all labels
        'macro' -> unweighted mean of the per-label recalls

    When true positive + false negative == 0 (the label never occurs in
    y_true) the recall of that label is 0. Averages over empty input are 0
    too.

    Raises ValueError when y_true and y_pred differ in length, or when
    `labels` is None and `average` is not one of None, 'micro', 'macro'.
    """
    from collections import Counter

    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )
    if labels is None and average not in (None, 'micro', 'macro'):
        raise ValueError(f"unsupported average: {average!r}")

    # Use the caller's labels when given; otherwise derive them from the
    # data instead of relying on a module-level variable.
    if labels is not None:
        label_list = list(labels)
    else:
        label_list = sorted(set(y_true) | set(y_pred))

    # One pass over the data: how often each label truly occurs, and how
    # often it was recovered by the prediction.
    actual_counts = Counter(y_true)
    tp_counts = Counter(
        truth for truth, guess in zip(y_true, y_pred) if truth == guess
    )

    tps = [tp_counts[label] for label in label_list]
    fns = [actual_counts[label] - tp_counts[label] for label in label_list]
    recalls = [
        tp / (tp + fn) if tp + fn else 0.0
        for tp, fn in zip(tps, fns)
    ]

    if labels is not None or average is None:
        return recalls
    if average == 'micro':
        actual_total = sum(tps) + sum(fns)
        return sum(tps) / actual_total if actual_total else 0.0
    # average == 'macro'
    return sum(recalls) / len(recalls) if recalls else 0.0


def f1_score(y_true, y_pred, labels=None, average=None):
    """
    Compute F1 = 2 * p * r / (p + r), per label or averaged over labels.

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : list of unique labels. When given, a list with one F1 score per
        label is returned, in the order of `labels`, and `average` is not
        consulted.
    average : None, 'micro' or 'macro'; used only when `labels` is None.
        None    -> list with one F1 score per label
        'micro' -> harmonic mean of micro precision and micro recall
        'macro' -> unweighted mean of the per-label F1 scores (this is NOT
                   the harmonic mean of macro precision and macro recall)

    When precision + recall == 0 the F1 score is 0.

    Raises ValueError when `labels` is None and `average` is not one of
    None, 'micro', 'macro'.
    """
    def harmonic_mean(p, r):
        # Guard the degenerate case where the label is neither predicted
        # correctly nor recovered at all.
        return (2 * p * r) / (p + r) if p + r else 0.0

    if labels is None and average not in (None, 'micro', 'macro'):
        raise ValueError(f"unsupported average: {average!r}")

    if labels is None and average == 'micro':
        p = precision_score(y_true, y_pred, average='micro')
        r = recall_score(y_true, y_pred, average='micro')
        return harmonic_mean(p, r)

    # Every remaining case is built from per-label precision and recall.
    if labels is not None:
        label_list = list(labels)
    else:
        label_list = sorted(set(y_true) | set(y_pred))
    precisions = precision_score(y_true, y_pred, labels=label_list)
    recalls = recall_score(y_true, y_pred, labels=label_list)
    f1_per_label = [harmonic_mean(p, r) for p, r in zip(precisions, recalls)]

    if labels is not None or average is None:
        return f1_per_label
    # average == 'macro': average the per-label F1 scores.
    return sum(f1_per_label) / len(f1_per_label) if f1_per_label else 0.0
        

def all_metrics(y_true, y_pred, labels=None, average=None):
    """
    Evaluate one set of predictions with all four metrics.

    y_true : 1d array-like Ground truth (correct) target values.
    y_pred : 1d array-like Estimated targets as returned by a classifier.
    labels : forwarded unchanged to precision_score, recall_score, f1_score.
    average : forwarded unchanged to precision_score, recall_score, f1_score.

    Returns the tuple (precision, recall, f1, accuracy).
    """
    # To cross-check the implementation, swap in the sklearn counterparts
    # imported as _precision_score, _recall_score, _f1_score and
    # _accuracy_score; they take the same arguments.
    shared_options = {"labels": labels, "average": average}
    precision = precision_score(y_true, y_pred, **shared_options)
    recall = recall_score(y_true, y_pred, **shared_options)
    f1 = f1_score(y_true, y_pred, **shared_options)
    accuracy = accuracy_score(y_true, y_pred)
    return (precision, recall, f1, accuracy)
    

In [None]:
# Tag the test sentences with every model, cache the predictions (mapped onto
# the universal tagset) in models_preds, and print one summary row per model.
models_preds = {}
print("              |       |         macro       |         micro")
print("  model name  |  acc  | preci  recal    f1  | preci  recal    f1")
print("-"*58)
for model_name, model in models.items():
    # Perceptron output is mapped from the "en-ptb" tagset; the output of
    # every other model is mapped from "en-brown".
    source_tagset = "en-ptb" if model_name == "Perceptron" else "en-brown"
    tags_pred = [
        map_tag(source_tagset, "universal", tag)
        for sent in test_set_sents
        for word, tag in model.tag(sent)
    ]
    models_preds[model_name] = tags_pred

    # Accuracy does not depend on the averaging mode, so it is taken from
    # the macro call only.
    macro_scores = all_metrics(tags_true, tags_pred, average='macro')
    micro_scores = all_metrics(tags_true, tags_pred, average='micro')
    precision_macro, recall_macro, f1score_macro, accuracy = macro_scores
    precision_micro, recall_micro, f1score_micro, _ = micro_scores

    summary_row = f"{model_name:14}| {100*accuracy:5.2f} | {100*precision_macro:5.2f}  {100*recall_macro:5.2f}  {100*f1score_macro:5.2f} | {100*precision_micro:5.2f}  {100*recall_micro:5.2f}  {100*f1score_micro:5.2f}"
    print(summary_row)
    
    

In [None]:
# Per-tag breakdown: for each model, print precision / recall / F1 for every
# tag in tagset, reusing the predictions cached in models_preds.
section_rule = '='*50
for model_name, tags_pred in models_preds.items():
    print(section_rule)
    print(model_name)
    print('')
    precisions, recalls, f1scores, _ = all_metrics(tags_true, tags_pred, labels=tagset)
    print("tag\tprecision\trecall\tf1-score")
    print("-"*50)
    per_tag_rows = zip(tagset, precisions, recalls, f1scores)
    for tag, precision, recall, f1score in per_tag_rows:
        print(f"{tag}\t{100*precision:9.2f}\t{100*recall:6.2f}\t{100*f1score:8.2f}")
    print(section_rule)
