# Ejercicio 3: Desarrollar y evaluar un Chunker utilizando el corpus ConLL2000 del NLTK

## Load and Preprocess

In [1]:
# Importar y trabajar con el corpus conll2000


import nltk

# nltk.download('conll2000')
from nltk.corpus import conll2000

conll_train = conll2000.chunked_sents("train.txt")
conll_test = conll2000.chunked_sents("test.txt")
print(conll_train[0])
print("-------")
print(conll_test[0])

import nltk.chunk

train_chunks = [nltk.chunk.tree2conlltags(tree) for tree in conll_train]
test_chunks = [nltk.chunk.tree2conlltags(tree) for tree in conll_test]
print(test_chunks[0])


train = [[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in train_chunks]
test = [[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in test_chunks]
print("----")
print(train[0])
print("====")
print(test[0])

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)
-------
(S
  (NP Rockwell/NNP International/NNP Corp./NNP)
  (NP 's/POS Tulsa/NNP unit/NN)
  (VP said/VBD)
  (NP it/PRP)
  (VP signed/VBD)
  (NP a/DT tentative/JJ agreement/NN)
  (VP extending/VBG)
  (NP its/PRP$ contract/NN)
  (PP with/IN)
  (NP Boeing/NNP Co./NNP)
  (VP to/TO provide/VB)
  (NP structural/JJ parts/NNS)
  (PP for/IN)
  (NP Boeing/NNP)
  (NP 's/POS 747/CD jetliners/NNS)
  ./.)
[('Rockwell', 'NNP', 'B-NP'), ('International', 'NNP', 'I-NP'), ('Corp.', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('Tulsa', 'NNP', 'I-NP'), ('unit'

## Train Model

In [9]:
from nltk.tag import hmm
from nltk.tag import tnt
from conlleval import evaluate

In [23]:
hmm_model = hmm.HiddenMarkovModelTagger.train(train)
tnt_model = tnt.TnT()
tnt_model.train(train)
u_chunker = nltk.tag.UnigramTagger(train)

## Tag Test

In [42]:
hmm_labels = [hmm_model.tag([t for (t, c) in test_sent]) for test_sent in test]
tnt_labels = [tnt_model.tag([t for (t, c) in test_sent]) for test_sent in test]
u_labels = [u_chunker.tag([t for (t, c) in test_sent]) for test_sent in test]
true_labels = [[c for (t, c) in test_sent] for test_sent in test]

## Evaluation

### Process Tags

In [44]:
formated_true_labels = []
for sent_labels in true_labels:
    for label in sent_labels:
        formated_true_labels.append(label)
    formated_true_labels.append("O")

In [45]:
def parse_output(predicted_labels):
    pred_tags = []
    for sent in predicted_labels:
        for _, label in sent:
            pred_tags.append(label)
        pred_tags.append("O")
    return pred_tags

### Results

In [46]:
print("HMM")
evaluate(parse_output(hmm_labels), formated_true_labels, True)

HMM
processed 49389 tokens with 22758 phrases; found: 21891 phrases; correct: 19008.
accuracy:  90.02%; (non-O)
accuracy:  90.55%; precision:  86.83%; recall:  83.52%; FB1:  85.14
               NP: precision:  86.45%; recall:  83.76%; FB1:  85.08  12422
               PP: precision:  92.16%; recall:  83.50%; FB1:  87.62  4811
               VP: precision:  82.33%; recall:  82.88%; FB1:  82.61  4658


(86.83020419350417, 83.52227788030582, 85.14412416851442)

In [47]:
print("TnT")
evaluate(parse_output(tnt_labels), formated_true_labels, True)

TnT
processed 49389 tokens with 22814 phrases; found: 21891 phrases; correct: 19112.
accuracy:  89.35%; (non-O)
accuracy:  89.98%; precision:  87.31%; recall:  83.77%; FB1:  85.50
               NP: precision:  86.66%; recall:  84.03%; FB1:  85.32  12422
               PP: precision:  92.45%; recall:  83.41%; FB1:  87.70  4811
               VP: precision:  83.71%; recall:  83.49%; FB1:  83.60  4658


(87.3052852770545, 83.77312176733585, 85.50274018566157)

In [48]:
print("Unigram")
evaluate(parse_output(u_labels), formated_true_labels, True)

Unigram
processed 49389 tokens with 25460 phrases; found: 21891 phrases; correct: 18909.
accuracy:  76.15%; (non-O)
accuracy:  78.78%; precision:  86.38%; recall:  74.27%; FB1:  79.87
               NP: precision:  86.80%; recall:  79.87%; FB1:  83.19  12422
               PP: precision:  97.07%; recall:  74.73%; FB1:  84.45  4811
               VP: precision:  74.22%; recall:  60.53%; FB1:  66.68  4658


(86.37796354666301, 74.26944226237235, 79.86737344512261)