In [1]:
# **************************************************
# 1. Se crea un corpus de frases de ejemplo.
# **************************************************
corpus = ["quiero una pizza margherita",
          "quisiera 1 bocadillo",
          "tomare una pizza carbonara",
          "quiero 3 bocadillos de anchoas y 2 pizzas",
          "quiero tres bocadillos de anchoas y dos pizzas",
          "quiero una hamburguesa con queso",
          "yo quiero dos hamburguesas completas",
          "quiero una ensalada mixta",
          "quiero un pastel de cabracho",
          "quiero una pizza carbonara y una pizza romana",
          "quiero una tostada con tomate",
          "tomare una pizza calzone",
          "tomare un canape de pate",
          "quiero calamares",
          "tomare una hamburguesa sin cebolla",
          "quiero un colacao",
          "quiero un cafe solo",
          "quiero un pollo asado"]

In [2]:
# **************************************************
# 2. Después, se debe entrenar un tagger para el español.
# **************************************************
from nltk.corpus import cess_esp

sents = cess_esp.tagged_sents()

training = []
test = []
for i in range(len(sents)):
    if i % 10:
        training.append(sents[i])
    else:
        test.append(sents[i])
        
from nltk.tag.hmm import HiddenMarkovModelTagger

#hmm tagger
spanish_pos_tagger = HiddenMarkovModelTagger.train(training)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# **************************************************
# 3. Ahora, se debe construir un Regex Parser que detecte comidas y cantidades.
# **************************************************

grammar = r""" 

  comida: {<ncmp000|ncfs000|ncms000|ncfp000>}
  
  cantidad: {<Z|di0fs0|dn0cp0|di0ms0>}
          
"""

import nltk
regex_parser = nltk.RegexpParser(grammar)

In [4]:
# **************************************************
# 4. Se usa el pos tagger y el Regex Parser para obtener las IOB de una nueva frase.
# **************************************************
sentence = "yo queria un gazpacho"

sentence_tokens = nltk.word_tokenize(sentence)

tagged_sentence = spanish_pos_tagger.tag(sentence_tokens)

#print(tagged_sentence)

chunked = regex_parser.parse(tagged_sentence)

#print(chunked)

print ('Acierto con HMMs:', spanish_pos_tagger.evaluate(test)*100)

Acierto con HMMs: 89.88905831011094


In [5]:
# **************************************************
# 5. Uso del pos tagger y el Regex Parser para crear un primer corpus IOB 
# **************************************************
parsedCorpus = []
for sentence in corpus:
    sentence_tokens = nltk.word_tokenize(sentence)

    tagged_sentence = spanish_pos_tagger.tag(sentence_tokens)

    #print(tagged_sentence)

    chunked = regex_parser.parse(tagged_sentence)

    #print(chunked)

    parsedCorpus.append(chunked)

In [6]:
# **************************************************
# 6. Creacion del corpus IOB definitivo manualmente en dos ficheros de entrenamiento y test
# **************************************************
size = int(len(parsedCorpus) * 0.2)

with open('train.txt', 'w') as train:
    for i in range(len(parsedCorpus[size:])):
        parsedSent = nltk.chunk.tree2conlltags(parsedCorpus[i])
        for w,t,c in parsedSent:
            train.write(w + " " + t + " " + c + '\n')
        train.write('\n')

with open('test.txt', 'w') as test:
    for i in range(len(parsedCorpus[:size])):
        parsedSent = nltk.chunk.tree2conlltags(parsedCorpus[i])
        for w,t,c in parsedSent:
            test.write(w + " " + t + " " + c + '\n')
        test.write('\n')

In [7]:
# **************************************************
# 7. Entrenamiento y evaluacion de UnigramTagger
# **************************************************
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

from nltk.corpus import conll2000
test_sents = conll2000.chunked_sents('test.txt')
train_sents = conll2000.chunked_sents('train.txt')

unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  86.5%%
    Precision:     74.3%%
    Recall:        86.4%%
    F-Measure:     79.9%%


In [8]:
# **************************************************
# 8. Entrenamiento y evaluacion de BigramChunker
# **************************************************
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
    
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  89.3%%
    Precision:     81.2%%
    Recall:        86.2%%
    F-Measure:     83.6%%


In [9]:
# **************************************************
# 8. Entrenamiento y evaluacion de NaiveBayesClassifier
# **************************************************
class ConsecutiveNPChunkTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(
            train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

In [10]:
# **************************************************
# Empezamos definiendo un feature extractor simple que solo provee el tag POS del token actual.
# Usando este extractor de features nuestro classifier-based chunker el rendimiento es similar al UnigramChunker.
# **************************************************
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos": pos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  86.5%%
    Precision:     74.3%%
    Recall:        86.4%%
    F-Measure:     79.9%%


In [11]:
# **************************************************
# Podemos añadir un feature para el tag POS previo. Añadir esta feature permite al classifier modelar interacciones
# entre tags adyacentes, y resulta en un chunker que es similar en rendimiento al BigramChunker.
# **************************************************
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  89.1%%
    Precision:     79.7%%
    Recall:        87.7%%
    F-Measure:     83.5%%


In [12]:
# **************************************************
# Ahora intentaremos añadir una feature para la palabra actual, admitiendo que el contenido de la palabra debería ser útil para el chunking.
# Encontramos que esta feature mejora el rendimiento del chunker en cerca de 1.5 puntos porcentuales (lo que corresponde a una reducción de aproximadamente 10% en la tasa de error)
# **************************************************
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "word": word, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  90.8%%
    Precision:     82.7%%
    Recall:        89.4%%
    F-Measure:     85.9%%


In [13]:
# **************************************************
# Finalmente, podemos intentar extender el extractor de funciones con una variedad de características adicionales, 
# como las características de anticipación [1], 
# las características emparejadas [2] 
# y las características contextuales complejas [3]. 
# Esta última característica, llamada tags-since-dt, crea una cadena que describe el conjunto de todas las etiquetas 
# POS que se han encontrado desde el determinante más reciente, 
# o desde el comienzo de la oración si no hay determinante antes del índice i
# **************************************************
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.8%%
    Precision:     86.8%%
    Recall:        91.9%%
    F-Measure:     89.3%%
