In [1]:
# --------------------------------------------------
# 1. Build a corpus of example sentences (food orders written in Spanish).
# --------------------------------------------------
corpus = [
    "quiero una pizza margherita",
    "quisiera 1 bocadillo de tortilla",
    "tomare una pizza carbonara",
    "quiero 3 bocadillos de anchoas y 2 pizzas",
    "quiero tres bocadillos de anchoas y dos pizzas",
    "quiero una hamburguesa con queso",
    "yo quiero dos hamburguesas completas",
    "quiero una ensalada mixta",
    "quiero un pastel de cabracho",
    "quiero una pizza carbonara y una pizza romana",
    "quiero una tostada con tomate",
    "tomare una pizza calzone",
    "tomare un canape de pate",
    "quiero calamares en su tinta",
    "tomare una hamburguesa sin cebolla",
    "quiero un colacao",
    "quiero un cafe con leche",
    "quiero un pollo asado",
]

In [2]:
# --------------------------------------------------
# 2. Train a part-of-speech tagger for Spanish.
# --------------------------------------------------
from nltk.corpus import cess_esp
from nltk.tag.hmm import HiddenMarkovModelTagger

sents = cess_esp.tagged_sents()

# Hold out every tenth sentence (indices 0, 10, 20, ...) for testing;
# all remaining sentences go to the training split.
training = []
test = []
for i, tagged_sent in enumerate(sents):
    split = test if i % 10 == 0 else training
    split.append(tagged_sent)

# HMM tagger fitted on the training split only.
spanish_pos_tagger = HiddenMarkovModelTagger.train(training)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# --------------------------------------------------
# 3. Build a RegexpParser that detects foods and quantities.
# --------------------------------------------------
import nltk

# Chunk grammar matched against the POS tags emitted by spanish_pos_tagger.
# NOTE(review): the tag meanings below assume the EAGLES-style tagset used by
# the cess_esp corpus -- confirm against the corpus documentation.
#   cantidad: Z (numbers), di0fs0 / di0ms0 (singular indefinite determiners,
#             e.g. "una" / "un"), dn0cp0 (plural numeral determiners).
#   comida:   common nouns in all four gender/number combinations.
# ncfp000 (feminine plural) was missing from the original rule, so words such
# as "pizzas" or "hamburguesas" could never be chunked as comida even when
# tagged correctly; it is listed here alongside the other three noun tags.
grammar = r"""

  cantidad: {<Z|di0fs0|dn0cp0|di0ms0>}

  comida: {<ncms000|ncmp000|ncfs000|ncfp000>}

"""

regex_parser = nltk.RegexpParser(grammar)

In [4]:
# --------------------------------------------------
# 4. Use the POS tagger and the RegexpParser to chunk a new sentence.
# --------------------------------------------------
sentence = "yo queria un gazpacho"

# Pipeline: tokenize -> POS-tag -> chunk.
sentence_tokens = nltk.word_tokenize(sentence)
tagged_sentence = spanish_pos_tagger.tag(sentence_tokens)
chunked = regex_parser.parse(tagged_sentence)

# Score of the HMM tagger on the held-out `test` split, shown as a percentage.
hmm_accuracy = spanish_pos_tagger.evaluate(test)
print('Acierto con HMMs:', hmm_accuracy * 100)

Acierto con HMMs: 89.88905831011094


In [5]:
# Use the POS tagger and the RegexpParser to build a chunked corpus that is
# later used to train the unigram/bigram chunkers and the Naive Bayes chunker.

def chunk_sentence(text):
    """Tokenize, POS-tag and chunk one raw sentence.

    Args:
        text: the sentence as a plain string.

    Returns:
        The chunk tree produced by ``regex_parser.parse`` for the sentence
        tagged with ``spanish_pos_tagger``.
    """
    tokens = nltk.word_tokenize(text)
    return regex_parser.parse(spanish_pos_tagger.tag(tokens))


# A comprehension keeps the per-sentence temporaries local, so this cell no
# longer overwrites the notebook-level `sentence`, `sentence_tokens`,
# `tagged_sentence` and `chunked` variables set by the previous cell.
parsedCorpus = [chunk_sentence(text) for text in corpus]

In [6]:
# Split the chunked corpus into training and test sentences for the
# UnigramChunker, BigramChunker and Naive Bayes chunker.
# Sentences at indices 0, 10, 20, ... are held out for testing.
train_sents = []
test_sents = []
for i, parsed_sentence in enumerate(parsedCorpus):
    destination = test_sents if i % 10 == 0 else train_sents
    destination.append(parsed_sentence)

In [7]:
# Unigram chunker
class UnigramChunker(nltk.ChunkParserI):
    """Chunk parser backed by an ``nltk.UnigramTagger`` over POS tags."""

    def __init__(self, train_sents):
        """Train the underlying tagger.

        Args:
            train_sents: chunk trees; each is flattened with
                ``nltk.chunk.tree2conlltags`` and reduced to
                (POS tag, chunk tag) pairs.
        """
        train_data = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, POS tag) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted chunk tags.
        """
        pos_sequence = [pos for _word, pos in sentence]
        predictions = self.tagger.tag(pos_sequence)
        conlltags = [
            (word, pos, iob)
            for (word, pos), (_tagged_pos, iob) in zip(sentence, predictions)
        ]
        return nltk.chunk.conlltags2tree(conlltags)
    
# Train the unigram chunker on train_sents and print its score on test_sents.
# NOTE(review): the reference chunks in test_sents were themselves produced by
# regex_parser from POS tags, so this score measures agreement with that
# grammar rather than with hand-annotated data.
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy: 100.0%%
    Precision:    100.0%%
    Recall:       100.0%%
    F-Measure:    100.0%%


In [8]:
# Bigram chunker
class BigramChunker(nltk.ChunkParserI):
    """Chunk parser backed by an ``nltk.BigramTagger`` over POS tags."""

    def __init__(self, train_sents):
        """Train the underlying tagger.

        Args:
            train_sents: chunk trees; each is flattened with
                ``nltk.chunk.tree2conlltags`` and reduced to
                (POS tag, chunk tag) pairs.
        """
        def pos_iob_pairs(tree):
            # Drop the word and keep only (POS tag, chunk tag).
            return [(pos, iob) for _word, pos, iob in nltk.chunk.tree2conlltags(tree)]

        train_data = [pos_iob_pairs(tree) for tree in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, POS tag) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted chunk tags.
        """
        pos_sequence = [pos for _word, pos in sentence]
        conlltags = []
        for (word, pos), (_tagged_pos, iob) in zip(sentence, self.tagger.tag(pos_sequence)):
            conlltags.append((word, pos, iob))
        return nltk.chunk.conlltags2tree(conlltags)
    
# Train the bigram chunker on train_sents and print its score on test_sents.
# NOTE(review): the reference chunks in test_sents were themselves produced by
# regex_parser from POS tags, so this score measures agreement with that
# grammar rather than with hand-annotated data.
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy: 100.0%%
    Precision:    100.0%%
    Recall:       100.0%%
    F-Measure:    100.0%%


In [9]:
# Naive Bayes chunk tagger
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Sequential chunk tagger backed by ``nltk.NaiveBayesClassifier``.

    Feature extraction is delegated to the module-level ``npchunk_features``
    function, which must be defined before this class is instantiated.
    """

    def __init__(self, train_sents):
        """Train the classifier.

        Args:
            train_sents: sentences given as sequences of
                ((word, POS tag), chunk tag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            # Chunk tags seen so far in this sentence, passed to the
            # feature extractor.
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Assign a chunk tag to every token of ``sentence``.

        Args:
            sentence: sequence of (word, POS tag) pairs.

        Returns:
            A list of ((word, POS tag), chunk tag) pairs. A list is returned
            instead of a bare ``zip`` object so the result can be iterated
            more than once, indexed and measured with ``len``.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps a ``ConsecutiveNPChunkTagger``."""

    def __init__(self, train_sents):
        """Train the wrapped tagger.

        Args:
            train_sents: chunk trees; each is flattened with
                ``nltk.chunk.tree2conlltags`` and regrouped as
                ((word, POS tag), chunk tag) pairs.
        """
        tagged_sents = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            tagged_sents.append([((word, pos), iob) for word, pos, iob in triples])
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, POS tag) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the tagger
            output.
        """
        conlltags = []
        for (word, pos), iob in self.tagger.tag(sentence):
            conlltags.append((word, pos, iob))
        return nltk.chunk.conlltags2tree(conlltags)

In [10]:
def npchunk_features(sentence, i, history):
    """Build the feature dict for the token at position ``i``.

    Args:
        sentence: sequence of (word, POS tag) pairs.
        i: index of the token to describe.
        history: chunk tags assigned so far (unused by this feature set).

    Returns:
        A dict with the single key ``"pos"`` holding the token's POS tag.
    """
    _token, tag = sentence[i]
    features = dict(pos=tag)
    return features
# Train the Naive Bayes chunker on train_sents and print its score on test_sents.
# NOTE(review): the reference chunks in test_sents were themselves produced by
# regex_parser from POS tags, so this score measures agreement with that
# grammar rather than with hand-annotated data.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy: 100.0%%
    Precision:    100.0%%
    Recall:       100.0%%
    F-Measure:    100.0%%
