# In [None]:
import nltk
from nltk.corpus import conll2002
from nltk.classify import MaxentClassifier
from nltk.tag import ClassifierBasedTagger
# The required libraries are imported above

# Prepare the training and test data: read each CoNLL-2002 file through
# conll2002.iob_sents and materialize the result as a list.
train_sents, test_sents = (
    list(conll2002.iob_sents(fileid)) for fileid in ('esp.train', 'esp.testb')
)

# Define a function that extracts features from the words of a sentence
def word_features(sent, i, history=None):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of items whose element 0 is the word and element 1 is
            its POS tag (for example ``(word, postag)`` pairs). Any further
            elements of an item are not read.
        i: Index of the token to describe.
        history: Not used. It is accepted so the function can also be called
            as ``feature_detector(tokens, index, history)``, the calling
            convention of NLTK's ``ClassifierBasedTagger``.

    Returns:
        A dict with features of the current word/POS tag, the same kind of
        features for the previous and next tokens when they exist, and the
        markers ``'BOS'`` / ``'EOS'`` when the token is first / last.
    """
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'word': word,
        'postag': postag,
        # Slice rather than index: an empty token yields False instead of
        # raising IndexError.
        'word_is_uppercase': word[:1].isupper(),
        'word_is_titlecase': word.istitle(),
        'word_is_digit': word.isdigit(),
        'word_suffix': word[-3:],
        'word_prefix': word[:3],
        'postag_prefix': postag[:2],
    }

    # Left context, or a beginning-of-sentence marker.
    if i > 0:
        prev_word = sent[i - 1][0]
        prev_postag = sent[i - 1][1]
        features['prev_word'] = prev_word
        features['prev_postag'] = prev_postag
        features['prev_word_suffix'] = prev_word[-3:]
        features['prev_postag_prefix'] = prev_postag[:2]
    else:
        features['BOS'] = True

    # Right context, or an end-of-sentence marker.
    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        next_postag = sent[i + 1][1]
        features['next_word'] = next_word
        features['next_postag'] = next_postag
        features['next_word_prefix'] = next_word[:3]
        features['next_postag_prefix'] = next_postag[:2]
    else:
        features['EOS'] = True
    return features

# Define a class for the MEMM tagger
class MEMMTagger(ClassifierBasedTagger):
    """Sequence tagger backed by a maximum-entropy classifier.

    Sentences may be given either as ``(token, tag)`` pairs or as longer
    tuples such as ``(word, postag, iob)``. In the latter case every element
    but the last forms the token (``(word, postag)``) and the last element is
    the tag to predict.

    The feature detector is called as ``feature_detector(tokens, index)``.

    Attributes:
        train_sents: The sentences most recently used for training, stored
            exactly as they were passed in.
    """

    def __init__(self, train_sents, feature_detector=word_features, **kwargs):
        """Train the tagger on ``train_sents``.

        Args:
            train_sents: Iterable of tagged sentences (see class docstring).
            feature_detector: Callable ``(tokens, index) -> featureset``.
            **kwargs: Forwarded to ``ClassifierBasedTagger.__init__``. Unless
                ``classifier_builder`` is given, a MaxEnt classifier is
                trained with the ``'megam'`` algorithm.
        """
        # Kept under a private name so the base class' feature_detector()
        # method is not shadowed by a function with a different signature.
        self._token_features = feature_detector
        self.train_sents = train_sents
        kwargs.setdefault('classifier_builder', self._build_classifier)
        normalized = (self._as_tagged_tokens(sent) for sent in train_sents)
        ClassifierBasedTagger.__init__(
            self,
            # Empty sentences carry no training examples; drop them.
            train=[pairs for pairs in normalized if pairs],
            feature_detector=self.feature_detector,
            **kwargs
        )

    @staticmethod
    def _build_classifier(train_set):
        """Default classifier builder: MaxEnt trained with 'megam'."""
        return MaxentClassifier.train(train_set, algorithm='megam')

    @staticmethod
    def _split_item(item):
        """Split one sentence item into ``(token, tag)``; the tag is last."""
        *token, tag = item
        return (token[0] if len(token) == 1 else tuple(token)), tag

    @classmethod
    def _as_tagged_tokens(cls, sent):
        """Return ``sent`` as a list of ``(token, tag)`` pairs."""
        return [cls._split_item(item) for item in sent]

    def feature_detector(self, tokens, index, history):
        """Adapt the base-class 3-argument call to the 2-argument detector."""
        return self._token_features(tokens, index)

    def encode_tags(self, tagged_sent):
        """Return the tag (last element) of every item in ``tagged_sent``."""
        return [item[-1] for item in tagged_sent]

    def decode_tags(self, tags, sent=None):
        """Pair ``tags`` with the tokens of a tagged sentence.

        Args:
            tags: Sequence of tags.
            sent: Tagged sentence that supplies the tokens. When omitted, the
                first training sentence is used (the historical behaviour).

        Returns:
            List of ``(token, tag)`` pairs, truncated to the shorter input.
        """
        if sent is None:
            sent = self.train_sents[0]
        tokens = [self._split_item(item)[0] for item in sent]
        return list(zip(tokens, tags))

    def train(self, train_sents, **kwargs):
        """Retrain the classifier on ``train_sents``.

        Args:
            train_sents: Iterable of tagged sentences (see class docstring).
            **kwargs: Forwarded to ``MaxentClassifier.train``; ``algorithm``
                defaults to ``'megam'``.
        """
        self.train_sents = train_sents
        train_set = []
        for sent in train_sents:
            pairs = self._as_tagged_tokens(sent)
            tokens = [token for token, _ in pairs]
            train_set.extend(
                (self._token_features(tokens, i), tag)
                for i, (_, tag) in enumerate(pairs)
            )
        kwargs.setdefault('algorithm', 'megam')
        # The base class reads the classifier from ``_classifier`` when
        # tagging, so the trained model must be stored there.
        self._classifier = MaxentClassifier.train(train_set, **kwargs)

    def evaluate(self, gold):
        """Score the tagger on ``gold`` via the base-class ``evaluate``.

        ``gold`` is first normalized to ``(token, tag)`` pairs so that
        ``(word, postag, iob)`` sentences are accepted as well.
        """
        gold = [self._as_tagged_tokens(sent) for sent in gold]
        return ClassifierBasedTagger.evaluate(self, gold)

# Train the MEMM model on the training sentences
memm_tagger = MEMMTagger(train_sents=train_sents)

# Evaluate the model on the test data and print the result
print(memm_tagger.evaluate(test_sents))
