### POS-tagging task

Three models are used: <br>
1) HMM <br>
2) Naive Bayes Classifier (with generated features)<br>
3) CRF (with generated features) <br>

In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import brown
from nltk.tag import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pycrfsuite

Load the news category of the Brown corpus and split it into train/test sets (80/20):

In [2]:
# POS-tagged sentences from the "news" category of the Brown corpus,
# with tags mapped to the universal tagset.
corpora = brown.tagged_sents(categories="news", tagset="universal")

# Hold out 20% of the sentences for evaluation; the fixed seed keeps the
# shuffled split reproducible across runs.
corpora_train, corpora_test = train_test_split(
    corpora,
    test_size=0.2,
    shuffle=True,
    random_state=13,
)

print("Number of train sentences: {:,}".format(len(corpora_train)))
print("Number of test sentences: {:,}".format(len(corpora_test)))

Number of train sentences: 3,698
Number of test sentences: 925


The Hidden Markov Model (HMM) shows a poor F1-score on the test set. <br>
No features are used here beyond the hidden states (POS tags) and the observed states (words):

In [3]:
# Fit a supervised HMM directly on the (word, tag) training sentences.
trainer_hmm = hmm.HiddenMarkovModelTrainer()
tagger_hmm = trainer_hmm.train_supervised(corpora_train)

# Model input: test sentences with their tags stripped (one word list per sentence).
X_test = [nltk.tag.untag(tagged_sent) for tagged_sent in corpora_test]
# Gold labels: every test tag, flattened into a single list.
y_test = [tag for tagged_sent in corpora_test for (_token, tag) in tagged_sent]

# HMM predictions, flattened the same way so they line up with y_test.
tagged_test_hmm = tagger_hmm.tag_sents(X_test)
y_hmm = [tag for tagged_sent in tagged_test_hmm for (_token, tag) in tagged_sent]
print(classification_report(y_test, y_hmm))

             precision    recall  f1-score   support

          .       1.00      0.31      0.47      2396
        ADJ       0.94      0.33      0.49      1303
        ADP       0.94      0.39      0.55      2550
        ADV       0.91      0.44      0.59       680
       CONJ       1.00      0.36      0.53       522
        DET       1.00      0.47      0.64      2303
       NOUN       0.98      0.35      0.51      6180
        NUM       1.00      0.37      0.54       442
       PRON       0.04      0.99      0.08       517
        PRT       0.95      0.40      0.56       470
       VERB       0.99      0.45      0.62      2943
          X       1.00      0.23      0.38        13

avg / total       0.95      0.40      0.53     20319



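The HMM trained on stemmed words performs substantially better, presumably because stemming reduces the number of word forms that were never seen during training. <br>
The underlying problem is described in this NLTK issue: [link](https://github.com/nltk/nltk/issues/1095)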

In [4]:
# Shared stemmer instance; it is also used later by the feature extractor.
porter = PorterStemmer()


def stem_tagged_sents(tagged_sents, stemmer):
    """Return a copy of ``tagged_sents`` with every word stemmed.

    Args:
        tagged_sents: iterable of sentences, each a sequence of (word, tag) pairs.
        stemmer: object exposing ``stem(word)``.

    Returns:
        A list of sentences, each a list of (stemmed word, tag) pairs.
        Tags are passed through unchanged.
    """
    return [[(stemmer.stem(word), tag) for word, tag in sent] for sent in tagged_sents]


corpora_train_stem = stem_tagged_sents(corpora_train, porter)
corpora_test_stem = stem_tagged_sents(corpora_test, porter)

trainer_hmm_stem = hmm.HiddenMarkovModelTrainer()
tagger_hmm_stem = trainer_hmm_stem.train_supervised(corpora_train_stem)

X_test_stem = [nltk.tag.untag(sent) for sent in corpora_test_stem]  # stemmed test sentences
y_test_stem = [tag for sent in corpora_test_stem for (_stem, tag) in sent]  # flat gold tags

# Predictions of the HMM trained on stems, flattened to align with y_test_stem.
y_hmm_stem = [tag for sent in tagger_hmm_stem.tag_sents(X_test_stem) for (_stem, tag) in sent]

# Score against this cell's own gold tags instead of relying on a variable from
# an earlier cell. Stemming does not alter tags, so the labels are the same.
print(classification_report(y_test_stem, y_hmm_stem))

             precision    recall  f1-score   support

          .       1.00      0.47      0.64      2396
        ADJ       0.84      0.44      0.58      1303
        ADP       0.95      0.58      0.72      2550
        ADV       0.86      0.57      0.69       680
       CONJ       1.00      0.51      0.68       522
        DET       0.99      0.63      0.77      2303
       NOUN       0.92      0.49      0.64      6180
        NUM       1.00      0.52      0.69       442
       PRON       0.06      0.98      0.11       517
        PRT       0.94      0.59      0.73       470
       VERB       0.93      0.59      0.72      2943
          X       1.00      0.46      0.63        13

avg / total       0.92      0.55      0.67     20319



Now let's fit a Naive Bayes classifier that uses generated features (from both the current word and its immediate context):
- previous / current / next word and stem
- suffix / prefix of current word
- information about capital letters and numerics

In [5]:
def pos_features(sentence, index):
    """Build the feature dictionary for the token at ``sentence[index]``.

    Args:
        sentence: indexable sequence of word strings (an untagged sentence).
        index: position of the token to describe.

    Returns:
        dict mapping feature names to values: the token and its stem, its
        position in the sentence, casing flags, prefixes/suffixes of up to
        three characters, the neighbouring words and their stems (with
        '<START>' / '<END>' sentinels at the sentence boundaries), and
        hyphen / digit indicators.

    Note:
        Stemming uses the module-level ``porter`` stemmer.
    """
    token = sentence[index]
    rest = token[1:]  # everything after the first character
    at_start = index == 0
    at_end = index == len(sentence) - 1

    # Left neighbour, or the start-of-sentence sentinel.
    if at_start:
        prev_word = prev_stem = '<START>'
    else:
        prev_word = sentence[index - 1]
        prev_stem = porter.stem(prev_word)

    # Right neighbour, or the end-of-sentence sentinel.
    if at_end:
        next_word = next_stem = '<END>'
    else:
        next_word = sentence[index + 1]
        next_stem = porter.stem(next_word)

    return {
        'word': token,
        'stem': porter.stem(token),
        'is_first': at_start,
        'is_last': at_end,
        # These casing flags compare against the transformed string, so tokens
        # without letters (digits, punctuation) satisfy all three.
        'is_capitalized': token[0] == token[0].upper(),
        'is_all_caps': token == token.upper(),
        'is_all_lower': token == token.lower(),
        'prefix-1': token[0],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': prev_word,
        'next_word': next_word,
        'prev_stem': prev_stem,
        'next_stem': next_stem,
        'has_hyphen': '-' in token,
        'is_numeric': token.isdigit(),
        # True when any character after the first is upper-case.
        'capitals_inside': rest != rest.lower(),
    }

class ConsecutivePosTagger(nltk.TaggerI):
    """Naive Bayes POS tagger that classifies each token independently.

    Every token is described by ``pos_features`` (the token itself plus its
    immediate neighbours) and labelled by an ``nltk.NaiveBayesClassifier``.
    """

    def __init__(self, train_sents):
        """Train the classifier.

        Args:
            train_sents: iterable of tagged sentences, each a sequence of
                (word, tag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            # One (featureset, tag) training example per token.
            train_set.extend(
                (pos_features(untagged_sent, i), tag)
                for i, (_word, tag) in enumerate(tagged_sent)
            )
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag a single untagged sentence.

        Args:
            sentence: indexable sequence of word strings.

        Returns:
            A list of (word, tag) pairs, one per input word.
        """
        tags = [
            self.classifier.classify(pos_features(sentence, i))
            for i, _word in enumerate(sentence)
        ]
        # Materialise the pairs: a bare zip() is a one-shot iterator on
        # Python 3, whereas callers of a tagger expect a reusable list.
        return list(zip(sentence, tags))

Although Naive Bayes is a simple model that uses only very basic features in this example, it performs significantly better than the plain HMM:

In [6]:
# Train the Naive Bayes tagger on the training split and tag the held-out sentences.
tagger_nb = ConsecutivePosTagger(corpora_train)
tagged_test_nb = tagger_nb.tag_sents(X_test)

# Keep only the predicted tags, flattened so they line up with y_test.
y_nb = [tag for tagged_sent in tagged_test_nb for (_token, tag) in tagged_sent]

print(classification_report(y_test, y_nb))

             precision    recall  f1-score   support

          .       1.00      1.00      1.00      2396
        ADJ       0.80      0.89      0.84      1303
        ADP       0.97      0.90      0.93      2550
        ADV       0.81      0.86      0.84       680
       CONJ       0.99      0.99      0.99       522
        DET       1.00      0.99      0.99      2303
       NOUN       0.97      0.91      0.94      6180
        NUM       0.87      1.00      0.93       442
       PRON       0.94      0.97      0.96       517
        PRT       0.70      0.94      0.80       470
       VERB       0.93      0.96      0.94      2943
          X       0.16      0.77      0.27        13

avg / total       0.95      0.94      0.94     20319



Now we use the CRF algorithm (the *pycrfsuite* module) and try to improve on the previous results. <br>
We slightly rewrite the previous code, but keep unchanged the core function that generates the feature dictionary for each word in a sentence. <br>

In [7]:
class CRFPosTagger:
    """CRF POS tagger built on python-crfsuite, using ``pos_features`` per token."""

    def __init__(self, model_path="trainer.pycrfsuit"):
        """Create an untrained tagger.

        Args:
            model_path: file the trained CRF model is written to and then
                loaded from. Defaults to the file name used previously.
        """
        self.model_path = model_path
        self.trainer = pycrfsuite.Trainer(verbose=False)
        self.tagger = None  # set by train()

    def prepare_features(self, corpora):
        """Convert tagged sentences into parallel feature / label sequences.

        Args:
            corpora: iterable of tagged sentences, each a sequence of
                (word, tag) pairs.

        Returns:
            (X, y): X is a list with one list of feature dicts per sentence,
            y is a list with one list of tags per sentence.
        """
        X, y = [], []
        for tagged_sent in corpora:
            untagged_sent = nltk.tag.untag(tagged_sent)
            X.append([pos_features(untagged_sent, i) for i in range(len(untagged_sent))])
            y.append([tag for _word, tag in tagged_sent])
        return X, y

    def train(self, corpora_train, **params):
        """Train the CRF on tagged sentences and load the resulting model.

        Args:
            corpora_train: iterable of tagged sentences ((word, tag) pairs).
            **params: crfsuite training parameters; these are applied after,
                and therefore override, the defaults set below.
        """
        X_train, y_train = self.prepare_features(corpora_train)

        # Drop sequences left over from a previous train() call on this
        # instance, so that re-training does not silently duplicate data.
        self.trainer.clear()
        for xseq, yseq in zip(X_train, y_train):
            self.trainer.append(pycrfsuite.ItemSequence(xseq), yseq)

        self.trainer.set_params({
            'c1': 1.0,
            'c2': 1e-3,
            'max_iterations': 50,
            'feature.possible_transitions': True
        })
        self.trainer.set_params(params)

        self.trainer.train(model=self.model_path)
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(self.model_path)

    def test(self, corpora_test):
        """Tag the sentences of a tagged corpus; the gold tags are ignored.

        Args:
            corpora_test: iterable of tagged sentences ((word, tag) pairs).

        Returns:
            A list with one list of predicted tags per sentence.

        Raises:
            RuntimeError: if train() has not been called yet.
        """
        X_test, _ = self.prepare_features(corpora_test)
        return self._tag_feature_sequences(X_test)

    def predict(self, X):
        """Tag untagged sentences.

        Args:
            X: iterable of sentences, each an iterable of word strings.

        Returns:
            A list with one list of predicted tags per sentence.

        Raises:
            RuntimeError: if train() has not been called yet.
        """
        feature_seqs = []
        for sentence in X:
            tokens = list(sentence)  # pos_features needs len() and indexing
            feature_seqs.append([pos_features(tokens, i) for i in range(len(tokens))])
        return self._tag_feature_sequences(feature_seqs)

    def _tag_feature_sequences(self, feature_seqs):
        """Run the loaded CRF model over per-sentence feature sequences."""
        if self.tagger is None:
            raise RuntimeError("CRFPosTagger.train() must be called before tagging.")
        return [self.tagger.tag(pycrfsuite.ItemSequence(xseq)) for xseq in feature_seqs]

We can conclude that the CRF outperforms the Naive Bayes classifier (both use the same feature set):

In [8]:
tagger_crf = CRFPosTagger()
tagger_crf.train(corpora_train)

# test() yields one list of predicted tags per sentence; flatten them so the
# predictions line up with y_test.
y_crf = []
for sent_tags in tagger_crf.test(corpora_test):
    y_crf.extend(sent_tags)

print(classification_report(y_test, y_crf))

             precision    recall  f1-score   support

          .       1.00      1.00      1.00      2396
        ADJ       0.90      0.86      0.88      1303
        ADP       0.97      0.98      0.98      2550
        ADV       0.94      0.88      0.91       680
       CONJ       1.00      1.00      1.00       522
        DET       1.00      0.99      0.99      2303
       NOUN       0.96      0.98      0.97      6180
        NUM       1.00      0.99      0.99       442
       PRON       0.98      0.98      0.98       517
        PRT       0.94      0.93      0.94       470
       VERB       0.97      0.96      0.96      2943
          X       0.62      0.62      0.62        13

avg / total       0.97      0.97      0.97     20319

