In [11]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

In [12]:
# Raw line-level view of both corpora: one string per physical line, with the
# trailing newline kept. Reading inside `with` closes each file as soon as it
# has been consumed; the previous bare open(...) inside a comprehension never
# closed the handles explicitly.
# NOTE: a later cell rebinds train_sents/test_sents to parsed sentences.
with open("Data/pos/train.col") as train_file:
    train_sents = list(train_file)
with open("Data/pos/test.col") as test_file:
    test_sents = list(test_file)

In [13]:
def word2features(sent, i):
    """Build the CRF feature list for the token at position ``i`` of ``sent``.

    Only the surface form of the current token and of its immediate
    neighbours is used; the tags stored in ``sent`` are never read, so the
    function also works on untagged input such as ``[(token,), ...]``.

    Args:
        sent: sequence of tuples whose first element is the token string.
        i: index of the token to describe.

    Returns:
        A list of feature strings: features of the current token, then
        features of the previous token (or ``'BOS'`` at the start of the
        sentence), then features of the next token (or ``'EOS'`` at the end).
    """
    word = sent[i][0]

    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit()
    ]

    if i > 0:
        prev_word = sent[i-1][0]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper()
        ])
    else:
        # No left neighbour: mark beginning of sentence.
        features.append('BOS')

    if i < len(sent)-1:
        # Only the token of the right neighbour is needed; its tag is
        # deliberately not looked up (it was previously read but never used).
        next_word = sent[i+1][0]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper()
        ])
    else:
        # No right neighbour: mark end of sentence.
        features.append('EOS')

    return features


def sent2features(sent):
    """Return one feature list per token of ``sent``, in sentence order."""
    features_per_token = []
    for position in range(len(sent)):
        features_per_token.append(word2features(sent, position))
    return features_per_token

def sent2labels(sent):
    """Return the tag of every ``(token, tag)`` pair in ``sent``, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token of every ``(token, tag)`` pair in ``sent``, in order."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens

In [14]:
# Read-mode handles on the training and test corpora; both are handed to
# getPOSSents further down in this cell.
trainf = open("Data/pos/train.col", "r")
testf = open("Data/pos/test.col", "r")

def getPOSSents(file):
    """Yield the sentences of a blank-line-delimited, tab-separated column file.

    Every non-blank line is stripped of surrounding whitespace and split on
    tabs into one tuple; consecutive tuples form a sentence. A blank line
    ends the current sentence.

    Compared with the previous version:
      * the final sentence is yielded even when the input does not end with
        a blank line (it used to be silently dropped);
      * runs of blank lines no longer produce empty sentences.

    Args:
        file: any iterable of text lines (an open file, a list of strings, ...).

    Yields:
        One list of tuples per sentence, in input order.
    """
    this_sent = []

    for line in file:
        line = line.strip()
        if line:
            this_sent.append(tuple(line.split("\t")))
        elif this_sent:
            yield this_sent
            # Start a fresh list so the one just yielded is not mutated.
            this_sent = []

    # Flush whatever is left when the input lacks a trailing blank line.
    if this_sent:
        yield this_sent

# Parse both corpora eagerly. Doing it inside `with` closes the handles opened
# at the top of this cell once they are exhausted (or if parsing raises);
# previously they were left open.
with trainf, testf:
    train_sents = list(getPOSSents(trainf))
    test_sents = list(getPOSSents(testf))


In [15]:
# Preview only the first few parsed sentences instead of dumping the whole test set.
test_sents[:3]

[[('Measuring', 'NN'),
  ('cups', 'NNS'),
  ('may', 'MD'),
  ('soon', 'RB'),
  ('be', 'VB'),
  ('replaced', 'VBN'),
  ('by', 'IN'),
  ('tablespoons', 'NNS'),
  ('in', 'IN'),
  ('the', 'DT'),
  ('laundry', 'NN'),
  ('room', 'NN'),
  ('.', '.')],
 [('Procter', 'NNP'),
  ('&', 'CC'),
  ('Gamble', 'NNP'),
  ('Co.', 'NNP'),
  ('plans', 'VBZ'),
  ('to', 'TO'),
  ('begin', 'VB'),
  ('testing', 'VBG'),
  ('next', 'JJ'),
  ('month', 'NN'),
  ('a', 'DT'),
  ('superconcentrated', 'JJ'),
  ('detergent', 'NN'),
  ('that', 'WDT'),
  ('will', 'MD'),
  ('require', 'VB'),
  ('only', 'RB'),
  ('a', 'DT'),
  ('few', 'JJ'),
  ('spoonfuls', 'NNS'),
  ('per', 'IN'),
  ('washload', 'NN'),
  ('.', '.')],
 [('The', 'DT'),
  ('move', 'NN'),
  ('stems', 'VBZ'),
  ('from', 'IN'),
  ('lessons', 'NNS'),
  ('learned', 'VBN'),
  ('in', 'IN'),
  ('Japan', 'NNP'),
  ('where', 'WRB'),
  ('local', 'JJ'),
  ('competitors', 'NNS'),
  ('have', 'VBP'),
  ('had', 'VBD'),
  ('phenomenal', 'JJ'),
  ('success', 'NN'),
  ('with',

In [16]:
# Per-sentence feature sequences (inputs) and tag sequences (targets) for the
# training split ...
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

# ... and for the held-out test split.
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [17]:
%%time
# Create the CRFsuite trainer with verbose output switched off.
trainer = pycrfsuite.Trainer(verbose=False)

# Register the training data one sentence at a time: xseq is the list of
# per-token feature lists, yseq the matching list of tags.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 19.2 s, sys: 47.8 ms, total: 19.3 s
Wall time: 19.4 s


In [18]:
# Training hyper-parameters passed to the trainer before fitting.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # upper bound on training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [None]:
# Fit the model; 'pos_model.crfsuite' is the file the tagger opens afterwards.
trainer.train('pos_model.crfsuite')

In [None]:
# Show what the trainer's log parser recorded for the last training iteration.
trainer.logparser.last_iteration

In [None]:
# Instantiate a tagger and point it at the model file produced by training.
tagger = pycrfsuite.Tagger()
tagger.open('pos_model.crfsuite')

In [None]:

# Sanity-check the tagger on the first test sentence.
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)))

# Each line is pre-formatted into a single string and printed with the call
# form, so the cell runs on Python 3 (the bare `print x` statement is a
# SyntaxError there) and produces the same text on Python 2 (where
# print("a", b) would have displayed a tuple).
print("Predicted: " + ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:   " + ' '.join(sent2labels(example_sent)))