In [1]:
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report


### load training data

In [2]:
# Parse the tab-separated training file into `data`: a list of sentences,
# where each sentence is a list of tuples holding the columns of one line.
# Sentences are separated by blank lines in the file.
# `with` guarantees the handle is closed even if reading fails.
with open("data/onto.train", "r") as fo:
    lines = fo.readlines()

data = []
temp = []
for line in lines:
    # Treat whitespace-only lines (including "\r\n") as sentence separators,
    # so they never turn into a bogus one-column token.
    if line.strip():
        temp.append(tuple(line.strip().split('\t')))
    else:
        data.append(temp)
        temp = []

# Keep the last sentence when the file does not end with a blank line;
# without this the final sentence would be silently dropped.
if temp:
    data.append(temp)
    temp = []


In [3]:
def word2features(doc, i):
    """Return the list of string features for the token at index ``i`` of ``doc``.

    ``doc`` is a sequence of token records whose element 0 is the word and
    element 1 is the POS tag. The result contains, in order: features of the
    token itself, then either features of the previous token or the marker
    ``'BOS'``, then either features of the next token or the marker ``'EOS'``.
    """
    current = doc[i]
    word, postag = current[0], current[1]

    # Features describing the token itself
    own = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Left context: the previous token, or 'BOS' for the first token
    if i > 0:
        prev_word = doc[i-1][0]
        prev_postag = doc[i-1][1]
        left = [
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
            '-1:word.isdigit=%s' % prev_word.isdigit(),
            '-1:postag=' + prev_postag
        ]
    else:
        left = ['BOS']

    # Right context: the next token, or 'EOS' for the last token
    if i < len(doc)-1:
        next_word = doc[i+1][0]
        next_postag = doc[i+1][1]
        right = [
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
            '+1:word.isdigit=%s' % next_word.isdigit(),
            '+1:postag=' + next_postag
        ]
    else:
        right = ['EOS']

    return own + left + right

In [4]:
# Build the per-token feature lists for one document
def extract_features(doc):
    """Return one ``word2features`` feature list per position in ``doc``."""
    per_token = []
    for idx in range(len(doc)):
        per_token.append(word2features(doc, idx))
    return per_token

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the third field (the label) of every 3-field record in ``doc``."""
    labels = []
    # Unpacking requires exactly three fields per record, as in a
    # (token, postag, label) triple.
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

# Feature sequences and label sequences, one entry per training sentence
X = list(map(extract_features, data))
y = list(map(get_labels, data))

In [5]:

# Create the CRF trainer; verbose=True makes it log training progress
trainer = pycrfsuite.Trainer(verbose=True)

# Feed every (feature sequence, label sequence) pair to the trainer
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

# Model hyper-parameters
crf_params = {
    'c1': 0.1,                              # coefficient for L1 penalty
    'c2': 0.01,                             # coefficient for L2 penalty
    'max_iterations': 200,                  # maximum number of iterations
    # also include transitions that are possible but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

# Train; the file name passed here is where the finished model is saved
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 135032
Seconds required: 2.200

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 527917.131517
Feature norm: 1.000000
Error norm: 873674.934192
Active features: 134178
Line search trials: 1
Line search step: 0.000001
Seconds required for this iteration: 3.147

***** Iteration #2 *****
Loss: 284792.367829
Feature norm: 1.886719
Error norm: 92320.944570
Active features: 129262
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 1.571

***** Iteration #3 *****
Loss: 277666.727187
Feature norm: 1.846467
Error norm: 83808.581102
Active features: 131855
Line search trials: 1
Line search step: 1.000000
Seconds requ

## Load test set (a) and predict

In [6]:
# Parse the tab-separated test(a) file into `testa`: a list of sentences,
# where each sentence is a list of tuples holding the columns of one line.
# Sentences are separated by blank lines in the file.
# `with` guarantees the handle is closed even if reading fails.
with open("data/onto.testa", "r") as fo:
    lines = fo.readlines()

testa = []
temp = []
for line in lines:
    # Treat whitespace-only lines (including "\r\n") as sentence separators,
    # so they never turn into a bogus one-column token.
    if line.strip():
        temp.append(tuple(line.strip().split('\t')))
    else:
        testa.append(temp)
        temp = []

# Keep the last sentence when the file does not end with a blank line;
# without this the final sentence would be silently dropped.
if temp:
    testa.append(temp)
    temp = []


In [7]:
# Load the trained model and tag every sentence of test(a)
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
X_testa = [extract_features(doc) for doc in testa]
y_testa = [get_labels(doc) for doc in testa]
y_pred = [tagger.tag(xseq) for xseq in X_testa]

# Let's take a look at one fixed sample sentence in the testing set
sample_idx = 12
# Feature 1 of every token is 'word.lower=<word>'; split only on the first '='
# so that words which themselves contain '=' are shown in full.
sample_words = [feats[1].split("=", 1)[1] for feats in X_testa[sample_idx]]
# Loop variables are deliberately NOT named `x`/`y`: at top level they would
# overwrite the global `y` (training labels) used by the training cell.
for word, predicted in zip(sample_words, y_pred[sample_idx]):
    print("%s (%s)" % (word, predicted))

yet (O)
, (O)
she (O)
never (O)
seemed (O)
to (O)
have (O)
any (O)
reservation (O)
in (O)
her (O)
effort (O)
and (O)
contribution (O)
. (O)


In [8]:
# Write test(a) with predictions: one token per line as
# word <TAB> POS tag <TAB> gold label <TAB> predicted label,
# followed by a blank line after each sentence.

# Guard against a stale `y_pred` (e.g. one computed for another dataset
# by a cell run out of order).
assert len(y_pred) == len(testa), "y_pred does not line up with testa"

# Mode "w" instead of "a+": re-running the cell replaces the file rather than
# appending a duplicate copy of all predictions. `with` flushes and closes
# the handle, which the original never did.
with open('data/onto.testa.crf.pred', "w") as fo:
    for k, doc in enumerate(testa):
        for i in range(len(doc)):
            fo.write("{}\t{}\t{}\t{}\n".format(doc[i][0], doc[i][1], doc[i][2], y_pred[k][i]))
        fo.write("\n")
    

## Predict on test set (b) and write the predicted tags to a file

In [15]:
# Parse the tab-separated test(b) file into `testb`: a list of sentences,
# where each sentence is a list of tuples holding the columns of one line.
# `with` closes the handle even if reading fails.
with open("data/onto.testb", "r") as fo:
    lines = fo.readlines()

testb = []
temp = []
for line in lines:
    # Whitespace-only lines (including "\r\n") separate sentences
    if line.strip():
        temp.append(tuple(line.strip().split('\t')))
    else:
        testb.append(temp)
        temp = []
# Keep the last sentence when the file does not end with a blank line
if temp:
    testb.append(temp)
    temp = []

# Only features are extracted here, no labels.
# NOTE(review): presumably test(b) carries no gold label column - confirm.
X_testb = [extract_features(doc) for doc in testb]
y_pred = [tagger.tag(xseq) for xseq in X_testb]

# Write one predicted tag per line, with a blank line after each sentence.
# Mode "w" instead of "a+": re-running the cell replaces the file rather than
# appending a second copy of the predictions.
with open('data/crf_output.txt', "w") as fo:
    for k, doc in enumerate(testb):
        for i in range(len(doc)):
            fo.write("{}\n".format(y_pred[k][i]))
        fo.write("\n")