# Conditional Random Fields
## Named Entity Recognition Tagging

In [1]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



Here we take words read from the input files and construct features we use to train the CRF.

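For every input word we create features for the lower-case version of the word, the last 2 and the last 3 characters of the word, and whether the word is in upper case, in title case, or consists only of digits. For the previous and the next word in the sentence we add the lower-case form and the title-case/upper-case flags; when there is no such neighbour, a `BOS` (beginning of sentence) or `EOS` (end of sentence) marker is added instead.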

In [2]:
def word2features(sent, i):
    """Build the feature strings describing the token at position ``i``.

    Only element 0 (the word) of each token tuple is read, so the function
    works for tagged ``(word, tag)`` tuples as well as for untagged
    ``(word,)`` tuples.

    Args:
        sent: sequence of token tuples whose first element is the word.
        i: index of the token to describe.

    Returns:
        A list of strings: a constant ``'bias'`` feature, features of the
        word itself (lower-cased form, last 3 / last 2 characters, and the
        upper-case / title-case / digit flags), then either features of the
        previous word or ``'BOS'``, then either features of the next word
        or ``'EOS'``.
    """
    word = sent[i][0]

    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit()
    ]

    if i > 0:
        prev_word = sent[i-1][0]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper()
        ])
    else:
        # First token of the sentence: no left context.
        features.append('BOS')

    if i < len(sent)-1:
        # Only the word of the following token is used; its tag is not a
        # feature, so it is deliberately not read here.
        next_word = sent[i+1][0]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper()
        ])
    else:
        # Last token of the sentence: no right context.
        features.append('EOS')

    return features


def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the second element (the tag) of every two-item token in ``sent``."""
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the first element (the word) of every two-item token in ``sent``."""
    words = []
    for word, _tag in sent:
        words.append(word)
    return words

Here we open our training and testing files, parse them and pass them to the feature extraction functions to ready them for training.

In [10]:
def getSentsFromFile(file):
    """Yield the sentences of a blank-line-delimited, tab-separated file.

    Every non-blank line is stripped, split on tab characters and stored as
    a tuple of its fields; a blank line ends the current sentence.

    Args:
        file: any iterable of text lines (an open file, a list of strings).

    Yields:
        One list of field tuples per sentence. Empty sentences are never
        yielded, so leading or repeated blank lines are harmless, and the
        last sentence is yielded even when the input does not end with a
        blank line.
    """
    this_sent = []

    for line in file:
        line = line.strip()
        if line:
            this_sent.append(tuple(line.split("\t")))
        elif this_sent:
            yield this_sent
            this_sent = []

    # Flush the trailing sentence when no blank line terminated it.
    if this_sent:
        yield this_sent


def loadTestTrain(trainFile, testFile):
    """Parse the training and testing inputs into CRF features and labels.

    Args:
        trainFile: iterable of lines holding the training sentences.
        testFile: iterable of lines holding the testing sentences.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``, where each ``X_*``
        holds one ``sent2features`` result per sentence and each ``y_*``
        holds the matching ``sent2labels`` result.
    """
    # Materialise the generators: each sentence is traversed twice below.
    train_sents = list(getSentsFromFile(trainFile))
    test_sents = list(getSentsFromFile(testFile))

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    # A single pre-formatted argument prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("Loaded  %d  training instances/ %d  testing instances."
          % (len(X_train), len(X_test)))

    return (X_train, y_train, X_test, y_test)

# Open both corpora inside a with-block so the file handles are closed as soon
# as parsing is done; loadTestTrain reads each file completely before returning.
with open("Data/ner-pol/train.iob") as trainf_ner, \
        open("Data/ner-pol/test.iob") as testf_ner:
    (X_train, y_train, X_test, y_test) = loadTestTrain(trainf_ner, testf_ner)


Loaded  14987  training instances/ 3684  testing instances.


Set up model hyperparameters and train the model.

In [28]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=1.0,
    c2=0.001,
    max_iterations=50,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 25.1 s, sys: 230 ms, total: 25.3 s
Wall time: 25.6 s


List the labels learned by the model: the B-/I- tags for each named entity type (organisation, person, location and miscellaneous) plus the outside tag O.

In [29]:
# Snapshot of the tags the fitted model knows about.
# NOTE(review): the outside tag ('|O' in this corpus) is kept in the list, so
# it takes part in every score computed with `labels=labels` further down.
labels = [tag for tag in crf.classes_]
labels

['|O',
 '|B-ORG',
 '|B-MISC',
 '|B-PER',
 '|I-PER',
 '|B-LOC',
 '|I-ORG',
 '|I-MISC',
 '|I-LOC']

Process the testing set and print out the f1-score.

In [33]:
%%time
y_pred = crf.predict(X_test)
print metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9497753323961592
CPU times: user 1.18 s, sys: 20 µs, total: 1.18 s
Wall time: 1.17 s


Print out precision, recall and F1-score per label

In [31]:
# Fix the row order of the report: sort on everything after the first
# character, then on the first character itself.
# NOTE(review): every label of this model starts with '|', so the second key
# component never breaks a tie and the order is simply that of name[1:].
sorted_labels = list(labels)
sorted_labels.sort(key=lambda name: (name[1:], name[0]))

print(metrics.flat_classification_report(y_test,
                                         y_pred,
                                         labels=sorted_labels,
                                         digits=3))

             precision    recall  f1-score   support

     |B-LOC      0.846     0.785     0.814      1669
    |B-MISC      0.806     0.740     0.772       705
     |B-ORG      0.798     0.667     0.727      1663
     |B-PER      0.790     0.843     0.816      1617
     |I-LOC      0.736     0.578     0.648       256
    |I-MISC      0.634     0.643     0.639       213
     |I-ORG      0.653     0.701     0.676       833
     |I-PER      0.838     0.948     0.890      1156
         |O      0.983     0.988     0.986     38554

avg / total      0.950     0.951     0.950     46666



List the state features sorted by their learned weight, from the largest to the smallest

In [37]:
# Print every (attribute, label) state feature with its learned weight, largest
# weight first; equal weights are ordered by the feature key. `.items()`, an
# indexing lambda and the print call are used so the cell runs unchanged on
# Python 2 and Python 3 (tuple-unpacking lambdas and `iteritems` are Python 2
# only).
for key, value in sorted(crf.state_features_.items(),
                         key=lambda item: (item[1], item[0]),
                         reverse=True):
    print("%s: %s" % (key, value))
    

(u'word[-2:]=5M', u'|O'): 7.942236
(u'word[-2:]=0M', u'|O'): 7.584323
(u'-1:word.lower=grade', u'|O'): 7.467747
(u'word.lower=ata-ur-rehman', u'|B-PER'): 7.377029
(u'word.lower=sungard', u'|B-ORG'): 6.664536
(u'word[-2:]=6F', u'|O'): 6.550478
(u'word.lower=minister', u'|O'): 6.19998
(u'+1:word.lower=1996-08-26', u'|B-LOC'): 5.880611
(u'word.lower=serb-held', u'|B-MISC'): 5.853415
(u'word.lower=weston-super-mare', u'|B-LOC'): 5.847896
(u'word.lower=mcgrath', u'|B-PER'): 5.788491
(u'word[-2:]=-A', u'|O'): 5.677977
(u'+1:word.lower=1996-08-23', u'|B-LOC'): 5.531767
(u'word.lower=england', u'|B-LOC'): 5.348388
(u'word.lower=mclaren', u'|B-ORG'): 5.332102
(u'word.lower=parma', u'|B-ORG'): 5.327616
(u'+1:word.lower=1996-08-28', u'|B-LOC'): 5.295032
(u'word.lower=german', u'|B-MISC'): 5.289305
(u'+1:word.lower=1996-08-27', u'|B-LOC'): 5.262556
(u'+1:word.lower=1996-08-29', u'|B-LOC'): 5.262069
(u'+1:word.lower=1996-08-25', u'|B-LOC'): 5.257143
(u'word.lower=clinton', u'|B-PER'): 5.256348
(u'w

(u'+1:word.lower=2', u'|B-ORG'): 2.877508
(u'word[-3:]=hat', u'|O'): 2.876008
(u'-1:word.lower=at', u'|B-LOC'): 2.872825
(u'-1:word.lower=flight', u'|B-MISC'): 2.869963
(u'word.lower=neuchatel', u'|B-ORG'): 2.867003
(u'word.lower=lelouche', u'|B-PER'): 2.865053
(u'word.lower=bastia', u'|B-ORG'): 2.865025
(u'word.lower=nantes', u'|B-ORG'): 2.862773
(u'word[-2:]=45', u'|O'): 2.859702
(u'word[-3:]=H1', u'|O'): 2.855025
(u'word[-2:]=H1', u'|O'): 2.855025
(u'word.lower=h1', u'|O'): 2.855025
(u'word[-2:]=OJ', u'|B-ORG'): 2.854981
(u'word.lower=as', u'|O'): 2.854788
(u'word[-3:]=cor', u'|B-ORG'): 2.851842
(u'word.lower=pivotal', u'|B-PER'): 2.847795
(u'word.lower=vanlandingham', u'|B-PER'): 2.845799
(u'word.lower=when', u'|O'): 2.83011
(u'word[-3:]=C', u'|O'): 2.829701
(u'word[-2:]=C', u'|O'): 2.829701
(u'word.lower=bonds', u'|B-PER'): 2.828594
(u'word.lower=classic', u'|I-MISC'): 2.828532
(u'+1:word.lower=cup', u'|B-MISC'): 2.828077
(u'word.lower=israeli', u'|B-MISC'): 2.822062
(u'word.lower

(u'word[-3:]=ath', u'|I-PER'): 1.28553
(u'-1:word.lower=compatriot', u'|B-PER'): 1.285296
(u'-1:word.lower=banco', u'|I-ORG'): 1.285011
(u'word.lower=federation', u'|I-LOC'): 1.283645
(u'word[-3:]=ans', u'|I-MISC'): 1.282667
(u'word.lower=you', u'|O'): 1.282644
(u'word[-2:]=ga', u'|I-PER'): 1.282304
(u'word[-3:]=TAN', u'|B-LOC'): 1.281565
(u'word[-3:]=ary', u'|I-LOC'): 1.281064
(u'word.lower=gente', u'|B-ORG'): 1.280892
(u'word.lower=federal', u'|B-ORG'): 1.280699
(u'word[-3:]=ort', u'|I-MISC'): 1.280601
(u'-1:word.lower=fred', u'|I-PER'): 1.279168
(u'word.lower=hong', u'|B-MISC'): 1.278212
(u'word[-3:]=pic', u'|B-MISC'): 1.277546
(u'word[-3:]=ND', u'|B-ORG'): 1.277399
(u'word.lower=nd', u'|B-ORG'): 1.277399
(u'-1:word.lower=1', u'|B-ORG'): 1.276895
(u'word.lower=captain', u'|O'): 1.275415
(u'word[-3:]=ela', u'|B-LOC'): 1.274952
(u'word.lower=kornblum', u'|B-PER'): 1.274836
(u'word[-3:]=ann', u'|B-ORG'): 1.273793
(u'+1:word.lower=states', u'|B-LOC'): 1.273438
(u'+1:word.lower=bus', u'|

(u'-1:word.lower=swiss', u'|B-MISC'): 0.758219
(u'+1:word.lower=win', u'|B-ORG'): 0.758136
(u'word[-2:]=AT', u'|O'): 0.758082
(u'+1:word.lower=television', u'|B-ORG'): 0.757279
(u'-1:word.lower=bosnian', u'|I-MISC'): 0.757054
(u"+1:word.lower='", u'|B-PER'): 0.756231
(u'word.lower=of', u'|I-ORG'): 0.756099
(u'word[-2:]=my', u'|I-ORG'): 0.755689
(u'word[-3:]=ley', u'|B-PER'): 0.755497
(u'word[-3:]=tes', u'|I-LOC'): 0.755409
(u'-1:word.lower=of', u'|I-ORG'): 0.755337
(u'+1:word.lower=kingdom', u'|B-LOC'): 0.755237
(u'word[-3:]=PEX', u'|B-ORG'): 0.755083
(u'word.lower=pulpex', u'|B-ORG'): 0.755083
(u'-1:word.lower=vs.', u'|B-PER'): 0.755067
(u'word.lower=masood', u'|B-PER'): 0.754759
(u'word.lower=free', u'|I-LOC'): 0.754211
(u'word.lower=its', u'|O'): 0.753554
(u'-1:word.lower=sudan', u'|I-ORG'): 0.753296
(u'+1:word.lower=rights', u'|O'): 0.753116
(u'-1:word.lower=main', u'|B-MISC'): 0.753105
(u'word.lower=assembly', u'|I-ORG'): 0.752989
(u'word.lower=taibe', u'|B-ORG'): 0.752267
(u'word

(u'-1:word.lower=ltd', u'|O'): 0.391282
(u'word.lower=suspected', u'|O'): 0.390917
(u'+1:word.lower=government', u'|B-MISC'): 0.390858
(u'+1:word.lower=bank', u'|I-ORG'): 0.390848
(u'+1:word.isupper=True', u'|B-MISC'): 0.39074
(u'word[-2:]=ve', u'|O'): 0.390479
(u'word[-3:]=zon', u'|B-LOC'): 0.390272
(u'-1:word.lower=accused', u'|B-LOC'): 0.389383
(u'word[-3:]=igh', u'|B-LOC'): 0.389138
(u'word.lower=motorola', u'|B-ORG'): 0.389062
(u'word.lower=jamaica', u'|B-LOC'): 0.389029
(u'word[-2:]=LI', u'|B-LOC'): 0.388428
(u'-1:word.lower=state', u'|I-ORG'): 0.38839
(u'word[-3:]=mad', u'|B-PER'): 0.387642
(u'word[-2:]=OF', u'|O'): 0.387625
(u'+1:word.lower=sale', u'|O'): 0.387284
(u'word[-3:]=ria', u'|B-ORG'): 0.38728
(u'word[-2:]=te', u'|O'): 0.386623
(u'word.lower=tyrrell', u'|B-ORG'): 0.386199
(u'-1:word.lower=bold', u'|I-ORG'): 0.386057
(u'+1:word.lower=ogilvy', u'|I-ORG'): 0.386057
(u'-1:word.lower=african', u'|I-LOC'): 0.385841
(u'word.lower=fort', u'|B-LOC'): 0.385787
(u'-1:word.lower=s

(u'word[-2:]=et', u'|I-ORG'): 0.128505
(u'word[-3:]=Ken', u'|B-PER'): 0.12819
(u'word.lower=ken', u'|B-PER'): 0.12819
(u'word[-3:]=MLN', u'|O'): 0.128094
(u'word[-2:]=LN', u'|O'): 0.128094
(u'word[-2:]=AL', u'|B-ORG'): 0.127845
(u'word[-2:]=ie', u'|I-PER'): 0.127831
(u'word[-3:]=Sun', u'|I-ORG'): 0.127586
(u'word.lower=sun', u'|I-ORG'): 0.127586
(u'+1:word.lower=parma', u'|O'): 0.127538
(u'word.lower=stocks', u'|O'): 0.127288
(u'-1:word.lower=replace', u'|B-PER'): 0.127259
(u'-1:word.lower=prince', u'|B-PER'): 0.127198
(u'word.lower=cork', u'|I-PER'): 0.127064
(u'+1:word.lower=financial', u'|B-ORG'): 0.12629
(u'word[-2:]=NG', u'|I-LOC'): 0.126287
(u'-1:word.lower=by', u'|B-PER'): 0.126137
(u'word.lower=el', u'|B-ORG'): 0.125987
(u'word[-2:]=fe', u'|I-ORG'): 0.125923
(u'-1:word.lower=japanese', u'|B-ORG'): 0.125748
(u'word.lower=recomposed', u'|O'): 0.125739
(u'word[-3:]=Act', u'|I-MISC'): 0.12553
(u'word[-2:]=ue', u'|I-MISC'): 0.125485
(u'word[-3:]=ulf', u'|B-MISC'): 0.125338
(u'word.l

(u'+1:word.lower=rafferty', u'|B-PER'): 0.015001
(u'word.lower=practice', u'|O'): 0.014982
(u'word[-3:]=ner', u'|B-PER'): 0.014971
(u'word[-2:]=se', u'|B-PER'): 0.01486
(u'+1:word.lower=grade', u'|B-MISC'): 0.014846
(u'word[-2:]=ft', u'|I-PER'): 0.014831
(u'word[-3:]=(', u'|I-LOC'): 0.014824
(u'word[-2:]=(', u'|I-LOC'): 0.014824
(u'word.lower=(', u'|I-LOC'): 0.014824
(u'word[-3:]=ORK', u'|B-LOC'): 0.014804
(u'word.lower=prince', u'|B-PER'): 0.014804
(u'word.lower=national', u'|B-ORG'): 0.014771
(u'word[-3:]=nts', u'|I-ORG'): 0.014689
(u'+1:word.lower=front', u'|I-ORG'): 0.014612
(u'word[-2:]=RK', u'|B-LOC'): 0.014525
(u'word[-3:]=val', u'|I-MISC'): 0.014427
(u'word[-3:]=tie', u'|I-PER'): 0.014362
(u'word.lower=motor', u'|I-ORG'): 0.014233
(u'+1:word.lower=co', u'|B-ORG'): 0.014233
(u'word.lower=saskatchewan', u'|B-LOC'): 0.014209
(u'word[-3:]=aun', u'|B-PER'): 0.014182
(u'word.lower=shaun', u'|B-PER'): 0.014182
(u'word.lower=international', u'|O'): 0.014181
(u'word[-2:]=ne', u'|B-LOC')

(u'+1:word.lower=6', u'|B-LOC'): -0.525479
(u"+1:word.lower='s", u'|I-MISC'): -0.525604
(u'word[-3:]=ard', u'|O'): -0.525796
(u'+1:word.lower=on', u'|B-PER'): -0.527571
(u'word[-3:]=ine', u'|B-ORG'): -0.52974
(u'+1:word.lower=district', u'|I-ORG'): -0.530596
(u'+1:word.lower=was', u'|B-MISC'): -0.532027
(u'word.lower=st', u'|O'): -0.53211
(u'-1:word.lower=to', u'|O'): -0.535546
(u'+1:word.lower=.', u'|B-MISC'): -0.538742
(u'word.lower=washington', u'|B-ORG'): -0.540562
(u'+1:word.lower=newspaper', u'|O'): -0.541366
(u'word[-2:]=ON', u'|O'): -0.544114
(u'word.lower=west', u'|B-ORG'): -0.548008
(u'+1:word.lower=official', u'|O'): -0.549462
(u'+1:word.lower=7', u'|I-PER'): -0.550387
(u'word[-3:]=EAL', u'|O'): -0.556689
(u'-1:word.lower=and', u'|B-MISC'): -0.557212
(u'word[-3:]=AND', u'|O'): -0.558548
(u'word[-2:]=ni', u'|B-LOC'): -0.559219
(u'bias', u'|I-PER'): -0.559863
(u'word[-3:]=art', u'|O'): -0.560881
(u'word[-2:]=ru', u'|O'): -0.562863
(u'word.lower=city', u'|O'): -0.563414
(u'word

Set up a randomized search over the c1/c2 regularisation parameters with 3-fold cross-validation.

This will take HOURS to run. Note that max_iterations has been decreased to 20 (from 50) in order to expedite the task. This however causes a decrease in prediction performance compared to the previous cells.

In [19]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=20,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 82.9min finished


CPU times: user 1h 21min 32s, sys: 1min 1s, total: 1h 22min 33s
Wall time: 1h 23min 7s


Inspect the scores from the randomized search. Note that the f1-score doesn't vary very much with the c1/c2 parameters and is more dependent on max_iterations.

In [38]:
# Display, for every sampled candidate, the mean and standard deviation of its
# cross-validation score together with the c1/c2 values that were drawn.
rs.grid_scores_

[mean: 0.93933, std: 0.00517, params: {'c2': 0.05224537719245085, 'c1': 0.1552505912922396},
 mean: 0.93920, std: 0.00486, params: {'c2': 0.014664972677450788, 'c1': 0.2181638197432444},
 mean: 0.93563, std: 0.00529, params: {'c2': 0.058159123627147515, 'c1': 0.4849499133987302},
 mean: 0.93755, std: 0.00339, params: {'c2': 0.05878669973066139, 'c1': 0.28935072323291616},
 mean: 0.93915, std: 0.00312, params: {'c2': 0.027818150565818295, 'c1': 0.5510471061891358},
 mean: 0.93560, std: 0.00508, params: {'c2': 0.08667614497770654, 'c1': 0.40032222000802303},
 mean: 0.93943, std: 0.00361, params: {'c2': 0.08686049015897769, 'c1': 0.5662766651417782},
 mean: 0.93858, std: 0.00452, params: {'c2': 0.2121673953171459, 'c1': 0.41193448415135336},
 mean: 0.94031, std: 0.00454, params: {'c2': 0.04348195360013139, 'c1': 0.06138733444418406},
 mean: 0.93568, std: 0.00357, params: {'c2': 0.003559452214544051, 'c1': 1.21303629583201},
 mean: 0.93579, std: 0.00539, params: {'c2': 0.06606988307597234,

In [20]:
# Summarise the search. Each print receives one pre-formatted string, so the
# text is printed as-is instead of as a tuple when run under Python 2.
print('best params: {}'.format(rs.best_params_))
print('best CV score: {}'.format(rs.best_score_))
# Float divisor: with an integer size, Python 2's `/` would floor the result
# and the size would always be reported as a whole number of megabytes.
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000.0))

('best params:', {'c2': 0.025601748972799855, 'c1': 0.11303458841011417})
('best CV score:', 0.9406488963420524)
model size: 3.00M


Print the per-label scores and the features ordered by learned weight for the best model found by the randomized search.

In [27]:
# From here on `crf` is the refitted best model of the search.
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

# Print every (attribute, label) state feature with its learned weight, largest
# weight first; equal weights are ordered by the feature key. Written without
# `iteritems`, tuple-unpacking lambdas or the print statement so that it runs
# on both Python 2 and Python 3.
for key, value in sorted(crf.state_features_.items(),
                         key=lambda item: (item[1], item[0]),
                         reverse=True):
    print("%s: %s" % (key, value))


             precision    recall  f1-score   support

     |B-LOC      0.728     0.664     0.694      1669
    |B-MISC      0.654     0.671     0.662       705
     |B-ORG      0.746     0.435     0.549      1663
     |B-PER      0.728     0.776     0.751      1617
     |I-LOC      0.592     0.492     0.537       256
    |I-MISC      0.236     0.671     0.349       213
     |I-ORG      0.688     0.517     0.591       833
     |I-PER      0.787     0.912     0.845      1156
         |O      0.971     0.981     0.976     38554

avg / total      0.926     0.924     0.922     46666

(u'word[-3:]=I', u'|O'): 4.600259
(u'word[-2:]=I', u'|O'): 4.600259
(u'word.lower=i', u'|O'): 4.583866
(u'word.lower=a', u'|O'): 4.203804
(u'-1:word.lower=at', u'|B-LOC'): 3.583158
(u'word.isupper=False', u'|O'): 3.145201
(u'word[-3:]=A', u'|O'): 3.115499
(u'word[-2:]=A', u'|O'): 3.115499
(u'word.istitle=False', u'|O'): 3.102066
(u'+1:word.lower=3', u'|B-ORG'): 2.800347
(u'word[-2:]=T-', u'|O'): 2.755228
(u'wor

(u'word[-2:]=SH', u'|B-MISC'): 0.559365
(u'+1:word.lower=newsroom', u'|B-LOC'): 0.557292
(u'word.lower=city', u'|I-LOC'): 0.556958
(u'+1:word.lower=union', u'|B-ORG'): 0.555021
(u'+1:word.lower=)', u'|I-LOC'): 0.552779
(u'word.lower=co', u'|I-ORG'): 0.549524
(u'word[-2:]=TA', u'|B-ORG'): 0.547297
(u'-1:word.lower=an', u'|B-MISC'): 0.547079
(u'word.lower=this', u'|O'): 0.546702
(u'word[-2:]=ex', u'|B-ORG'): 0.545964
(u'word.lower=was', u'|O'): 0.544914
(u'word[-3:]=ole', u'|B-PER'): 0.544332
(u'word.lower=chicago', u'|B-ORG'): 0.542777
(u'word[-3:]=sin', u'|B-PER'): 0.542038
(u'word[-3:]=ate', u'|O'): 0.539482
(u'word[-3:]=nds', u'|B-LOC'): 0.538089
(u'word[-3:]=rld', u'|B-MISC'): 0.537939
(u'word[-3:]=was', u'|O'): 0.537187
(u'+1:word.lower=government', u'|B-MISC'): 0.536883
(u'word[-3:]=May', u'|O'): 0.535262
(u'word[-3:]=ign', u'|O'): 0.534248
(u'word[-2:]=gn', u'|O'): 0.534248
(u'word[-3:]=men', u'|O'): 0.534065
(u'word.lower=arafat', u'|B-PER'): 0.533588
(u'word[-3:]=St', u'|B-ORG'

(u'word[-3:]=BSE', u'|B-MISC'): 0.180853
(u'word.lower=bse', u'|B-MISC'): 0.180853
(u'-1:word.lower=jim', u'|I-PER'): 0.180648
(u'word[-2:]=nd', u'|I-LOC'): 0.180637
(u'word[-2:]=EY', u'|B-LOC'): 0.180632
(u'word[-3:]=ker', u'|I-PER'): 0.180606
(u'word.lower=day', u'|I-MISC'): 0.180285
(u'word[-3:]=Red', u'|B-ORG'): 0.18014
(u'-1:word.lower=pakistan', u'|O'): 0.180123
(u'word[-3:]=C.', u'|B-PER'): 0.180079
(u'word.lower=c.', u'|B-PER'): 0.180079
(u'word[-2:]=OF', u'|O'): 0.179978
(u'word[-3:]=kee', u'|B-ORG'): 0.179893
(u'word.lower=cairo', u'|B-LOC'): 0.179819
(u'+1:word.lower=index', u'|I-MISC'): 0.1798
(u'word.lower=fiorentina', u'|B-ORG'): 0.179597
(u'word[-3:]=OB', u'|B-ORG'): 0.17959
(u'word.lower=ob', u'|B-ORG'): 0.17959
(u'word[-2:]=FK', u'|B-ORG'): 0.179564
(u'+1:word.lower=95', u'|B-MISC'): 0.179554
(u'word[-3:]=OUL', u'|B-LOC'): 0.179463
(u'word[-3:]=ose', u'|B-PER'): 0.17931
(u'word[-3:]=nte', u'|B-ORG'): 0.1793
(u'+1:word.lower=three', u'|O'): 0.179139
(u'word.lower=singap

(u'+1:word.lower=27', u'|O'): 0.101731
(u'word[-3:]=ria', u'|B-ORG'): 0.101727
(u'word[-3:]=Gov', u'|O'): 0.101687
(u'word[-2:]=ou', u'|O'): 0.101667
(u'+1:word.lower=sept', u'|O'): 0.101596
(u'-1:word.lower=human', u'|I-ORG'): 0.101576
(u'word[-3:]=lla', u'|I-PER'): 0.101573
(u'word.lower=islamabad', u'|B-LOC'): 0.101521
(u'-1:word.lower=one', u'|B-MISC'): 0.101469
(u'word[-3:]=VfB', u'|B-ORG'): 0.101447
(u'word[-2:]=fB', u'|B-ORG'): 0.101447
(u'word.lower=vfb', u'|B-ORG'): 0.101447
(u'word.lower=commissioner', u'|O'): 0.101425
(u'word.lower=janeiro', u'|I-LOC'): 0.101378
(u'word[-3:]=rry', u'|I-PER'): 0.101367
(u'word[-3:]=ope', u'|I-ORG'): 0.101363
(u'word.lower=huddersfield', u'|B-ORG'): 0.101266
(u'+1:word.lower=prix', u'|I-MISC'): 0.101249
(u'word[-3:]=96', u'|I-MISC'): 0.10124
(u'word.lower=96', u'|I-MISC'): 0.10124
(u'word[-2:]=RO', u'|B-LOC'): 0.101231
(u'word.lower=plymouth', u'|B-ORG'): 0.101197
(u'+1:word.lower=stuttgart', u'|B-ORG'): 0.101175
(u'-1:word.lower=rio', u'|I-LO

(u'word[-3:]=kan', u'|B-ORG'): 0.068393
(u'+1:word.lower=ptt', u'|B-ORG'): 0.068377
(u'word[-3:]=ard', u'|B-MISC'): 0.068346
(u'+1:word.lower=42', u'|B-ORG'): 0.068345
(u'-1:word.lower=ieng', u'|I-PER'): 0.06831
(u'word[-3:]=INS', u'|O'): 0.068299
(u'word[-3:]=St.', u'|B-LOC'): 0.068261
(u'word.lower=st.', u'|B-LOC'): 0.068261
(u'-1:word.lower=gazeta', u'|I-ORG'): 0.068244
(u'word.lower=despite', u'|O'): 0.068234
(u'word[-2:]=14', u'|O'): 0.068166
(u'+1:word.lower=1-1', u'|O'): 0.068166
(u'word.lower=puchon', u'|B-ORG'): 0.068114
(u'+1:word.lower=sri', u'|O'): 0.068083
(u'word.lower=kashmir', u'|B-LOC'): 0.068033
(u'word[-3:]=13', u'|O'): 0.067986
(u'word.lower=13', u'|O'): 0.067986
(u'word[-3:]=NIS', u'|O'): 0.067942
(u'word.lower=municipal', u'|I-ORG'): 0.06793
(u'word[-3:]=nin', u'|B-PER'): 0.067927
(u'word[-2:]=AK', u'|B-ORG'): 0.067914
(u'word[-3:]=All', u'|B-ORG'): 0.067906
(u'-1:word.lower=frenchman', u'|B-PER'): 0.067871
(u'+1:word.lower=announcement', u'|O'): 0.067822
(u'word[

(u'word[-2:]=tu', u'|B-MISC'): 0.054279
(u'word.lower=cska', u'|B-ORG'): 0.054274
(u'-1:word.lower=iraq', u'|O'): 0.054269
(u'word[-3:]=ise', u'|O'): 0.054268
(u'+1:word.lower=45', u'|B-ORG'): 0.054249
(u'word.lower=monrovia', u'|B-LOC'): 0.054243
(u'word[-3:]=cer', u'|I-MISC'): 0.054241
(u'+1:word.lower=interest', u'|B-MISC'): 0.054237
(u'word[-2:]=ys', u'|O'): 0.054219
(u'word[-3:]=eto', u'|B-LOC'): 0.054201
(u'-1:word.lower=hutnik', u'|I-ORG'): 0.054201
(u'word.lower=athens', u'|B-ORG'): 0.054196
(u'word.lower=darlington', u'|B-ORG'): 0.054167
(u'word[-2:]=yk', u'|I-PER'): 0.054163
(u'word[-2:]=es', u'|O'): 0.054154
(u'-1:word.lower=roberto', u'|I-PER'): 0.05415
(u'+1:word.lower=contract', u'|O'): 0.05414
(u'-1:word.lower=polonia', u'|I-ORG'): 0.054119
(u'word[-3:]=tum', u'|I-PER'): 0.054107
(u'word.lower=liberal', u'|B-ORG'): 0.054077
(u'word[-3:]=zew', u'|B-ORG'): 0.054074
(u'word.lower=widzew', u'|B-ORG'): 0.054074
(u'-1:word.lower=conservative', u'|B-ORG'): 0.054017
(u'word.lowe

(u'word.lower=stallone', u'|B-PER'): 0.042832
(u'word[-3:]=fel', u'|B-PER'): 0.042815
(u'word.lower=reiffel', u'|B-PER'): 0.042815
(u'-1:word.lower=24-year-old', u'|B-PER'): 0.042812
(u'word.lower=moada', u'|B-PER'): 0.042789
(u'word[-3:]=iro', u'|I-LOC'): 0.042763
(u'word[-3:]=ute', u'|I-ORG'): 0.04275
(u'word.lower=bohdan', u'|B-PER'): 0.042729
(u'+1:word.lower=ulihrach', u'|B-PER'): 0.042729
(u'word.lower=penrose', u'|B-PER'): 0.042726
(u'-1:word.lower=unseeded', u'|B-PER'): 0.042719
(u'word[-3:]=aan', u'|B-PER'): 0.042717
(u'+1:word.lower=midwest', u'|B-LOC'): 0.042715
(u'word[-3:]=uca', u'|B-ORG'): 0.042713
(u'word.lower=cocu', u'|B-PER'): 0.042713
(u'word.lower=setubal', u'|B-ORG'): 0.042711
(u'word.lower=bailey', u'|I-PER'): 0.042711
(u'word.lower=braga', u'|B-ORG'): 0.042694
(u'word.lower=auckland', u'|B-ORG'): 0.042692
(u'word[-3:]=SER', u'|O'): 0.042685
(u'word.lower=ivanisevic', u'|B-PER'): 0.042682
(u'word.lower=bertelsmann', u'|B-ORG'): 0.042676
(u'word.lower=patricia', u'

(u'word.lower=azerbaijan', u'|B-LOC'): 0.03289
(u'-1:word.lower=herve', u'|I-PER'): 0.032875
(u'+1:word.lower=westhuizen', u'|I-PER'): 0.03287
(u'word.lower=9002', u'|I-MISC'): 0.032868
(u'-1:word.lower=iso', u'|I-MISC'): 0.032868
(u'+1:word.lower=elections', u'|O'): 0.032868
(u'word[-3:]=Liu', u'|B-ORG'): 0.032864
(u'word.lower=liu', u'|B-ORG'): 0.032864
(u'+1:word.lower=west', u'|B-LOC'): 0.032861
(u'word[-3:]=gei', u'|B-PER'): 0.032857
(u'word[-3:]=kou', u'|B-LOC'): 0.032855
(u'-1:word.lower=azad', u'|I-LOC'): 0.032853
(u'word.lower=development', u'|I-ORG'): 0.032849
(u'-1:word.lower=bj', u'|I-ORG'): 0.032842
(u'word[-3:]=ct.', u'|O'): 0.032839
(u'word.lower=oct.', u'|O'): 0.032839
(u'word.lower=make', u'|O'): 0.032838
(u'word.lower=u.s.-mediated', u'|B-MISC'): 0.032827
(u'word.lower=911', u'|I-MISC'): 0.032827
(u'-1:word.lower=ftse', u'|I-MISC'): 0.032827
(u'-1:word.lower=wisc', u'|I-LOC'): 0.032826
(u'-1:word.lower=colo', u'|I-LOC'): 0.032824
(u'-1:word.lower=bbc', u'|I-ORG'): 0.0

(u'+1:word.lower=units', u'|B-ORG'): 0.028331
(u'+1:word.lower=4th', u'|B-PER'): 0.02833
(u'+1:word.lower=matin', u'|B-ORG'): 0.028326
(u'word.lower=resalat', u'|B-ORG'): 0.028325
(u'word.lower=sean', u'|B-PER'): 0.028322
(u'word.lower=krpaco', u'|B-ORG'): 0.028322
(u'+1:word.lower=vietnamese', u'|O'): 0.028322
(u'+1:word.lower=vs', u'|O'): 0.028317
(u'+1:word.lower=country', u'|I-MISC'): 0.028316
(u'+1:word.lower=must', u'|B-LOC'): 0.028311
(u'word[-3:]=ehe', u'|B-PER'): 0.028306
(u'word[-3:]=LB', u'|I-ORG'): 0.028302
(u'word[-2:]=LB', u'|I-ORG'): 0.028302
(u'-1:word.lower=suedwest', u'|I-ORG'): 0.028302
(u'+1:word.lower=ban', u'|I-MISC'): 0.028299
(u'word.lower=sidor', u'|B-ORG'): 0.028296
(u'word[-3:]=AD', u'|B-ORG'): 0.028292
(u'word.lower=sky', u'|B-ORG'): 0.028291
(u'+1:word.lower=begins', u'|B-PER'): 0.028287
(u'+1:word.lower=50', u'|I-PER'): 0.028285
(u'word.lower=sierd', u'|I-PER'): 0.028284
(u'-1:word.lower=hurte', u'|I-PER'): 0.028284
(u'+1:word.lower=zylstra', u'|I-PER'): 0

(u'word.lower=cherbourg', u'|B-LOC'): 0.025022
(u'word.lower=sciences', u'|I-ORG'): 0.025015
(u'-1:word.lower=chief', u'|O'): 0.02501
(u'word[-3:]=lub', u'|O'): 0.025006
(u'word[-3:]=eke', u'|B-ORG'): 0.025006
(u'word[-2:]=op', u'|B-ORG'): 0.025005
(u'+1:word.lower=teplice', u'|B-ORG'): 0.025
(u'word.lower=sandrine', u'|B-PER'): 0.024999
(u'+1:word.lower=testud', u'|B-PER'): 0.024999
(u'word[-3:]=chi', u'|I-PER'): 0.024996
(u'word.lower=kamyshin', u'|I-ORG'): 0.024996
(u'word.lower=kerametal', u'|B-ORG'): 0.024992
(u'+1:word.lower=dubnica', u'|B-ORG'): 0.024992
(u'word.lower=unless', u'|O'): 0.024991
(u'-1:word.lower=law', u'|B-MISC'): 0.024988
(u'word[-3:]=num', u'|O'): 0.024983
(u'word.lower=saint-germain', u'|I-ORG'): 0.024978
(u'word.lower=bergen', u'|B-LOC'): 0.024977
(u'+1:word.lower=8.57', u'|B-LOC'): 0.024972
(u'-1:word.lower=builds', u'|B-LOC'): 0.024961
(u'word.lower=desmond', u'|B-PER'): 0.024959
(u'word.lower=real', u'|B-ORG'): 0.024957
(u'word.lower=ago', u'|O'): 0.024956


(u'word.lower=arjuna', u'|B-PER'): 0.022017
(u'word.lower=leopold', u'|I-PER'): 0.022013
(u'-1:word.lower=evelyn', u'|I-PER'): 0.022013
(u'+1:word.lower=bhutto', u'|B-PER'): 0.022013
(u'word[-2:]=8.', u'|O'): 0.02201
(u'-1:word.lower=economic', u'|I-ORG'): 0.022005
(u'word[-3:]=sco', u'|B-ORG'): 0.022003
(u'+1:word.lower=stankowski', u'|B-PER'): 0.021999
(u'word.lower=saskatchewan', u'|B-LOC'): 0.021976
(u'word[-3:]=ouc', u'|I-ORG'): 0.021974
(u'word.lower=harden', u'|I-PER'): 0.021971
(u'word[-3:]=Mir', u'|B-PER'): 0.021967
(u'word.lower=mir', u'|B-PER'): 0.021967
(u'+1:word.lower=zaman', u'|B-PER'): 0.021967
(u'word.lower=laca', u'|I-PER'): 0.021962
(u'+1:word.lower=145', u'|I-PER'): 0.02196
(u'-1:word.lower=wife', u'|B-PER'): 0.021959
(u'-1:word.lower=all-rounder', u'|B-PER'): 0.021959
(u'word.lower=food', u'|I-ORG'): 0.021954
(u'word[-3:]=int', u'|B-ORG'): 0.021951
(u'word[-3:]=994', u'|O'): 0.021948
(u'word.lower=1994', u'|O'): 0.021948
(u'-1:word.lower=hosni', u'|I-PER'): 0.02194

(u'+1:word.lower=natalya', u'|O'): 0.01713
(u'+1:word.lower=nurses', u'|B-MISC'): 0.017127
(u'+1:word.lower=venezuela', u'|O'): 0.017125
(u'word.lower=led', u'|O'): 0.017123
(u'word[-3:]=dra', u'|I-PER'): 0.017117
(u'word.lower=hours', u'|O'): 0.017114
(u'+1:word.lower=79', u'|I-PER'): 0.017113
(u'+1:word.lower=spain', u'|O'): 0.017112
(u'word.lower=rainier', u'|I-PER'): 0.01711
(u'word.lower=claydon', u'|I-PER'): 0.01711
(u'word[-2:]=dh', u'|I-PER'): 0.017109
(u'word.lower=brown', u'|B-PER'): 0.017103
(u'-1:word.lower=candidate', u'|B-PER'): 0.017103
(u'+1:word.lower=midfielder', u'|I-ORG'): 0.017103
(u'-1:word.lower=randy', u'|I-PER'): 0.017102
(u'-1:word.lower=islamic', u'|I-MISC'): 0.017091
(u'+1:word.lower=fixtures', u'|O'): 0.017081
(u'-1:word.lower=today', u'|O'): 0.017078
(u'word.lower=robson', u'|I-PER'): 0.017077
(u'+1:word.lower=fifth', u'|O'): 0.017073
(u'word.lower=least', u'|O'): 0.017072
(u'word.lower=soldiers', u'|O'): 0.01707
(u'+1:word.lower=negotiations', u'|O'): 0.0

(u'word.lower=yorkers', u'|I-MISC'): 0.014959
(u'-1:word.lower=hang', u'|I-MISC'): 0.014959
(u'word.lower=osteopathic', u'|I-LOC'): 0.014957
(u'word.lower=borough', u'|I-LOC'): 0.014957
(u'-1:word.lower=avalon', u'|I-LOC'): 0.014957
(u'-1:word.lower=volcanic', u'|I-ORG'): 0.014956
(u'+1:word.lower=seismologicial', u'|I-ORG'): 0.014956
(u'word.lower=presbyterian', u'|I-LOC'): 0.014955
(u'word.lower=suite', u'|I-MISC'): 0.014954
(u'-1:word.lower=venezolana', u'|I-ORG'): 0.014954
(u'-1:word.lower=gynt', u'|I-MISC'): 0.014954
(u'+1:word.lower=honours', u'|I-MISC'): 0.014954
(u'+1:word.lower=guayana', u'|I-ORG'): 0.014954
(u'-1:word.lower=paralympic', u'|I-MISC'): 0.014953
(u'-1:word.lower=lady', u'|I-LOC'): 0.014953
(u'+1:word.lower=farewell', u'|I-MISC'): 0.014952
(u'+1:word.lower=ren', u'|I-MISC'): 0.01495
(u'-1:word.lower=woodlands', u'|I-LOC'): 0.014949
(u'word[-2:]=EV', u'|B-PER'): 0.014947
(u'-1:word.lower=maru', u'|I-MISC'): 0.014947
(u'word.lower=tarnovo', u'|I-LOC'): 0.014946
(u'-

(u'word[-2:]=YA', u'|B-PER'): 0.014328
(u'-1:word.lower=four-nation', u'|B-MISC'): 0.014326
(u'+1:word.lower=shipyard', u'|B-MISC'): 0.014326
(u'+1:word.lower=4.0', u'|I-MISC'): 0.014326
(u'+1:word.lower=0.12', u'|B-MISC'): 0.014326
(u'word.lower=acting', u'|O'): 0.014325
(u'-1:word.lower=georgia', u'|B-MISC'): 0.014322
(u'-1:word.lower=brigade', u'|O'): 0.014321
(u'+1:word.lower=0.04', u'|B-MISC'): 0.01432
(u'word.lower=tunbridge', u'|B-LOC'): 0.014319
(u'word[-3:]=DER', u'|B-PER'): 0.014317
(u'word[-3:]=Dfl', u'|B-MISC'): 0.014314
(u'word[-2:]=fl', u'|B-MISC'): 0.014314
(u'word.lower=dfl', u'|B-MISC'): 0.014314
(u'word.lower=flora', u'|B-LOC'): 0.014313
(u'+1:word.lower=perfetti', u'|B-LOC'): 0.014313
(u'word.lower=charlton', u'|I-PER'): 0.014312
(u'+1:word.lower=prediction', u'|O'): 0.014312
(u'word.lower=transcendental', u'|B-MISC'): 0.014309
(u'+1:word.lower=museeuw', u'|B-PER'): 0.014309
(u'+1:word.lower=ponders', u'|B-PER'): 0.014308
(u'word.lower=taiwan', u'|I-ORG'): 0.014305
(

(u'+1:word.lower=11.55', u'|B-MISC'): 0.013502
(u'word[-3:]=23', u'|O'): 0.013501
(u'word.lower=23', u'|O'): 0.013501
(u'+1:word.lower=policemen', u'|B-MISC'): 0.0135
(u'word.lower=steel', u'|I-ORG'): 0.013498
(u'word.lower=egyptians', u'|B-MISC'): 0.013498
(u'word.lower=cevaer', u'|B-PER'): 0.013498
(u'+1:word.lower=passed', u'|B-ORG'): 0.013498
(u'word[-3:]=tsi', u'|B-PER'): 0.013497
(u'word.lower=schreuder', u'|B-PER'): 0.013497
(u'word.lower=ryszard', u'|B-PER'): 0.013497
(u'+1:word.lower=42.4', u'|B-MISC'): 0.013497
(u'+1:word.lower=0:10', u'|B-PER'): 0.013497
(u'-1:word.lower=vault', u'|B-PER'): 0.013495
(u'+1:word.lower=mcgilley', u'|B-PER'): 0.013494
(u'word.lower=arctic', u'|B-MISC'): 0.013493
(u'-1:word.lower=husband', u'|B-PER'): 0.013493
(u'+1:word.lower=della', u'|B-PER'): 0.013493
(u'word.lower=idalecio', u'|B-PER'): 0.013492
(u'word.lower=brotherhood', u'|B-ORG'): 0.013492
(u'word[-3:]=ANP', u'|B-ORG'): 0.013491
(u'word.lower=lacher', u'|B-PER'): 0.013491
(u'word.lower=a

(u'+1:word.lower=loko', u'|B-PER'): 0.012915
(u'+1:word.lower=conference', u'|I-MISC'): 0.012914
(u'word[-3:]=ibe', u'|B-LOC'): 0.012912
(u'word[-3:]=ABN', u'|B-ORG'): 0.012912
(u'word.lower=zionists', u'|B-MISC'): 0.012912
(u'word.lower=abn', u'|B-ORG'): 0.012912
(u'-1:word.lower=fifth-ranked', u'|B-MISC'): 0.012912
(u'+1:word.lower=karim', u'|B-MISC'): 0.012912
(u'+1:word.lower=arrived', u'|O'): 0.012912
(u'+1:word.lower=amro', u'|B-ORG'): 0.012912
(u'+1:word.lower=allied', u'|B-MISC'): 0.012912
(u'+1:word.lower=teenager', u'|B-MISC'): 0.012911
(u'+1:word.lower=presse', u'|B-ORG'): 0.012911
(u'+1:word.lower=liability', u'|I-ORG'): 0.012911
(u'+1:word.lower=fail', u'|B-PER'): 0.012911
(u'word.lower=.robbie', u'|B-PER'): 0.01291
(u'word.lower=fulmar', u'|B-ORG'): 0.012909
(u'+1:word.lower=+353', u'|I-ORG'): 0.012909
(u'+1:word.lower=progress', u'|O'): 0.012906
(u'-1:word.lower=regarding', u'|B-ORG'): 0.012905
(u'+1:word.lower=183', u'|B-ORG'): 0.012904
(u'word[-3:]=wel', u'|B-MISC'): 0

(u'word[-3:]=ngh', u'|I-PER'): 0.012019
(u'word.lower=singh', u'|I-PER'): 0.012019
(u'word.lower=jewish', u'|I-MISC'): 0.012019
(u'-1:word.lower=navjot', u'|I-PER'): 0.012019
(u'+1:word.lower=dropped', u'|I-PER'): 0.012017
(u'word[-3:]=fey', u'|I-PER'): 0.012013
(u'word.lower=griffey', u'|I-PER'): 0.012013
(u'word.lower=lions', u'|B-ORG'): 0.012012
(u'-1:word.lower=gala', u'|I-PER'): 0.012012
(u'word.lower=commodity', u'|B-ORG'): 0.012011
(u'+1:word.lower=8.54', u'|B-LOC'): 0.012011
(u'word.lower=botha', u'|B-PER'): 0.012009
(u'+1:word.lower=high', u'|O'): 0.012008
(u'word.lower=aid', u'|O'): 0.012007
(u'+1:word.lower=single', u'|B-MISC'): 0.012007
(u'-1:word.lower=helped', u'|B-PER'): 0.012006
(u'-1:word.lower=11th-ranked', u'|B-MISC'): 0.012005
(u'+1:word.lower=bombing', u'|B-MISC'): 0.012005
(u'-1:word.lower=appeal', u'|O'): 0.012004
(u'+1:word.lower=indian', u'|B-MISC'): 0.012004
(u'word[-3:]=hin', u'|B-ORG'): 0.012003
(u'+1:word.lower=hand', u'|O'): 0.012003
(u'word.lower=chaser',

(u'word[-3:]=bao', u'|B-LOC'): 0.011075
(u'word.lower=bilbao', u'|B-LOC'): 0.011075
(u'-1:word.lower=vorskla', u'|I-ORG'): 0.011075
(u'-1:word.lower=restrict', u'|B-LOC'): 0.011074
(u'word.lower=western', u'|B-ORG'): 0.011073
(u'+1:word.lower=north', u'|I-ORG'): 0.011072
(u'+1:word.lower=handed', u'|B-ORG'): 0.011072
(u'-1:word.lower=comprises', u'|B-LOC'): 0.01107
(u'+1:word.lower=limited', u'|B-ORG'): 0.011069
(u'word[-3:]=oba', u'|B-LOC'): 0.011068
(u'word.lower=manitoba', u'|B-LOC'): 0.011068
(u'+1:word.lower=somavia', u'|B-PER'): 0.011068
(u'-1:word.lower=k.t.', u'|I-PER'): 0.011067
(u'+1:word.lower=alatas', u'|B-PER'): 0.011067
(u'word.lower=petrobulk', u'|B-ORG'): 0.011065
(u'+1:word.lower=rainbow', u'|B-ORG'): 0.011065
(u'-1:word.lower=sergeant', u'|B-PER'): 0.011064
(u'+1:word.lower=pearson', u'|B-PER'): 0.011064
(u'word.lower=nebraska', u'|B-LOC'): 0.011063
(u'word.lower=yugo', u'|B-ORG'): 0.011062
(u'+1:word.lower=212-859-1610', u'|I-ORG'): 0.011062
(u'+1:word.lower=fellow',

(u'-1:word.lower=biggest', u'|B-ORG'): 0.010232
(u'+1:word.lower=notice', u'|O'): 0.010232
(u'word.lower=torino', u'|B-ORG'): 0.010231
(u'-1:word.lower=social', u'|I-ORG'): 0.01023
(u'word[-3:]=acy', u'|B-PER'): 0.010228
(u'-1:word.lower=68', u'|B-PER'): 0.010228
(u'+1:word.lower=paul', u'|I-PER'): 0.010227
(u'+1:word.lower=1:51.528', u'|B-ORG'): 0.010227
(u'+1:word.lower=1:51.784', u'|B-ORG'): 0.010226
(u'+1:word.lower=1:51.006', u'|B-ORG'): 0.010226
(u'word.lower=activists', u'|O'): 0.010225
(u'+1:word.lower=38:41.756', u'|B-ORG'): 0.010225
(u'+1:word.lower=1:51.857', u'|B-ORG'): 0.010225
(u'+1:word.lower=1:51.588', u'|B-ORG'): 0.010225
(u'+1:word.lower=1:51.075', u'|B-ORG'): 0.010225
(u'+1:word.lower=38:49.595', u'|B-ORG'): 0.010224
(u'word[-3:]=shi', u'|B-PER'): 0.010223
(u'word.lower=pushpakumara', u'|B-PER'): 0.010223
(u'word.lower=accompanied', u'|O'): 0.010223
(u'word[-3:]=iks', u'|I-PER'): 0.01022
(u'word.lower=hendriks', u'|I-PER'): 0.01022
(u'-1:word.lower=pieter', u'|I-PER'

(u'+1:word.lower=triumph', u'|I-MISC'): 0.008983
(u'word.lower=rastegar', u'|I-PER'): 0.008982
(u'word.lower=publishing', u'|O'): 0.008982
(u'-1:word.lower=coordinate', u'|I-ORG'): 0.008981
(u'+1:word.lower=junior', u'|I-MISC'): 0.008979
(u'+1:word.lower=+1,161', u'|O'): 0.008979
(u'word.lower=vitaly', u'|B-PER'): 0.008977
(u'word.lower=role', u'|O'): 0.008977
(u'+1:word.lower=alliance', u'|B-MISC'): 0.008977
(u'+1:word.lower=universities', u'|I-ORG'): 0.008975
(u'+1:word.lower=4064/89', u'|O'): 0.008974
(u'word[-2:]=ab', u'|I-LOC'): 0.008971
(u'+1:word.lower=understand', u'|O'): 0.008971
(u'word[-3:]=mpt', u'|O'): 0.00897
(u'word[-3:]=cut', u'|O'): 0.00897
(u'+1:word.lower=117', u'|I-PER'): 0.00897
(u'word.lower=terence', u'|I-PER'): 0.008969
(u'+1:word.lower=realtors', u'|I-ORG'): 0.008968
(u'+1:word.lower=religious', u'|I-ORG'): 0.008966
(u'+1:word.lower=2.5', u'|O'): 0.008966
(u'+1:word.lower=capture', u'|O'): 0.008964
(u'word.lower=chang', u'|I-PER'): 0.00896
(u'+1:word.lower=kuhn

(u'word.lower=46', u'|O'): 0.007851
(u'-1:word.lower=johnson', u'|O'): 0.007849
(u'-1:word.lower=brown', u'|I-PER'): 0.007849
(u'word[-3:]=.K.', u'|B-PER'): 0.007846
(u'+1:word.lower=3-3', u'|O'): 0.007846
(u'-1:word.lower=agreement', u'|B-LOC'): 0.007845
(u'+1:word.lower=lynda', u'|O'): 0.007845
(u'+1:word.lower=jean-louis', u'|O'): 0.007841
(u'word[-3:]=sie', u'|I-PER'): 0.007839
(u'+1:word.lower=chavalit', u'|O'): 0.007839
(u'word.lower=chan', u'|B-PER'): 0.007838
(u'+1:word.lower=anti-abortion', u'|O'): 0.007837
(u'-1:word.lower=prince', u'|I-PER'): 0.007836
(u'word[-3:]=ubs', u'|B-ORG'): 0.007834
(u'word.lower=vega', u'|B-PER'): 0.007832
(u'+1:word.lower=4-48', u'|I-PER'): 0.007829
(u'+1:word.lower=capelle', u'|B-PER'): 0.007825
(u'+1:word.lower=racers', u'|O'): 0.007824
(u'+1:word.lower=picture', u'|B-MISC'): 0.007822
(u'-1:word.lower=industrial', u'|B-ORG'): 0.007821
(u'word[-3:]=abi', u'|B-ORG'): 0.007819
(u'word.lower=stanojlovic', u'|I-PER'): 0.007818
(u'word.lower=question',

(u'word.lower=gupta', u'|I-PER'): 0.006728
(u'+1:word.lower=segers', u'|B-PER'): 0.006728
(u'+1:word.lower=dehaan', u'|B-PER'): 0.006728
(u'+1:word.lower=command', u'|B-ORG'): 0.006727
(u'-1:word.lower=venture', u'|I-ORG'): 0.006726
(u'+1:word.lower=may', u'|O'): 0.006726
(u'word[-2:]=AM', u'|B-PER'): 0.006723
(u'word.lower=holon', u'|I-ORG'): 0.006723
(u'-1:word.lower=zafririm', u'|I-ORG'): 0.006723
(u'+1:word.lower=swift', u'|B-PER'): 0.006723
(u'word[-3:]=Rig', u'|I-ORG'): 0.006722
(u'word.lower=rig', u'|I-ORG'): 0.006722
(u'-1:word.lower=kryvy', u'|I-ORG'): 0.006722
(u'-1:word.lower=274', u'|B-PER'): 0.006722
(u'-1:word.lower=largely', u'|O'): 0.006719
(u'+1:word.lower=ericks', u'|B-PER'): 0.006719
(u'-1:word.lower=krzysztof', u'|I-PER'): 0.006717
(u'+1:word.lower=curren', u'|B-PER'): 0.006715
(u'+1:word.lower=chechen', u'|O'): 0.006715
(u'+1:word.lower=west', u'|I-MISC'): 0.006714
(u'-1:word.lower=green', u'|O'): 0.006713
(u'word.lower=lose', u'|O'): 0.006712
(u'+1:word.lower=gupt

(u'word[-3:]=353', u'|O'): 0.005323
(u'-1:word.lower=281', u'|B-PER'): 0.005321
(u'word[-3:]=aud', u'|I-PER'): 0.005319
(u'+1:word.lower=visited', u'|I-PER'): 0.005319
(u'-1:word.lower=na', u'|O'): 0.005318
(u'+1:word.lower=karunanidhi', u'|B-PER'): 0.005316
(u'word.lower=brings', u'|O'): 0.005315
(u'+1:word.lower=foil', u'|O'): 0.005314
(u'-1:word.lower=dies', u'|O'): 0.005313
(u'-1:word.lower=warsaw', u'|O'): 0.00531
(u'+1:word.lower=mountain', u'|B-LOC'): 0.005307
(u'+1:word.lower=auckland', u'|O'): 0.005307
(u'word[-3:]=n.a', u'|O'): 0.005306
(u'word[-2:]=.a', u'|O'): 0.005306
(u'word.lower=n.a', u'|O'): 0.005306
(u'+1:word.lower=started', u'|B-PER'): 0.005306
(u'-1:word.lower=snap', u'|O'): 0.005305
(u'word[-3:]=5-1', u'|O'): 0.005304
(u'+1:word.lower=seven', u'|B-ORG'): 0.005303
(u'+1:word.lower=roosevelt', u'|B-PER'): 0.005298
(u'word.lower=likely', u'|O'): 0.005294
(u'word[-3:]=its', u'|I-LOC'): 0.005293
(u'word.lower=stenning', u'|I-PER'): 0.005292
(u'word[-3:]=hin', u'|B-MISC

(u'word[-2:]=ee', u'|I-LOC'): 0.004137
(u'-1:word.lower=slim', u'|O'): 0.004137
(u'word.lower=dublin', u'|B-ORG'): 0.004136
(u'+1:word.lower=mines', u'|B-LOC'): 0.004136
(u'word.lower=needed', u'|O'): 0.004135
(u'word.lower=magazines', u'|O'): 0.004135
(u'word[-2:]=vo', u'|B-ORG'): 0.004132
(u'+1:word.lower=los', u'|O'): 0.004129
(u'+1:word.lower=s23.sep.96', u'|O'): 0.004128
(u'-1:word.lower=dubbed', u'|B-ORG'): 0.004127
(u'word.lower=club', u'|I-ORG'): 0.004126
(u'-1:word.lower=replace', u'|B-MISC'): 0.004126
(u'+1:word.lower=zickler', u'|B-PER'): 0.004125
(u'-1:word.lower=211', u'|O'): 0.004124
(u'+1:word.lower=trophy', u'|O'): 0.004124
(u'+1:word.lower=jayasuriya', u'|O'): 0.004124
(u'word[-3:]=odt', u'|B-ORG'): 0.00412
(u'-1:word.lower=sack', u'|O'): 0.00412
(u'-1:word.lower=last-gasp', u'|O'): 0.00412
(u'word[-3:]=AL-', u'|O'): 0.004117
(u'word[-2:]=L-', u'|O'): 0.004117
(u'word.lower=total-', u'|O'): 0.004117
(u'+1:word.lower=hathaway', u'|B-ORG'): 0.004116
(u'-1:word.lower=team

(u'+1:word.lower=uses', u'|O'): 0.003089
(u'word[-2:]=UK', u'|I-ORG'): 0.003088
(u'+1:word.lower=113.0', u'|O'): 0.003088
(u'word.lower=davidson', u'|B-PER'): 0.003086
(u'+1:word.lower=pre-sale', u'|O'): 0.003086
(u'word[-2:]=GT', u'|B-ORG'): 0.003085
(u'+1:word.lower=senate', u'|O'): 0.003085
(u'+1:word.lower=143.7', u'|O'): 0.003085
(u'word.lower=karen', u'|I-ORG'): 0.003084
(u'word.lower=galo', u'|B-PER'): 0.003084
(u'-1:word.lower=avonex', u'|O'): 0.003084
(u'+1:word.lower=futures', u'|O'): 0.003084
(u'+1:word.lower=blanco', u'|B-PER'): 0.003084
(u'+1:word.lower=marion', u'|B-ORG'): 0.003082
(u'word[-3:]=igi', u'|B-PER'): 0.00308
(u'+1:word.lower=pescosolido', u'|B-PER'): 0.003079
(u'-1:word.lower=pork', u'|I-ORG'): 0.003078
(u'-1:word.lower=puts', u'|O'): 0.003077
(u'word.lower=heiko', u'|B-PER'): 0.003076
(u'+1:word.lower=szonn', u'|B-PER'): 0.003076
(u'+1:word.lower=kamoga', u'|B-PER'): 0.003076
(u'word.lower=physicians', u'|B-ORG'): 0.003075
(u'+1:word.lower=tanui', u'|B-PER'):

(u'+1:word.lower=nursing', u'|B-MISC'): 0.001853
(u'word.lower=maslova', u'|I-PER'): 0.001852
(u'word.lower=iran', u'|I-LOC'): 0.001852
(u'word[-3:]=ury', u'|I-ORG'): 0.001847
(u'-1:word.lower=remove', u'|O'): 0.001847
(u'+1:word.lower=situation', u'|O'): 0.001847
(u'-1:word.lower=within', u'|B-LOC'): 0.001846
(u'-1:word.lower=from', u'|B-MISC'): 0.001845
(u'+1:word.lower=avoided', u'|B-LOC'): 0.001844
(u'-1:word.lower=kan', u'|O'): 0.001841
(u'+1:word.lower=or', u'|I-PER'): 0.001841
(u'word[-2:]=ck', u'|B-ORG'): 0.00184
(u'word.lower=kundic', u'|I-PER'): 0.001839
(u'-1:word.lower=zoran', u'|I-PER'): 0.001839
(u'word.lower=proposal', u'|O'): 0.001838
(u'-1:word.lower=digest', u'|O'): 0.001838
(u'word[-2:]=/8', u'|O'): 0.001835
(u'word.lower=djate', u'|I-PER'): 0.001835
(u'-1:word.lower=value', u'|O'): 0.001835
(u'-1:word.lower=al', u'|B-MISC'): 0.001835
(u'word.lower=girard-leno', u'|I-PER'): 0.001833
(u'+1:word.lower=refugees', u'|I-MISC'): 0.001833
(u'word.lower=maybank', u'|I-PER'):

(u'-1:word.lower=bought', u'|O'): 0.001124
(u'+1:word.lower=afghanistan', u'|O'): 0.001124
(u'word.lower=plays', u'|O'): 0.001123
(u'word.lower=officer', u'|O'): 0.001123
(u'-1:word.lower=amr', u'|I-ORG'): 0.001123
(u'+1:word.lower=presidency', u'|O'): 0.00112
(u'+1:word.lower=craig', u'|O'): 0.00112
(u'+1:word.lower=director', u'|O'): 0.001118
(u'word[-3:]=hul', u'|B-PER'): 0.001117
(u'-1:word.lower=kabul', u'|O'): 0.001117
(u"-1:word.lower='ll", u'|O'): 0.001117
(u'+1:word.lower=finish', u'|O'): 0.001117
(u'-1:word.lower=14.', u'|B-ORG'): 0.001116
(u'+1:word.lower=last', u'|O'): 0.001116
(u'-1:word.lower=27-year-old', u'|B-MISC'): 0.001112
(u'+1:word.lower=feed', u'|B-MISC'): 0.001112
(u'word[-2:]=L.', u'|B-PER'): 0.001111
(u'-1:word.lower=90', u'|O'): 0.001111
(u'+1:word.lower=problem', u'|O'): 0.00111
(u'+1:word.lower=princess', u'|O'): 0.00111
(u'-1:word.lower=64', u'|O'): 0.001108
(u'word.lower=65th', u'|O'): 0.001105
(u'-1:word.lower=vicario', u'|O'): 0.001105
(u'word.lower=hous

(u'word.lower=kasparkova', u'|I-PER'): 0.000979
(u'word.lower=hiraki', u'|I-PER'): 0.000979
(u'word.lower=henchoz', u'|I-PER'): 0.000979
(u'word.lower=hansie', u'|B-PER'): 0.000979
(u'word.lower=gulyayeva', u'|I-PER'): 0.000979
(u'word.lower=franka', u'|B-PER'): 0.000979
(u'word.lower=eleonora', u'|B-PER'): 0.000979
(u'word.lower=chopra', u'|I-PER'): 0.000979
(u'word.lower=bradtke', u'|I-PER'): 0.000979
(u'-1:word.lower=stefka', u'|I-PER'): 0.000979
(u'-1:word.lower=sarka', u'|I-PER'): 0.000979
(u'-1:word.lower=rika', u'|I-PER'): 0.000979
(u'-1:word.lower=daniele', u'|I-PER'): 0.000979
(u'-1:word.lower=completed', u'|O'): 0.000979
(u'-1:word.lower=300', u'|B-MISC'): 0.000979
(u'+1:word.lower=thorsett', u'|B-PER'): 0.000979
(u'+1:word.lower=sadova', u'|B-PER'): 0.000979
(u'+1:word.lower=rusedski', u'|B-PER'): 0.000979
(u'+1:word.lower=ntawulikura', u'|B-PER'): 0.000979
(u'+1:word.lower=kipkosgei', u'|B-PER'): 0.000979
(u'+1:word.lower=kibitok', u'|B-PER'): 0.000979
(u'+1:word.lower=diet

(u'+1:word.lower=subaru', u'|O'): 0.000559
(u'+1:word.lower=serra-zanetti', u'|B-PER'): 0.000559
(u'+1:word.lower=monin', u'|B-PER'): 0.000559
(u'+1:word.lower=miyagi', u'|B-PER'): 0.000559
(u'+1:word.lower=kandarr', u'|B-PER'): 0.000559
(u'+1:word.lower=kabul', u'|O'): 0.000559
(u'+1:word.lower=istanbul', u'|O'): 0.000559
(u'+1:word.lower=2-0', u'|I-ORG'): 0.000559
(u'word[-3:]=ARY', u'|B-MISC'): 0.000558
(u'word.lower=nanne', u'|B-PER'): 0.000558
(u'-1:word.lower=likud', u'|O'): 0.000558
(u'+1:word.lower=magnusson', u'|B-PER'): 0.000558
(u'+1:word.lower=grichina', u'|B-PER'): 0.000558
(u'+1:word.lower=dahlman', u'|B-PER'): 0.000558
(u'word.lower=scene', u'|O'): 0.000557
(u'+1:word.lower=spain', u'|I-ORG'): 0.000557
(u'+1:word.lower=krizan', u'|B-PER'): 0.000557
(u'+1:word.lower=high', u'|B-MISC'): 0.000557
(u'word[-3:]=ash', u'|I-PER'): 0.000556
(u'word[-3:]=Del', u'|B-PER'): 0.000556
(u'word[-3:]=eem', u'|B-PER'): 0.000555
(u'word.lower=frederick', u'|I-ORG'): 0.000555
(u'word.lower

(u'word.lower=equal', u'|O'): 0.00032
(u'word.lower=ending', u'|O'): 0.00032
(u'-1:word.lower=wellman', u'|O'): 0.00032
(u'-1:word.lower=names', u'|B-LOC'): 0.00032
(u'-1:word.lower=5-7', u'|O'): 0.00032
(u'word[-3:]=111', u'|O'): 0.000319
(u'word.lower=determine', u'|O'): 0.000319
(u'word.lower=111', u'|O'): 0.000319
(u'-1:word.lower=trader', u'|O'): 0.000319
(u'-1:word.lower=started', u'|O'): 0.000319
(u'-1:word.lower=karin', u'|I-PER'): 0.000319
(u'-1:word.lower=202', u'|O'): 0.000319
(u'+1:word.lower=below', u'|O'): 0.000319
(u'word.lower=reiziger', u'|I-PER'): 0.000318
(u'word[-3:]=No', u'|I-LOC'): 0.000317
(u'word[-2:]=No', u'|I-LOC'): 0.000317
(u'word.lower=emerged', u'|O'): 0.000317
(u'word.lower=addressed', u'|O'): 0.000317
(u'-1:word.lower=millwall', u'|O'): 0.000317
(u'-1:word.lower=expectations', u'|O'): 0.000317
(u'-1:word.lower=8-6', u'|O'): 0.000317
(u'-1:word.lower=extradition', u'|O'): 0.000316
(u'-1:word.lower=aid', u'|O'): 0.000316
(u'+1:word.lower=trinidad', u'|O'):

(u'-1:word.lower=1989', u'|O'): 0.000143
(u'+1:word.lower=unc', u'|O'): 0.000143
(u'+1:word.lower=reverts', u'|O'): 0.000143
(u'+1:word.lower=friend', u'|O'): 0.000143
(u'word[-3:]=211', u'|O'): 0.000142
(u'word.lower=waiting', u'|O'): 0.000142
(u'word.lower=suffering', u'|O'): 0.000142
(u'-1:word.lower=prospects', u'|O'): 0.000142
(u'-1:word.lower=places', u'|O'): 0.000142
(u'-1:word.lower=kafelnikov', u'|O'): 0.000142
(u'-1:word.lower=invitation', u'|O'): 0.000142
(u'-1:word.lower=defeated', u'|B-PER'): 0.000142
(u'-1:word.lower=companies', u'|O'): 0.000142
(u'+1:word.lower=negative', u'|O'): 0.000142
(u'+1:word.lower=extended', u'|O'): 0.000142
(u'+1:word.lower=escaped', u'|O'): 0.000142
(u'+1:word.lower=connection', u'|O'): 0.000142
(u'+1:word.lower=aol', u'|O'): 0.000142
(u'+1:word.lower=83rd', u'|O'): 0.000142
(u'+1:word.lower=30', u'|B-ORG'): 0.000142
(u'word[-3:]=-16', u'|O'): 0.000141
(u'word.lower=helicopter', u'|O'): 0.000141
(u'word.lower=fields', u'|O'): 0.000141
(u'word.l

(u'word.lower=kilometre', u'|O'): 3.7e-05
(u'word.lower=container', u'|O'): 3.7e-05
(u'word.lower=brain', u'|O'): 3.7e-05
(u'word.lower=announces', u'|O'): 3.7e-05
(u'word.lower=31st', u'|O'): 3.7e-05
(u'-1:word.lower=wakefield', u'|O'): 3.7e-05
(u'-1:word.lower=salvador', u'|O'): 3.7e-05
(u'-1:word.lower=renegade', u'|O'): 3.7e-05
(u'-1:word.lower=most', u'|B-ORG'): 3.7e-05
(u'-1:word.lower=makers', u'|O'): 3.7e-05
(u'-1:word.lower=liquidity', u'|O'): 3.7e-05
(u'-1:word.lower=harvey', u'|O'): 3.7e-05
(u'-1:word.lower=asian', u'|O'): 3.7e-05
(u'word[-3:]=1/3', u'|O'): 3.6e-05
(u'word.lower=slaughter', u'|O'): 3.6e-05
(u'word.lower=regarding', u'|O'): 3.6e-05
(u'word.lower=pre-tax', u'|O'): 3.6e-05
(u'word.lower=bearish', u'|O'): 3.6e-05
(u'word.lower=absentia', u'|O'): 3.6e-05
(u'word.lower=1968', u'|O'): 3.6e-05
(u'-1:word.lower=packed', u'|O'): 3.6e-05
(u'-1:word.lower=croat', u'|O'): 3.6e-05
(u'-1:word.lower=bury', u'|O'): 3.6e-05
(u'-1:word.lower=alberta', u'|O'): 3.6e-05
(u'+1:wor

(u'+1:word.lower=barrels', u'|O'): -0.0001
(u'+1:word.lower=arch-rivals', u'|O'): -0.0001
(u'word[-2:]=TA', u'|I-LOC'): -0.000101
(u'word.lower=telecommunications', u'|O'): -0.000101
(u'-1:word.lower=hockey', u'|O'): -0.000101
(u'-1:word.lower=four-day', u'|O'): -0.000101
(u'-1:word.lower=a', u'|B-ORG'): -0.000101
(u'+1:word.lower=towards', u'|O'): -0.000101
(u'+1:word.lower=title', u'|B-MISC'): -0.000101
(u'+1:word.lower=ferrigato', u'|O'): -0.000101
(u'word.lower=tankan', u'|O'): -0.000102
(u'word.lower=partners', u'|O'): -0.000102
(u'+1:word.lower=committee', u'|O'): -0.000102
(u'word[-3:]=ren', u'|B-MISC'): -0.000103
(u'word.lower=radical', u'|O'): -0.000103
(u'-1:word.lower=giant', u'|O'): -0.000103
(u'+1:word.lower=shi-ting', u'|O'): -0.000103
(u'+1:word.lower=performance', u'|O'): -0.000103
(u'+1:word.lower=might', u'|O'): -0.000103
(u'word.lower=domingo', u'|B-PER'): -0.000104
(u'+1:word.lower=rouble', u'|O'): -0.000104
(u'+1:word.lower=chili', u'|O'): -0.000104
(u'+1:word.lowe

(u'word[-3:]=ler', u'|B-MISC'): -0.001158
(u'-1:word.lower=dacom', u'|O'): -0.001158
(u'word[-2:]=bo', u'|I-PER'): -0.001163
(u'word[-3:]=ult', u'|I-PER'): -0.001164
(u'+1:word.lower=bridge', u'|B-ORG'): -0.001164
(u'-1:word.lower=julie', u'|O'): -0.001166
(u'-1:word.lower=accept', u'|O'): -0.001175
(u'word[-2:]=ap', u'|B-LOC'): -0.001177
(u'word.lower=white', u'|I-ORG'): -0.001177
(u'word.lower=jerusalem', u'|I-ORG'): -0.001177
(u'word[-3:]=oyd', u'|B-PER'): -0.001178
(u'word[-2:]=yd', u'|B-PER'): -0.001178
(u'word.lower=lloyd', u'|B-PER'): -0.001178
(u'-1:word.lower=gruppen', u'|O'): -0.001178
(u'word[-2:]=em', u'|I-MISC'): -0.001181
(u'-1:word.lower=sport', u'|O'): -0.001185
(u'+1:word.lower=lewis', u'|O'): -0.001188
(u'word[-3:]=rch', u'|I-PER'): -0.00119
(u'+1:word.lower=team', u'|I-ORG'): -0.001191
(u'word.lower=council', u'|B-MISC'): -0.001192
(u'word[-3:]=La', u'|I-ORG'): -0.001194
(u'+1:word.lower=j.', u'|O'): -0.001194
(u'-1:word.lower=systems', u'|O'): -0.001198
(u'+1:word.l

(u'+1:word.lower=nt', u'|O'): -0.004214
(u'+1:word.lower=100-30', u'|O'): -0.004215
(u'-1:word.lower=egyptair', u'|O'): -0.004217
(u'+1:word.lower=kick', u'|O'): -0.004217
(u'-1:word.lower=refuse', u'|O'): -0.004218
(u'-1:word.lower=dictator', u'|O'): -0.004224
(u'word[-3:]=yal', u'|O'): -0.004227
(u'-1:word.lower=brutal', u'|O'): -0.004227
(u'+1:word.lower=1860', u'|O'): -0.004227
(u'-1:word.lower=communist', u'|B-PER'): -0.004231
(u'-1:word.lower=repaid', u'|O'): -0.004235
(u'-1:word.lower=1997-98', u'|O'): -0.004236
(u'word[-3:]=Max', u'|O'): -0.004246
(u'word[-3:]=Fox', u'|B-ORG'): -0.004246
(u'word.lower=max', u'|O'): -0.004246
(u'word.lower=fox', u'|B-ORG'): -0.004246
(u'-1:word.lower=scoring', u'|O'): -0.004249
(u'+1:word.lower=missionaries', u'|O'): -0.004249
(u'-1:word.lower=fee', u'|O'): -0.004257
(u'+1:word.lower=152', u'|O'): -0.004261
(u'-1:word.lower=slammed', u'|O'): -0.004269
(u'-1:word.lower=utility', u'|O'): -0.00427
(u'-1:word.lower=checkfree', u'|O'): -0.00427
(u'+1

(u'+1:word.lower=partners', u'|O'): -0.014084
(u'word.lower=university', u'|O'): -0.014121
(u'word.lower=rapid', u'|O'): -0.014128
(u'+1:word.lower=laying', u'|O'): -0.01415
(u'+1:word.lower=9', u'|I-PER'): -0.014155
(u'word[-2:]=cs', u'|O'): -0.014161
(u'word[-3:]=les', u'|O'): -0.014167
(u'word[-3:]=04', u'|O'): -0.014167
(u'word.lower=villa', u'|I-PER'): -0.014167
(u'word.lower=04', u'|O'): -0.014167
(u'-1:word.lower=typical', u'|O'): -0.014167
(u'+1:word.lower=interbank', u'|O'): -0.014179
(u'word[-2:]=ie', u'|I-LOC'): -0.014195
(u'-1:word.lower=democracy', u'|O'): -0.014196
(u'word[-3:]=bor', u'|B-ORG'): -0.014197
(u'+1:word.lower=pitched', u'|O'): -0.0142
(u'-1:word.lower=george', u'|O'): -0.014206
(u'word[-3:]=kan', u'|O'): -0.014209
(u'word[-3:]=ash', u'|B-LOC'): -0.014218
(u'+1:word.lower=marketing', u'|O'): -0.01422
(u'word[-2:]=ES', u'|B-PER'): -0.014224
(u'word[-3:]=ung', u'|B-LOC'): -0.014235
(u'+1:word.lower=hovercrafts', u'|O'): -0.014236
(u'+1:word.lower=1970s', u'|O'):

(u'-1:word.lower=nations', u'|O'): -0.046492
(u'word[-3:]=lim', u'|O'): -0.046502
(u'word.lower=park', u'|O'): -0.046537
(u'word[-2:]=ff', u'|I-ORG'): -0.046597
(u'+1:word.lower=farmers', u'|O'): -0.046611
(u'word.lower=state', u'|B-LOC'): -0.046648
(u'+1:word.lower=22', u'|B-PER'): -0.046672
(u'+1:word.lower=4-0', u'|O'): -0.046786
(u'+1:word.lower=unless', u'|O'): -0.046868
(u'+1:word.lower=morinville', u'|O'): -0.046899
(u'word.isupper=True', u'|I-ORG'): -0.046977
(u'word[-2:]=rn', u'|I-ORG'): -0.04705
(u'word[-2:]=se', u'|I-PER'): -0.047191
(u'word[-3:]=urn', u'|O'): -0.047246
(u'word.lower=the', u'|I-LOC'): -0.047269
(u'word[-3:]=ane', u'|O'): -0.047297
(u'+1:word.lower=but', u'|B-ORG'): -0.04733
(u'word.lower=union', u'|B-ORG'): -0.047362
(u'+1:word.lower=militants', u'|O'): -0.047383
(u'word.lower=york', u'|B-ORG'): -0.047413
(u'+1:word.lower=two', u'|B-LOC'): -0.047418
(u'word.lower=bill', u'|O'): -0.047466
(u'word.lower=taiwan', u'|B-ORG'): -0.047475
(u'+1:word.lower=1996-08-2

(u'word[-3:]=her', u'|B-LOC'): -0.177379
(u'-1:word.lower=capital', u'|O'): -0.177886
(u'word[-3:]=ard', u'|B-LOC'): -0.177993
(u'word[-2:]=um', u'|B-ORG'): -0.17824
(u'word[-2:]=me', u'|I-ORG'): -0.178848
(u'word[-3:]=bed', u'|O'): -0.178861
(u'word[-3:]=sex', u'|O'): -0.179049
(u'word[-2:]=ns', u'|O'): -0.179585
(u'+1:word.istitle=True', u'|I-PER'): -0.179946
(u'word[-3:]=ice', u'|B-PER'): -0.180011
(u'word[-3:]=ing', u'|I-LOC'): -0.18011
(u'word[-2:]=ex', u'|B-LOC'): -0.180565
(u'word[-3:]=rch', u'|B-LOC'): -0.181112
(u'word[-3:]=RAL', u'|O'): -0.181326
(u'word[-3:]=nia', u'|B-MISC'): -0.181555
(u'-1:word.lower=former', u'|B-PER'): -0.181704
(u'word[-3:]=ity', u'|B-LOC'): -0.181808
(u'word.lower=at', u'|I-ORG'): -0.181846
(u'word[-3:]=can', u'|I-ORG'): -0.182464
(u'word[-2:]=ka', u'|I-PER'): -0.182562
(u'word[-3:]=mas', u'|O'): -0.182668
(u'word[-3:]=rry', u'|O'): -0.18274
(u'word[-3:]=ISS', u'|O'): -0.183407
(u'-1:word.lower="', u'|B-MISC'): -0.18344
(u'word.lower=west', u'|O'): -0

Save the trained model to disk.

In [23]:
import pickle

# Persist the trained CRF so it can be reloaded without retraining.
# Pickle is a binary format, so the file must be opened in binary mode; "wb"
# also truncates any previous file, so re-running this cell replaces the saved
# model instead of appending a second pickle after the stale one.
# The `with` block closes the handle even if pickle.dump raises.
with open("crf_ner_pol.mdl", "wb") as f:
    pickle.dump(crf, f)

Print the 20 highest-weighted (most likely) and the 20 lowest-weighted (least likely) label transitions learned by the CRF.

In [24]:
from collections import Counter

def print_transitions(trans_features):
    """Print one formatted line per transition weight.

    trans_features is an iterable of ((label_from, label_to), weight) pairs.
    Each pair is written as "<from> -> <to> <weight>", with the source label
    left-justified to 6 characters, the target label to 7, and the weight
    shown with six decimal places.
    """
    row_template = "%-6s -> %-7s %0.6f"
    for edge, weight in trans_features:
        source_label, target_label = edge
        print(row_template % (source_label, target_label, weight))

# Rank every learned transition by weight, highest first, once; the two
# reports below are the head and the tail of that single ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:
|B-LOC -> |I-LOC  5.345479
|B-PER -> |I-PER  5.174143
|I-ORG -> |I-ORG  4.609703
|I-MISC -> |I-MISC 4.569016
|B-ORG -> |I-ORG  4.533346
|B-MISC -> |I-MISC 3.129953
|I-PER -> |I-PER  1.794438
|O     -> |B-ORG  1.580775
|O     -> |B-LOC  1.509977
|O     -> |B-MISC 1.488459
|I-LOC -> |I-LOC  1.441020
|O     -> |B-PER  1.336434
|O     -> |O      0.822066
|B-LOC -> |O      0.562223
|B-PER -> |O      0.368204
|I-PER -> |O      0.046463
|B-MISC -> |B-MISC 0.024453
|B-MISC -> |O      0.021723
|B-MISC -> |B-PER  -0.068634
|B-LOC -> |B-MISC -0.095446

Top unlikely transitions:
|I-ORG -> |B-PER  -1.766387
|B-PER -> |B-LOC  -1.771607
|B-LOC -> |B-PER  -1.813408
|B-PER -> |B-MISC -1.937413
|B-MISC -> |I-PER  -2.015148
|B-MISC -> |I-ORG  -2.102599
|I-PER -> |I-ORG  -2.139164
|I-ORG -> |I-PER  -2.330169
|B-LOC -> |I-ORG  -2.514656
|B-ORG -> |B-PER  -2.631615
|B-ORG -> |B-LOC  -2.706786
|B-LOC -> |I-PER  -2.823604
|B-PER -> |B-ORG  -3.407979
|B-ORG -> |I-PER  -3.809023
|B-PER -