# NER

In [2]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report
import eli5
import scipy.stats
import matplotlib.pyplot as plt

## Dataset
Source: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus?select=ner_dataset.csv <br>
The original dataset was modified with the `data_modification.py` script before being loaded here.

In [3]:
# Load the pre-processed corpus, decoding it as ISO-8859-1.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; the value
# previously passed (True) was the default, so malformed rows still raise.
dframe = pd.read_csv("./data/my_dataset.csv", encoding="ISO-8859-1")

In [4]:
# Preview the loaded frame; the notebook display shows only the first and last rows.
dframe

Unnamed: 0,SentenceN,Word,POS,Tag
0,1,Thousands,NNS,O
1,1,of,IN,O
2,1,demonstrators,NNS,O
3,1,have,VBP,O
4,1,marched,VBN,O
...,...,...,...,...
1048570,47959,they,PRP,O
1048571,47959,responded,VBD,O
1048572,47959,to,TO,O
1048573,47959,the,DT,O


In [5]:
# List the distinct NER tags present in the "Tag" column.
dframe["Tag"].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [6]:
# Number of tokens per tag; the output shows how strongly 'O' dominates the corpus.
print(dframe["Tag"].value_counts())

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64


In [7]:
#dframe = dframe[:10000]

In [8]:
def agg_func(s):
    """Turn the rows of one sentence into a list of (word, POS, tag) tuples."""
    words = s["Word"].values.tolist()
    pos_tags = s["POS"].values.tolist()
    ner_tags = s["Tag"].values.tolist()
    return list(zip(words, pos_tags, ner_tags))


# One list of token tuples per SentenceN group.
grouped = dframe.groupby("SentenceN").apply(agg_func)
sentences = list(grouped)

In [9]:
# Show only the first two sentences: echoing the whole list would write tens of
# thousands of token lists into the notebook output.
sentences[:2]

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Families', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('soldiers', 'NNS', 'O'),
  ('killed', 'VBN', 'O'),
  ('in', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('conflict', 'NN', 'O'),
  ('joined', 'VBD', 'O'),
  ('the', 'DT', 'O'),
  ('protesters', 'NNS', 'O'),
  ('who', 'WP', 'O'),
  ('carried', 'VBD', 'O'),
  ('banners', 'NNS', 'O'),
  ('with', 'IN', 'O'),
  ('such', 'JJ', 'O'),
  ('slogans', 'NNS', 'O'),
  (

The word-shape features implemented below are taken from the book "Speech and Language Processing" by Daniel Jurafsky and James H. Martin:

![image-3.png](attachment:image-3.png) ![image-4.png](attachment:image-4.png)

In [10]:
def word_shape(word):
    """Map a word to its shape string.

    Uppercase letters become "X", every other alphabetic character becomes
    "x", digits become "d"; any other character is copied through unchanged.
    """
    def _symbol(ch):
        if ch.isalpha():
            return "X" if ch.isupper() else "x"
        if ch.isdigit():
            return "d"
        return ch

    return "".join(_symbol(ch) for ch in word)

def short_word_shape(word):
    """Return the word shape with each run of identical symbols collapsed to one.

    For example, a full shape of "Xxxx-dd" is shortened to "Xx-d".
    """
    full = word_shape(word)
    # Pair every symbol with its predecessor ("" stands in before the first
    # one) and keep only the symbols that start a new run.
    predecessors = [""] + list(full)
    return "".join(cur for before, cur in zip(predecessors, full) if cur != before)

def contains_hyphen(word):
    """Tell whether the word has at least one "-" character."""
    return any(ch == "-" for ch in word)

In [11]:
# Quick sanity check of the three helpers, one result per line.
print(
    word_shape("I.M.F"),
    short_word_shape("I.M.F"),
    contains_hyphen("hello"),
    sep="\n",
)

X.X.X
X.X.X
False


In [12]:
def _neighbor_features(token, offset):
    """Build the context features contributed by a neighboring token.

    token  -- the neighbor's tuple; index 0 is the word, index 1 the POS tag
    offset -- key prefix naming the neighbor's position ("-1" or "+1")
    """
    word, postag = token[0], token[1]
    return {
        offset + ':word.lower()': word.lower(),
        offset + ':word.istitle()': word.istitle(),
        offset + ':word.isupper()': word.isupper(),
        offset + ':postag': postag,
        offset + ':postag[:2]': postag[:2],
        offset + ':word_shape()': word_shape(word),
        offset + ':short_word_shape()': short_word_shape(word),
        offset + ':contains_hyphen()': contains_hyphen(word),
    }


def word2features(sent, i):
    """Return the feature dict for the token at position ``i`` of ``sent``.

    sent -- sequence of tuples whose first two items are the word and its POS tag
    i    -- index of the token to describe

    The dict holds features of the token itself, followed by features of the
    previous token (or the flag 'BOS' when there is none) and of the next
    token (or the flag 'EOS' when there is none).
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'word_shape()': word_shape(word),
        'short_word_shape()': short_word_shape(word),
        'contains_hyphen()': contains_hyphen(word),
    }

    # Left context, or the beginning-of-sentence marker.
    if i > 0:
        features.update(_neighbor_features(sent[i - 1], '-1'))
    else:
        features['BOS'] = True

    # Right context, or the end-of-sentence marker.
    if i < len(sent) - 1:
        features.update(_neighbor_features(sent[i + 1], '+1'))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Build the feature dict of every token in the sentence, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in the sentence."""
    tags = []
    for _token, _postag, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in the sentence."""
    words = []
    for word, _postag, _label in sent:
        words.append(word)
    return words

In [13]:
%%time
# Featurize and label every sentence, then split whole sentences (not single
# tokens) into train and test sets with a fixed seed.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.33,
)

Wall time: 15.4 s


In [15]:
%%time
# Train a CRF with the L-BFGS optimizer; c1 and c2 are the L1 and L2
# regularization coefficients (see the sklearn-crfsuite CRF API reference).
# NOTE(review): c1/c2 are presumably the best values found by the commented-out
# RandomizedSearchCV cell further down — confirm.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.24,
    c2=0.13,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 4min 6s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.24, c2=0.13,
    keep_tempfiles=None, max_iterations=100)

In [16]:
%%time
# Predict a tag sequence for every held-out sentence.
y_pred = crf.predict(X_test)

Wall time: 7.5 s


In [17]:
# Accuracy over all test tokens; 'O' is by far the most frequent tag, so this
# number says little about entity recognition quality.
metrics.flat_accuracy_score(y_test, y_pred)

0.9719917494625531

In [18]:
# Weighted-average precision over all tags, 'O' included.
metrics.flat_precision_score(y_test, y_pred, average='weighted')

0.9713891430523243

In [19]:
# Weighted-average F1 over all tags, 'O' included.
metrics.flat_f1_score(y_test, y_pred, average='weighted')

0.9715110040799858

In [20]:
# Keep only the entity labels: drop 'O' from the model's classes so the scores
# computed with `labels=labels` below ignore it.
labels = list(crf.classes_)
labels.remove('O')
labels

['B-geo',
 'B-gpe',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-per',
 'I-per',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-gpe',
 'B-art',
 'I-art',
 'I-nat']

In [21]:
# Weighted-average F1 restricted to the entity labels (no 'O').
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.8518759681995407

In [23]:
# Order labels by entity type first and by B/I prefix second, so that the B-
# and I- rows of each entity type sit next to each other in the report.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))
report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)

              precision    recall  f1-score   support

       B-art      0.396     0.148     0.216       128
       I-art      0.244     0.095     0.137       105
       B-eve      0.558     0.391     0.460       110
       I-eve      0.400     0.215     0.280        93
       B-geo      0.859     0.910     0.884     12321
       I-geo      0.805     0.798     0.801      2462
       B-gpe      0.971     0.933     0.952      5249
       I-gpe      0.886     0.574     0.696        68
       B-nat      0.649     0.329     0.436        73
       I-nat      0.857     0.286     0.429        21
       B-org      0.811     0.736     0.772      6716
       I-org      0.815     0.793     0.804      5490
       B-per      0.847     0.824     0.835      5616
       I-per      0.843     0.900     0.871      5672
       B-tim      0.934     0.888     0.911      6701
       I-tim      0.842     0.771     0.805      2051

   micro avg      0.861     0.847     0.854     52876
   macro avg      0.732   

In [24]:
# Inspect the trained model: the transition weights between tags and the
# highest-weighted features per tag (limited by top=30).
eli5.show_weights(crf, top=30)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,2.829,0.164,-3.758,0.192,-3.692,0.783,-7.49,-0.169,-3.173,0.156,-2.457,0.438,-7.497,1.324,-6.777,0.877,-7.232
B-art,-0.417,0.0,6.821,0.0,0.0,0.0,-0.669,-0.147,0.0,0.0,0.0,0.011,-1.629,-0.948,-1.519,-0.071,-0.812
I-art,-0.673,0.0,6.243,0.0,0.0,-0.804,-0.994,-0.367,0.0,0.0,0.0,-0.672,-1.131,0.0,-1.277,-0.575,-0.82
B-eve,-0.703,0.0,0.0,0.0,6.06,-0.489,-0.676,-0.921,0.0,0.0,0.0,-1.066,-1.224,-1.301,-1.276,0.5,-0.841
I-eve,-0.363,0.0,0.0,-1.419,5.778,-0.528,-0.657,-0.252,0.0,0.0,0.0,-0.509,-1.164,-1.159,-1.096,-0.708,-1.059
B-geo,0.407,0.166,-1.332,0.014,-1.526,-3.087,5.376,0.885,-2.157,0.0,-1.007,-0.228,-4.157,-0.755,-3.845,1.295,-3.124
I-geo,-0.189,1.744,-0.614,-0.316,-0.596,-2.03,4.457,-0.582,-1.154,0.0,-0.144,-0.217,-3.063,-0.144,-2.609,0.628,-2.326
B-gpe,0.311,-0.896,-1.787,-1.368,-1.657,-0.005,-3.86,-5.15,4.818,-0.075,-0.139,0.983,-4.488,-0.125,-3.649,-0.449,-2.814
I-gpe,-0.262,0.0,0.0,0.0,0.0,0.0,-0.512,-0.638,4.928,0.0,0.0,-0.474,-0.708,-0.011,-0.589,-0.547,-0.47
B-nat,-0.729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.834,-0.247,-0.329,-0.215,-0.787,-0.002,-0.026

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+6.379,word.lower():last,,,,,,,,,,,,,,,
+5.926,word.lower():month,,,,,,,,,,,,,,,
+5.000,BOS,,,,,,,,,,,,,,,
+4.957,word.lower():chairman,,,,,,,,,,,,,,,
+4.845,word.lower():jordanian,,,,,,,,,,,,,,,
+4.841,word.lower():columbia,,,,,,,,,,,,,,,
+4.552,word.lower():hurricane,,,,,,,,,,,,,,,
+4.454,word.lower():asian,,,,,,,,,,,,,,,
+4.441,word.lower():republicans,,,,,,,,,,,,,,,
+4.237,word.lower():secretary,,,,,,,,,,,,,,,

Weight?,Feature
+6.379,word.lower():last
+5.926,word.lower():month
+5.000,BOS
+4.957,word.lower():chairman
+4.845,word.lower():jordanian
+4.841,word.lower():columbia
+4.552,word.lower():hurricane
+4.454,word.lower():asian
+4.441,word.lower():republicans
+4.237,word.lower():secretary

Weight?,Feature
+3.906,word.lower():twitter
+3.549,word.lower():nevirapine
+3.366,word.lower():canal
+3.344,-1:word.lower():film
+3.264,word.lower():english
+3.070,-1:word.lower():engine
+2.878,+1:word.lower():enkhbayar
+2.571,word.lower():economics
+2.548,+1:word.lower():boots
+2.345,word.lower():hungarian

Weight?,Feature
+2.227,+1:word.lower():came
+2.104,word.lower():flowers
+2.050,word.lower():station
+1.866,+1:word.lower():gained
+1.745,word.lower():a
+1.709,+1:word.lower():airport
+1.706,word.lower():constitution
+1.671,word.lower():pound
+1.650,+1:word.lower():agreement
+1.621,word[-2:]:le

Weight?,Feature
+3.803,word.lower():olympic
+3.594,word.lower():ramadan
+3.190,word.lower():games
+3.162,-1:word.lower():war
+2.892,-1:word.lower():typhoon
+2.722,-1:word.lower():falklands
+2.615,-1:word.lower():first
+2.564,word[-3:]:mes
+2.503,word[-3:]:pic
+2.245,-1:word.lower():happy

Weight?,Feature
+2.847,word.lower():games
+2.551,word.lower():series
+2.129,+1:word.lower():finals
+2.120,+1:word.lower():rally
+1.908,word[-3:]:Day
+1.875,word[-3:]:mes
+1.803,word.lower():sabbath
+1.785,word[-3:]:ath
+1.760,+1:word.lower():now
+1.705,word.lower():day

Weight?,Feature
+5.368,word.lower():caribbean
+4.765,word.lower():beijing
+4.420,-1:word.lower():serb
+4.128,word.lower():israel
+4.124,word.lower():europe
+4.117,word.lower():mars
+4.058,-1:word.lower():hamas
+3.963,word.lower():balkans
+3.916,word.lower():london
+3.751,word.lower():martian

Weight?,Feature
+3.482,word.lower():island
+3.240,word.lower():city
+2.814,word.lower():east
+2.762,word.lower():republic
+2.708,word.lower():holiday
+2.606,+1:word.lower():possessions
+2.556,word.lower():airport
+2.364,-1:word.lower():hong
+2.326,+1:word.lower():regional
+2.295,-1:word.lower():christmas

Weight?,Feature
+6.194,word.lower():nepal
+6.066,word.lower():niger
+4.561,word.lower():afghan
+4.515,word.lower():gibraltar
+4.429,word.lower():jordan
+4.322,word.lower():turkish
+4.037,word.lower():croats
+4.005,word.lower():azerbaijan
+3.845,word.lower():senegal
+3.793,+1:word.lower():mayor

Weight?,Feature
+4.697,+1:word.lower():mayor
+2.767,-1:word.lower():democratic
+2.543,+1:word.lower():developed
+2.437,+1:word.lower():health
+2.401,+1:word.lower():man
+2.261,word.lower():cypriots
+2.166,-1:word.lower():bosnian
+1.838,+1:word.lower():under
+1.830,+1:word.lower():began
+1.806,word.lower():republic

Weight?,Feature
+4.462,word.lower():katrina
+4.343,word.lower():marburg
+2.968,word.lower():rita
+2.591,word_shape():XdXd
+2.591,short_word_shape():XdXd
+2.428,word[-3:]:ita
+2.272,word[-3:]:urg
+2.259,word.lower():leukemia
+2.141,word[-2:]:rg
+2.086,+1:word.lower():correctly

Weight?,Feature
+2.262,word.lower():rita
+2.253,word[-3:]:ita
+2.009,word[-2:]:ta
+1.818,-1:word.lower():type
+1.707,-1:word.lower():hurricanes
+1.667,word.lower():flu
+1.658,word[-2:]:lu
+1.421,+1:word.lower():slammed
+1.379,+1:word.lower():last
+1.379,-1:postag:NN

Weight?,Feature
+6.191,word.lower():philippine
+6.132,word.lower():hamas
+4.544,word.lower():university
+4.465,-1:word.lower():senator
+4.382,word.lower():taleban
+4.380,word.lower():al-qaida
+4.320,word.lower():european
+4.245,-1:word.lower():rice
+4.193,word.lower():hezbollah
+3.924,word.lower():congress

Weight?,Feature
+3.161,-1:word.lower():european
+3.049,+1:word.lower():reporter
+3.020,-1:short_word_shape():dx
+2.968,-1:word.lower():militant
+2.904,word.lower():times
+2.822,word.lower():singapore
+2.810,word.lower():coast
+2.810,-1:word.lower():associated
+2.745,+1:word.lower():hamas
+2.728,word.lower():airlines

Weight?,Feature
+6.026,word.lower():president
+5.872,word.lower():obama
+4.953,word.lower():prime
+4.484,word.lower():vice
+4.475,word.lower():senator
+4.337,word.lower():western
+4.267,word.lower():hall
+4.056,word.lower():greenspan
+3.875,word.lower():clinton
+3.813,word.lower():al-zarqawi

Weight?,Feature
+3.231,+1:word.lower():advisor
+2.776,word.lower():rice
+2.751,-1:word.lower():condoleezza
+2.519,-1:word_shape():xxxxxxxx
+2.429,word.lower():peter
+2.406,word.lower():gates
+2.377,-1:word.lower():richard
+2.361,+1:word.lower():hui
+2.298,word.lower():vice
+2.288,+1:word.lower():condoleezza

Weight?,Feature
+5.557,word.lower():january
+5.322,word[-3:]:Day
+5.162,word.lower():august
+4.970,+1:word.lower():week
+4.850,word.lower():weekend
+4.806,word.lower():february
+4.309,word[-3:]:day
+4.133,-1:word.lower():week
+4.015,word.lower():march
+3.981,+1:word.lower():year

Weight?,Feature
+3.282,+1:word.lower():early
+2.915,+1:word.lower():stocky
+2.811,-1:word.lower():or
+2.793,word.lower():morning
+2.778,word[-3:]:ber
+2.739,-1:word.lower():past
+2.716,word.lower():night
+2.671,+1:word.lower():old
+2.597,word.lower():evening
+2.569,+1:word.lower():ukrainian


In [27]:
# %%time
# # define fixed parameters and parameters to search
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     max_iterations=100,
#     all_possible_transitions=True
# )
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# # use the same metric for evaluation
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=labels)

# # search
# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X_train, y_train)

In [29]:
# crf = rs.best_estimator_
# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

## Sources used
- https://web.stanford.edu/~jurafsky/slp3/
- https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html
- https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html#training-data
- https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/