In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import sklearn
import sklearn_crfsuite

from sklearn_crfsuite import CRF, scorers, metrics
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import make_scorer
from itertools import chain

data = pd.read_csv('ner_dataset.csv', encoding='latin1')
data = data.fillna(method='ffill')

In [2]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                            s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
  
    def get_next(self):     
        try:
             s = self.grouped["Sentence: {}".format(self.n_sent)]
             self.n_sent += 1
             return s
        except:
             return None                 

In [3]:
getter = SentenceGetter(data)
sentences = getter.sentences

In [4]:
%%time
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.is_upper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2]
    }
    
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag[:2]
        })
    else:
        features['BOS'] = True
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2]
        })
    else:
        features['EOS'] = True
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2token(sent):
    return [token for token, postag, label in sent]

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Wall time: 4.06 s


In [5]:
%%time
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)


Wall time: 3min 41s


In [6]:
# tagger = crf.tagger_
# y_pred = [tagger.tag(xseq) for xseq in X_test]
i = 0
y_pred = crf.predict(X_test)

for x,y in zip(y_pred[i], [x['word.lower()'] for x in X_test[i]]):
   print("{0} ({1})".format(y, x))

he (O)
made (O)
an (O)
appointment (O)
with (O)
a (O)
plastic (O)
surgeon (O)
because (O)
he (O)
was (O)
so (O)
sensitive (O)
about (O)
his (O)
looks (O)
. (O)


In [7]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
#predictions = np.array([labels[tag] for row in y_pred for tag in row])
#truths = np.array([labels[tag] for row in y_test for tag in row])

labels = crf.classes_

y_predict = MultiLabelBinarizer().fit_transform(y_pred)
y_testing = MultiLabelBinarizer().fit_transform(y_test)
print(y_predict.shape)
print(y_testing.shape)
# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )
print(accuracy_score(y_testing, y_predict))
print(classification_report(
   y_testing, y_predict, 
    target_names=labels, digits=3))


(9592, 17)
(9592, 17)
0.7538573811509591
              precision    recall  f1-score   support

           O      0.375     0.164     0.229        73
       B-per      0.556     0.426     0.482        47
       B-tim      0.971     0.957     0.964      2432
       B-gpe      0.913     0.951     0.932      4950
       I-tim      0.739     0.515     0.607        33
       B-mal      0.851     0.816     0.833      3160
       B-org      0.903     0.907     0.905      2661
       I-mal      0.969     0.952     0.960      3468
       I-org      0.214     0.075     0.111        40
       I-per      0.296     0.276     0.286        29
       B-nat      0.913     0.525     0.667        40
       B-eve      0.841     0.832     0.837      1221
       I-eve      0.667     0.364     0.471        11
       I-gpe      0.836     0.829     0.833      1677
       B-art      0.901     0.944     0.922      2103
       I-art      0.858     0.838     0.848       859
       I-nat      1.000     1.000     1.