# Named Entity Recognition with CRF in Python

In [1]:
import pandas as pd
import numpy as np

# Source: https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/
# Dataset download: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/4
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")

### 데이터 전처리 및 문장화 

In [2]:
# Forward-fill missing cells from the previous row.
# NOTE(review): presumably only the first token of each sentence carries a
# "Sentence #" label in the raw CSV -- confirm against the file.
# DataFrame.ffill() is the replacement for fillna(method="ffill"), whose
# `method` argument is deprecated in recent pandas releases.
data = data.ffill()

In [3]:
# Check what kind of object `data` is.
type(data)

pandas.core.frame.DataFrame

In [4]:
# First 40 rows of the data.
data.head(40)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [5]:
# Last 10 rows of the data.
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [6]:
# Build a list of the unique values in the "Word" column.
# NOTE: going through set() does not preserve row order, so the order of
# `words` is arbitrary.
words = list(set(data["Word"].values))
n_words = len(words)
n_words

35178

In [7]:
# 데이터에서 문장을 원하는 형태로 포매팅하여 추출하는 클래스
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) tuples.

    Public attributes:
      n_sent    -- 1-based number of the sentence the next get_next() returns
      data      -- the DataFrame passed to the constructor
      empty     -- set to False in the constructor and never changed here
      grouped   -- result of the groupby/apply, looked up by "Sentence #" label
      sentences -- list of every sentence, in the order `grouped` yields them
    """

    def __init__(self, data):
        """Pre-compute every sentence found in `data`.

        `data` must provide the columns "Sentence #", "Word", "POS" and "Tag".
        """
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, POS, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # groupby sorts the group keys by default, so the resulting order is
        # lexicographic on the label string ("Sentence: 1", "Sentence: 10",
        # ...), not the order of appearance in the file.
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence labelled "Sentence: <n_sent>" and advance.

        Returns None, without advancing, when no sentence carries that label.
        Only the missing-label case is turned into None; any other error
        propagates instead of being silently swallowed.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s


In [8]:
# Extract all sentences (done in the SentenceGetter constructor).
getter = SentenceGetter(data)

In [9]:
# Fetch a single sentence.
sent = getter.get_next()

In [10]:
# Inspect the extracted sentence.
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [11]:
# All sentences.
sentences = getter.sentences

In [12]:
# Inspect one sentence.
sentences[58]

[('The', 'DT', 'O'),
 ('government', 'NN', 'O'),
 ('was', 'VBD', 'O'),
 ('forced', 'VBN', 'O'),
 ('to', 'TO', 'O'),
 ('ask', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('International', 'NNP', 'B-org'),
 ('Monetary', 'NNP', 'I-org'),
 ('Fund', 'NNP', 'I-org'),
 ('and', 'CC', 'O'),
 ('several', 'JJ', 'O'),
 ('countries', 'NNS', 'O'),
 ('for', 'IN', 'O'),
 ('a', 'DT', 'O'),
 ('multi-billion-dollar', 'JJ', 'O'),
 ('loan', 'NN', 'O'),
 ('.', '.', 'O')]

### CRF training을 위해 단어를 features로

In [13]:
# 단어를 features화
def word2features(sent, i):
    """Build the feature dict for the token at position `i` of `sent`.

    `sent` is a sequence of (word, POS tag, label) tuples; only the word and
    the POS tag are read. The dict holds the token's own features first, then
    the previous token's features (or 'BOS' when there is none), then the
    next token's features (or 'EOS' when there is none).
    """
    token, pos = sent[i][0], sent[i][1]

    # Features of the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context: mark the beginning of the sentence or describe the
    # previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context: mark the end of the sentence or describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of `sent`, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every (token, postag, label) triple."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

In [14]:
# X: per-sentence lists of token feature dicts; y: per-sentence lists of labels.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [15]:
# Sanity check: X, y and sentences should all contain the same number of items.
print(len(X))
print(len(y))
print(len(sentences))

47959
47959
47959


In [16]:
# Inspect one example: number of feature dicts and labels, then the raw
# sentence, its labels and its features.
print(len(X[10]))
print(len(y[10]))
print(sentences[10])
print(y[10])
print(X[10])

40
40
[('In', 'IN', 'O'), ('Beirut', 'NNP', 'B-geo'), (',', ',', 'O'), ('a', 'DT', 'O'), ('string', 'NN', 'O'), ('of', 'IN', 'O'), ('officials', 'NNS', 'O'), ('voiced', 'VBD', 'O'), ('their', 'PRP$', 'O'), ('anger', 'NN', 'O'), (',', ',', 'O'), ('while', 'IN', 'O'), ('at', 'IN', 'O'), ('the', 'DT', 'O'), ('United', 'NNP', 'B-org'), ('Nations', 'NNP', 'I-org'), ('summit', 'NN', 'O'), ('in', 'IN', 'O'), ('New', 'NNP', 'B-geo'), ('York', 'NNP', 'I-geo'), (',', ',', 'O'), ('Prime', 'NNP', 'B-per'), ('Minister', 'NNP', 'O'), ('Fouad', 'NNP', 'B-per'), ('Siniora', 'NNP', 'I-per'), ('said', 'VBD', 'O'), ('the', 'DT', 'O'), ('Lebanese', 'JJ', 'B-gpe'), ('people', 'NNS', 'O'), ('are', 'VBP', 'O'), ('resolute', 'JJ', 'O'), ('in', 'IN', 'O'), ('preventing', 'VBG', 'O'), ('such', 'JJ', 'O'), ('attempts', 'NNS', 'O'), ('from', 'IN', 'O'), ('destroying', 'VBG', 'O'), ('their', 'PRP$', 'O'), ('spirit', 'NN', 'O'), ('.', '.', 'O')]
['O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [17]:
# Inspect one sentence.
# It is a list whose elements are dictionaries, one per word, holding that word's features.
X[0]

[{'bias': 1.0,
  'word.lower()': 'thousands',
  'word[-3:]': 'nds',
  'word[-2:]': 'ds',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'NNS',
  'postag[:2]': 'NN',
  'BOS': True,
  '+1:word.lower()': 'of',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'IN',
  '+1:postag[:2]': 'IN'},
 {'bias': 1.0,
  'word.lower()': 'of',
  'word[-3:]': 'of',
  'word[-2:]': 'of',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'IN',
  'postag[:2]': 'IN',
  '-1:word.lower()': 'thousands',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:postag': 'NNS',
  '-1:postag[:2]': 'NN',
  '+1:word.lower()': 'demonstrators',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'NNS',
  '+1:postag[:2]': 'NN'},
 {'bias': 1.0,
  'word.lower()': 'demonstrators',
  'word[-3:]': 'ors',
  'word[-2:]': 'rs',
  'word.isupper()': False,
  'word.istitle()': False,
  'wor

### Conditional Random Fields 적용 

In [18]:
from sklearn_crfsuite import CRF

# Use Limited-memory Broyden–Fletcher–Goldfarb–Shanno (L-BFGS) instead of gradient descent
crf = CRF(algorithm='lbfgs',
          c1=0.1, # L1 regularization weight, to guard against overfitting
          c2=0.1, # L2 regularization weight, to guard against overfitting
          max_iterations=100,
          all_possible_transitions=False)

In [19]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [20]:
# Fit the CRF under 5-fold cross-validation and collect the predictions.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)



In [21]:
# Predicted labels for the first sentence.
pred[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O']

In [22]:
# Compare the predicted labels with the ground-truth labels.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)



              precision    recall  f1-score   support

       B-art       0.37      0.11      0.17       402
       B-eve       0.52      0.35      0.42       308
       B-geo       0.85      0.90      0.88     37644
       B-gpe       0.97      0.94      0.95     15870
       B-nat       0.66      0.37      0.47       201
       B-org       0.78      0.72      0.75     20143
       B-per       0.84      0.81      0.82     16990
       B-tim       0.93      0.88      0.90     20333
       I-art       0.11      0.03      0.04       297
       I-eve       0.34      0.21      0.26       253
       I-geo       0.82      0.79      0.80      7414
       I-gpe       0.92      0.55      0.69       198
       I-nat       0.61      0.27      0.38        51
       I-org       0.81      0.79      0.80     16784
       I-per       0.84      0.89      0.87     17251
       I-tim       0.83      0.76      0.80      6528
           O       0.99      0.99      0.99    887908

    accuracy              

In [None]:
# Number of sentences and 70% of that count.
# NOTE(review): presumably the basis for the hard-coded 33570 split index used
# in the following cells -- confirm.
print(len(X), len(X)*0.7)
print(type(X))

In [None]:
# Training: fit on the first 33570 sentences.
crf.fit(X[:33570], y[:33570])

In [None]:
# Prediction on the remaining sentences (index 33570 onward).
y_test_pred = crf.predict(X[33570:])

In [None]:
# Ground-truth labels of the first held-out sentence.
y[33570]

In [None]:
# Predicted labels of the first held-out sentence.
y_test_pred[0]

In [None]:
# Compare the predicted labels with the ground-truth labels on the held-out part.
report = flat_classification_report(y_pred=y_test_pred, y_true=y[33570:])
print(report)

In [None]:
# Show the transitions between tags and the features that matter most for
# predicting each tag (top 30).
import eli5
eli5.show_weights(crf, top=30)

In [None]:
# The results above show that too many features are being used.
# Retrain with a larger L1 regularization weight, which can reduce the influence of rarely occurring features.

crf = CRF(algorithm='lbfgs',
          c1=10,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [None]:
# 5-fold cross-validated predictions with the current `crf`.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [None]:
# Classification report for the cross-validated predictions.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

In [None]:
# Training: fit on the first 33570 sentences.
crf.fit(X[:33570], y[:33570])

In [None]:
# Prediction on the remaining sentences (index 33570 onward).
y_test_pred = crf.predict(X[33570:])

In [None]:
# Compare the predicted labels with the ground-truth labels on the held-out part.
report = flat_classification_report(y_pred=y_test_pred, y_true=y[33570:])
print(report)

In [None]:
# Show the transitions between tags and the features that matter most for
# predicting each tag (top 30).
eli5.show_weights(crf, top=30)