In [1]:
import pandas as pd
import numpy as np

# 한국어 NER data : https://github.com/machinereading/KoreanNERCorpus ==> train.txt, 

# 데이터 전처리 및 문장화 

In [2]:
data_sents = []
data = []
sent_no = 0
for line in open('data-ko/train.txt', 'r', encoding = 'utf8'):
    if line[0] == ';' :
        data_sents.append(line[1:].split('\n')[0])
        sent_no += 1
    elif line[0] != '$':  
        row = line.split()
        if len(row) :
            row[0] = 'Sentence: ' + str(sent_no) # sentence number 보정해주기 위한 값
            data.append(row)

In [3]:
len(data_sents)

3555

In [4]:
data_sents[0]

' 한편, AFC챔피언스리그 E조에 속한 포항 역시 대회 8강 진출이 불투명하다 .'

In [5]:
data[0]

['Sentence: 1', '한편', 'NNG', 'O']

In [6]:
len(data)

121009

In [7]:
data = pd.DataFrame(data)

In [8]:
data.columns = ['Sentence #', 'Word', 'POS', 'Tag']

In [9]:
data.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,한편,NNG,O
1,Sentence: 1,",",SP,O
2,Sentence: 1,AFC,SL,O
3,Sentence: 1,챔피언스,NNG,O
4,Sentence: 1,리그,NNG,O
5,Sentence: 1,E,SL,B_OG
6,Sentence: 1,조,NNG,I
7,Sentence: 1,에,JKB,O
8,Sentence: 1,속하,VV,O
9,Sentence: 1,ㄴ,ETM,O


In [10]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
120999,Sentence: 3555,3,SN,O
121000,Sentence: 3555,.,SF,O
121001,Sentence: 3555,85,SN,O
121002,Sentence: 3555,%,SW,O
121003,Sentence: 3555,씩,XSN,O
121004,Sentence: 3555,하락,NNG,O
121005,Sentence: 3555,하,XSV,O
121006,Sentence: 3555,았,EP,O
121007,Sentence: 3555,다,EF,O
121008,Sentence: 3555,.,SF,O


In [11]:
words = list(set(data['Word'].values))
n_words = len(words)
n_words

13859

In [12]:
import sys

class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            print(sys.exc_info())
            return None


In [13]:
getter = SentenceGetter(data)

In [14]:
sent = getter.get_next()

In [15]:
print(sent)

[('한편', 'NNG', 'O'), (',', 'SP', 'O'), ('AFC', 'SL', 'O'), ('챔피언스', 'NNG', 'O'), ('리그', 'NNG', 'O'), ('E', 'SL', 'B_OG'), ('조', 'NNG', 'I'), ('에', 'JKB', 'O'), ('속하', 'VV', 'O'), ('ㄴ', 'ETM', 'O'), ('포항', 'NNP', 'O'), ('역시', 'MAJ', 'O'), ('대회', 'NNG', 'O'), ('8강', 'NNG', 'O'), ('진출', 'NNG', 'O'), ('이', 'JKS', 'O'), ('불투명', 'NNG', 'O'), ('하', 'VV', 'O'), ('다', 'EC', 'O'), ('.', 'SF', 'O')]


In [16]:
sentences = getter.sentences

In [17]:
sentences[50]

[('비', 'NNG', 'O'),
 ('가', 'JKS', 'O'),
 ('오', 'VV', 'O'),
 ('ㄹ', 'ETM', 'O'),
 ('경우', 'NNG', 'O'),
 ('당일', 'NNG', 'B_DT'),
 ('아침', 'NNG', 'B_TI'),
 ('7', 'SN', 'I'),
 ('시', 'NNB', 'I'),
 ('현재', 'NNG', 'O')]

# CRF training을 위해 단어를 features

In [173]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word': word,
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word': word1,
            '-1:word.isdigit()': word1.isdigit(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word': word1,
            '+1:word.isdigit()': word1.isdigit(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [174]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [175]:
print(len(X))
print(len(y))
print(len(sentences))

3555
3555
3555


In [176]:
print(len(X[10]))
print(len(y[10]))
print(sentences[10])
print(y[10])
print(X[10])

41
41
[('자연', 'NNG', 'O'), ('휴양림', 'NNG', 'O'), ('의', 'JKG', 'O'), ('숲속', 'NNG', 'O'), ('의', 'JKG', 'O'), ('집', 'NNG', 'O'), ('에서', 'JKB', 'O'), ('찻길', 'NNG', 'O'), ('을', 'JKO', 'O'), ('따르', 'VV', 'O'), ('아', 'EC', 'O'), ('오르', 'VV', 'O'), ('다', 'EC', 'O'), ('오른쪽', 'NNG', 'O'), ('숲속', 'NNG', 'O'), ('의', 'JKG', 'O'), ('정자각', 'NNG', 'O'), ('바로', 'MAG', 'O'), ('옆', 'NNG', 'O'), ('계곡', 'NNG', 'O'), ('으로', 'JKB', 'O'), ('내려가', 'VV', 'O'), ('면', 'EC', 'O'), ('적가리', 'NNP', 'B_LC'), ('골', 'NNG', 'I'), ('최고', 'NNG', 'O'), ('의', 'JKG', 'O'), ('경관', 'NNG', 'O'), ('지', 'NNG', 'O'), ('이', 'VCP', 'O'), ('ㄴ', 'ETM', 'O'), ('이', 'NNP', 'B_LC'), ('폭포', 'NNG', 'I'), ('와', 'JC', 'O'), ('저', 'XPN', 'B_LC'), ('폭포', 'NNG', 'I'), ('중간', 'NNG', 'O'), ('으로', 'JKB', 'O'), ('나서', 'VV', 'O'), ('ㄴ다', 'EC', 'O'), ('.', 'SF', 'O')]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_LC', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'B_LC', 'I', 'O', 'B_LC', 'I

In [177]:
# 문장 하나 확인
# 각 단어의 features가 dictionary로 표현되고 이를 요소로 하는 리스트
X[0]

[{'bias': 1.0,
  'word': '한편',
  'word.isdigit()': False,
  'postag': 'NNG',
  'postag[:2]': 'NN',
  'BOS': True,
  '+1:word': ',',
  '+1:word.isdigit()': False,
  '+1:postag': 'SP',
  '+1:postag[:2]': 'SP'},
 {'bias': 1.0,
  'word': ',',
  'word.isdigit()': False,
  'postag': 'SP',
  'postag[:2]': 'SP',
  '-1:word': '한편',
  '-1:word.isdigit()': False,
  '-1:postag': 'NNG',
  '-1:postag[:2]': 'NN',
  '+1:word': 'AFC',
  '+1:word.isdigit()': False,
  '+1:postag': 'SL',
  '+1:postag[:2]': 'SL'},
 {'bias': 1.0,
  'word': 'AFC',
  'word.isdigit()': False,
  'postag': 'SL',
  'postag[:2]': 'SL',
  '-1:word': ',',
  '-1:word.isdigit()': False,
  '-1:postag': 'SP',
  '-1:postag[:2]': 'SP',
  '+1:word': '챔피언스',
  '+1:word.isdigit()': False,
  '+1:postag': 'NNG',
  '+1:postag[:2]': 'NN'},
 {'bias': 1.0,
  'word': '챔피언스',
  'word.isdigit()': False,
  'postag': 'NNG',
  'postag[:2]': 'NN',
  '-1:word': 'AFC',
  '-1:word.isdigit()': False,
  '-1:postag': 'SL',
  '-1:postag[:2]': 'SL',
  '+1:word':

# Conditional Random Fields 적용 

In [178]:
from sklearn_crfsuite import CRF

# Graident Descent 대신 Limited-memory Broyden–Fletcher–Goldfarb–Shanno (LBFGS) 사용
crf = CRF(algorithm='lbfgs',
          c1=0.1, # overfitting을 방지하기 위한 L1 regularization(정규화) 가중치
          c2=0.1, # overfitting을 방지하기 위한 L2 regularization 가중치
          max_iterations=100,
          all_possible_transitions=False)

In [179]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [180]:
# Cross validation 을 적용하여 CRF 학습 
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [181]:
pred[0]

['O',
 'O',
 'B_OG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_OG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [182]:
# predicted labels와 ground truth labels 비교
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

        B_DT       0.95      0.82      0.88      2135
        B_LC       0.71      0.62      0.66      2125
        B_OG       0.76      0.48      0.59      3267
        B_PS       0.68      0.75      0.71      2347
        B_TI       0.94      0.74      0.83       332
           I       0.84      0.67      0.75      7130
           O       0.96      0.99      0.97    103673

    accuracy                           0.94    121009
   macro avg       0.83      0.72      0.77    121009
weighted avg       0.94      0.94      0.94    121009



In [183]:
print(len(X), len(X)*0.7)
print(type(X))

3555 2488.5
<class 'list'>


In [184]:
# Training
crf.fit(X[:2488], y[:2488])

CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [185]:
# prediction
y_test_pred = crf.predict(X[2488:])

In [186]:
y[2488]
y_test_pred[0]

['B_PS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_DT',
 'I',
 'O',
 'O',
 'O',
 'B_DT',
 'I',
 'I',
 'I',
 'O',
 'B_DT',
 'I',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_LC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_DT',
 'I',
 'O',
 'O',
 'O',
 'O']

In [187]:
# predicted labels와 ground truth labels 비교
report = flat_classification_report(y_pred=y_test_pred, y_true=y[2488:])
print(report)

              precision    recall  f1-score   support

        B_DT       0.94      0.75      0.83       648
        B_LC       0.64      0.49      0.55       627
        B_OG       0.78      0.52      0.62       993
        B_PS       0.60      0.78      0.68       603
        B_TI       0.92      0.64      0.76       123
           I       0.82      0.69      0.75      2172
           O       0.96      0.99      0.97     27203

    accuracy                           0.93     32369
   macro avg       0.81      0.69      0.74     32369
weighted avg       0.93      0.93      0.93     32369



In [188]:
# 태그 간의 transition(전이) probabilities, 태그 별 예측에 중요한 features
import eli5
eli5.show_weights(crf, top=30)

From \ To,B_DT,B_LC,B_OG,B_PS,B_TI,I,O
B_DT,-2.23,-1.154,-1.541,-2.088,0.158,4.015,-0.718
B_LC,-2.602,1.099,-0.868,-3.018,0.0,3.598,-0.731
B_OG,-1.671,-2.039,-1.346,-0.828,0.0,5.732,-1.077
B_PS,-1.452,-1.73,-1.01,-1.871,0.0,4.342,-0.261
B_TI,0.0,-0.169,-0.509,-0.253,-1.336,4.387,-0.836
I,-2.174,-1.128,-1.567,-1.167,-0.379,4.476,-1.222
O,1.837,1.342,1.09,1.821,1.389,0.0,3.587

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+8.103,word:지난해,,,,,
+6.710,word:여름,,,,,
+6.274,word:전날,,,,,
+6.234,word:발렌타인데이,,,,,
+6.226,word:오늘,,,,,
+6.053,word:지난달,,,,,
+6.012,word:어제,,,,,
+5.640,word:이틀,,,,,
+5.479,word:전년,,,,,
+5.434,word:하루,,,,,

Weight?,Feature
+8.103,word:지난해
+6.710,word:여름
+6.274,word:전날
+6.234,word:발렌타인데이
+6.226,word:오늘
+6.053,word:지난달
+6.012,word:어제
+5.640,word:이틀
+5.479,word:전년
+5.434,word:하루

Weight?,Feature
+6.197,word:성내
+4.692,word:아이티
+4.519,word:미국
+4.426,word:오산
+4.399,word:중국
+4.273,word:백운
+4.219,+1:word:마을
+4.192,word:일본
+4.079,word:프랑스
+3.997,word:화성

Weight?,Feature
+5.751,word:민주당
+5.677,word:사법부
+5.343,word:한나라당
+5.308,word:도의회
+5.233,word:검찰
+4.968,word:시의회
+4.964,word:국세청
+4.862,word:국회
+4.677,word:정부
+4.643,word:코스닥

Weight?,Feature
+4.562,+1:word:스님
+4.335,-1:word:듀오
+4.088,word:재범
+3.894,word:러블리
+3.891,word:드라큘라
+3.850,+1:word:PM
+3.816,word:선미
+3.815,word:풀잎
+3.745,+1:word:고문
+3.655,word:신데렐라

Weight?,Feature
+7.510,word:오전
+7.435,word:오후
+6.573,word:저녁
+5.931,+1:word:시간
+5.406,word:전반
+5.397,word:새벽
+5.006,word:밤
+4.957,word:낮
+4.922,word:아침
+4.662,word:후반

Weight?,Feature
+4.898,word:부터
+4.315,word:까지
+3.727,word:협회
+3.626,word:개월
+3.564,word:세기
+3.495,word:년도
+3.441,word:주
+3.362,word:교육청
+3.305,word:상반기
+3.278,word:폭포

Weight?,Feature
+4.479,word:관계자
+3.887,word:등
+3.739,+1:word:참전
+3.667,+1:word:언니
+3.596,+1:word:구장
+3.501,+1:word:올림픽
+3.464,word:한국전쟁
+3.448,+1:word:감
+3.432,word:숭례문
+3.402,EOS


In [210]:
# 위의 결과를 보면 너무 많은 features들이 사용되었다. 
# 드물게 발생하는 features의 영향력을 줄일 수 있는 L1 regularization의 가중치를 높여서 학습

crf = CRF(algorithm='lbfgs',
          c1=5,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [211]:
# cross validation
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [212]:
# report
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

        B_DT       0.93      0.70      0.80      2135
        B_LC       0.55      0.50      0.53      2125
        B_OG       0.66      0.25      0.36      3267
        B_PS       0.50      0.63      0.56      2347
        B_TI       0.85      0.43      0.57       332
           I       0.82      0.51      0.63      7130
           O       0.94      0.99      0.97    103673

    accuracy                           0.92    121009
   macro avg       0.75      0.57      0.63    121009
weighted avg       0.91      0.92      0.91    121009



In [213]:
# Training
crf.fit(X[:2488], y[:2488])

CRF(algorithm='lbfgs', all_possible_transitions=False, c1=5, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [214]:
# prediction
y_test_pred = crf.predict(X[2488:])

In [215]:
# predicted labels와 ground truth labels 비교
report = flat_classification_report(y_pred=y_test_pred, y_true=y[2488:])
print(report)

              precision    recall  f1-score   support

        B_DT       0.91      0.64      0.75       648
        B_LC       0.47      0.37      0.41       627
        B_OG       0.69      0.23      0.34       993
        B_PS       0.43      0.74      0.54       603
        B_TI       0.87      0.21      0.34       123
           I       0.82      0.53      0.65      2172
           O       0.94      0.99      0.96     27203

    accuracy                           0.91     32369
   macro avg       0.73      0.53      0.57     32369
weighted avg       0.90      0.91      0.90     32369



In [216]:
# 태그 간의 transition(전이) probabilities, 태그 별 예측에 중요한 features
eli5.show_weights(crf, top=30)

From \ To,B_DT,B_LC,B_OG,B_PS,B_TI,I,O
B_DT,-0.621,-0.718,-0.823,-1.221,0.0,3.766,-0.645
B_LC,0.0,1.452,-0.052,-1.752,0.0,3.778,-0.353
B_OG,0.0,-1.103,-0.14,0.0,0.0,5.79,-0.196
B_PS,0.0,-1.757,-0.673,-1.219,0.0,3.848,0.006
B_TI,0.0,0.0,0.0,0.0,0.0,4.127,-0.738
I,-1.768,-1.346,-1.671,-1.218,-0.695,3.589,-1.108
O,2.371,1.097,1.187,2.018,0.729,0.0,3.458

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+5.907,word:지난해,,,,,
+5.858,word:오늘,,,,,
+4.605,word:올해,,,,,
+4.495,word:설,,,,,
+4.128,word:지나,,,,,
+3.932,word:전년,,,,,
+3.901,word:하루,,,,,
+3.862,word:2010,,,,,
+3.797,word:전날,,,,,
+3.684,word:겨울,,,,,

Weight?,Feature
+5.907,word:지난해
+5.858,word:오늘
+4.605,word:올해
+4.495,word:설
+4.128,word:지나
+3.932,word:전년
+3.901,word:하루
+3.862,word:2010
+3.797,word:전날
+3.684,word:겨울

Weight?,Feature
+4.662,word:성내
+3.070,word:미국
+2.362,word:일본
+2.360,word:중국
+2.220,+1:word:지역
+2.198,word:한국
+2.062,+1:word:마을
+2.056,postag:NNP
+2.017,+1:word:권
+1.825,word:제주

Weight?,Feature
+3.614,word:정부
+3.588,word:국회
+3.439,word:민주당
+3.015,word:한나라당
+2.882,word:경찰
+2.597,word:친
+1.744,word:롯데
+1.724,word:삼성
+1.669,word:유엔
+1.656,word:LG

Weight?,Feature
+3.151,+1:word:의원
+2.803,+1:word:모
+2.647,+1:word:씨
+2.557,+1:word:대통령
+2.251,postag:NNP
+2.201,-1:word:그룹
+1.937,word:정
+1.879,word:김
+1.493,+1:word:전
+1.489,+1:word:감독

Weight?,Feature
+5.775,word:오전
+5.173,word:오후
+3.950,word:전반
+3.816,word:밤
+3.734,word:아침
+3.693,+1:word:시간
+2.898,word:낮
+2.538,+1:word::
+2.346,word:저녁
+2.308,+1:word:분

Weight?,Feature
+3.616,word:부터
+3.461,word:까지
+2.475,word:위원회
+2.372,word:년
+2.369,word:주
+2.049,-1:word::
+2.046,word:초
+1.962,word:분기
+1.851,word:월
+1.821,word:시즌

Weight?,Feature
+3.709,EOS
+3.004,postag[:2]:JK
+2.536,word:관계자
+2.038,"word:,"
+1.909,word:등
+1.853,+1:postag[:2]:XS
+1.629,word:은
+1.440,-1:postag:NNP
+1.440,bias
+1.253,word:는
