In [85]:
import pickle

# Raw string literal: '\c' is not a valid escape sequence, so the plain
# literal only kept its backslash by accident and triggers an invalid-escape
# warning on newer Python versions. The resulting path is unchanged.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(r'C:\conll2002', 'rb') as f:
    train, test = pickle.load(f)

In [86]:
# Print the first training sentence, one (token, POS tag, label) triple per line.
# Its tokens are: Melbourne ( Australia ) , 25 may ( EFE ) .
for triple in train[0]:
    print(triple)

('Melbourne', 'NP', 'B-LOC')
('(', 'Fpa', 'O')
('Australia', 'NP', 'B-LOC')
(')', 'Fpt', 'O')
(',', 'Fc', 'O')
('25', 'Z', 'O')
('may', 'NC', 'O')
('(', 'Fpa', 'O')
('EFE', 'NC', 'B-ORG')
(')', 'Fpt', 'O')
('.', 'Fp', 'O')


In [87]:
def word2features(sentence, i):
    """Build the feature dict for the token at position ``i``.

    Args:
        sentence: sequence of items whose element 0 is the token string and
            element 1 is its POS tag (e.g. ``('Melbourne', 'NP', 'B-LOC')``).
        i: index of the token to describe.

    Returns:
        dict with the lower-cased token, three casing/digit flags and the POS
        tag. ``'BOS'`` is added (as ``True``) for the first token and
        ``'EOS'`` for the last one; the keys are absent otherwise.
    """
    token, pos_tag = sentence[i][0], sentence[i][1]

    features = {
        'word_lower': token.lower(),
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
        'postag': pos_tag
    }

    # Sentence-boundary markers are only present when they apply.
    boundary_flags = (
        ('BOS', i == 0),
        ('EOS', i == len(sentence) - 1),
    )
    for flag_name, applies in boundary_flags:
        if applies:
            features[flag_name] = True

    return features

def sentence2features(sentence):
    """Return the list of ``word2features`` results for every position of ``sentence``."""
    per_token = []
    for position in range(len(sentence)):
        per_token.append(word2features(sentence, position))
    return per_token

def sentence2labels(sentence):
    """Return the third element (the label) of every 3-item entry in ``sentence``."""
    labels = []
    for _token, _postag, label in sentence:
        labels.append(label)
    return labels

def sentence2tokens(sentence):
    """Return the first element (the token) of every 3-item entry in ``sentence``."""
    tokens = []
    for token, _postag, _label in sentence:
        tokens.append(token)
    return tokens

In [88]:
# Display the feature dict of the first token of the first training sentence.
sentence2features(train[0])[0]

{'BOS': True,
 'is_digit': False,
 'is_title': True,
 'is_upper': False,
 'postag': 'NP',
 'word_lower': 'melbourne'}

In [89]:
# One feature sequence and one label sequence per sentence, for both splits.
X_train = list(map(sentence2features, train))
y_train = list(map(sentence2labels, train))
X_test = list(map(sentence2features, test))
y_test = list(map(sentence2labels, test))

In [90]:
# pip install sklearn-crfsuite

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [91]:
# Create a CRF with max_iterations=100 and fit it on the training
# feature sequences and their label sequences.
crf = sklearn_crfsuite.CRF(max_iterations=100)
crf.fit(X_train, y_train)

CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
  averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [92]:
# Labels passed to the metrics below; the 'O' label seen in the data is not listed.
labels = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-MISC']

In [93]:
# Predict label sequences for the test set and compute the weighted F1
# score over the entries of `labels`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.71176088970234686

In [94]:
# Per-label precision / recall / F1 report for `labels`, printed with 3 digits.
print( metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3) )

             precision    recall  f1-score   support

      B-LOC      0.681     0.702     0.691      1084
      I-LOC      0.644     0.557     0.597       325
      B-ORG      0.805     0.783     0.794      1400
      I-ORG      0.723     0.772     0.747      1104
      B-PER      0.767     0.766     0.767       735
      I-PER      0.801     0.932     0.862       634
     B-MISC      0.724     0.363     0.483       339
     I-MISC      0.494     0.395     0.439       557

avg / total      0.723     0.710     0.712      6178



# 문제
* 한국어 띄어쓰기 구현하기

In [295]:
import re
import codecs

def corpus2sent(path, encoding=None):
    """Read a ``word/tag`` corpus file into a list of sentences.

    The file is split on newlines; each line is split on single spaces into
    tokens, and each token is split on ``'/'``. Tokens that do not split into
    exactly two parts, or whose word or tag is empty, are skipped.

    Args:
        path: path of the corpus file.
        encoding: text encoding passed to ``open``. The default ``None``
            keeps the previous behaviour (platform default encoding).

    Returns:
        A list with one entry per line of the file (including empty lines,
        which yield ``[]``). Each entry is a list of ``[word, tag]`` lists.
    """
    # `with` closes the handle; the original left it open.
    with open(path, encoding=encoding) as corpus_file:
        corpus = corpus_file.read()

    # The original bound this to `raws` but iterated over `rows` (NameError).
    rows = corpus.split('\n')

    sentences = []
    for row in rows:
        sentence = []
        for token in row.split(' '):
            try:
                word, tag = token.split('/')
            except ValueError:
                # Not exactly one '/' in the token: skip it (best effort).
                continue
            if word and tag:
                # Commas are stripped because the feature strings built later
                # in this file use ',' as their key/value separator. A token
                # that is only a comma therefore becomes an empty word.
                word = word.replace(",", "")
                sentence.append([word, tag])
        sentences.append(sentence)
    return sentences


In [227]:
def index2feature(sent, i, offset):
    """Encode the word at ``sent[i + offset]`` as a ``'<key>,<word>'`` string.

    Each element of ``sent`` must unpack into exactly two values
    (word, tag); only the word is used. The key is ``'word'`` for offset 0,
    and the offset followed by ``'word'`` otherwise, with an explicit ``'+'``
    in front of positive offsets (``'-1word'``, ``'+2word'``).
    """
    word, _tag = sent[i + offset]
    if offset == 0:
        return 'word,{}'.format(word)
    # Negative offsets already print their own '-' sign.
    sign = '' if offset < 0 else '+'
    return '{}{}word,{}'.format(sign, offset, word)

def word2features(sent, i):
    """Collect the ``index2feature`` strings for a window around position ``i``.

    Offsets -2, -1, 0, +1, +2 are considered in that order; an offset is
    included only when its guard on ``i`` and ``len(sent)`` holds, and
    offset 0 is always included.

    Returns:
        list of the strings returned by ``index2feature``.
    """
    length = len(sent)
    # Unpacking is kept so an entry that is not a (word, tag) pair still fails here.
    _word, _tag = sent[i]

    guarded_offsets = (
        (i > 1, -2),
        (i > 0, -1),
        (True, 0),
        (i < length - 1, 1),
        (i < length - 2, 2),
    )
    return [
        index2feature(sent, i, offset)
        for allowed, offset in guarded_offsets
        if allowed
    ]

def sent2words(sent):
    """Return the first element (the word) of every 2-item entry in ``sent``."""
    words = []
    for word, _tag in sent:
        words.append(word)
    return words

def sent2tags(sent):
    """Return the second element (the tag) of every 2-item entry in ``sent``."""
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags

def sent2features(sent):
    """Return the ``word2features`` result for every position of ``sent``."""
    per_position = []
    for position in range(len(sent)):
        per_position.append(word2features(sent, position))
    return per_position

In [228]:
# Parse the two corpus files with corpus2sent. This rebinds `train` and
# `test`, replacing the values unpickled near the top of the file.
# NOTE(review): hard-coded absolute paths; consider a configurable data dir.
train = corpus2sent("C:/train_corp.txt")
test = corpus2sent("C:/test_corp.txt")

In [229]:
# Feature-string sequences and tag sequences, one per sentence, for both splits.
train_x = list(map(sent2features, train))
train_y = list(map(sent2tags, train))
test_x = list(map(sent2features, test))
test_y = list(map(sent2tags, test))

In [230]:
import copy
# Split every "key,value" feature string into a [key, value] list.
# The original used copy.copy (a shallow copy) and assigned through it, which
# overwrote the strings inside train_x / test_x as well; re-running the cell
# then failed because lists have no .split(). Building fresh nested lists
# leaves train_x / test_x untouched and makes the cell re-runnable.
train_x2 = [
    [[feature.split(',') for feature in token_features] for token_features in sent]
    for sent in train_x
]
test_x2 = [
    [[feature.split(',') for feature in token_features] for token_features in sent]
    for sent in test_x
]

# X_train / X_test get their own sentence-level lists (same shape as
# train_x2 / test_x2), so assigning into them later does not modify
# train_x2 / test_x2.
X_train = [list(sent) for sent in train_x2]
X_test = [list(sent) for sent in test_x2]
                       

In [231]:
# Turn each token's list of [key, value] pairs into a feature dict.
# New lists are built instead of assigning into X_train / X_test element by
# element: those lists may share their inner lists with train_x2 / test_x2,
# so in-place assignment would silently rewrite them too.
X_train = [[dict(pairs) for pairs in sent] for sent in train_x2]
X_test = [[dict(pairs) for pairs in sent] for sent in test_x2]

## 1번

In [294]:
# Print the per-character feature dicts of the first test sentence.
print (X_test[0])

[{'word': '여', '+1word': '야', '+2word': '는'}, {'-1word': '여', 'word': '야', '+1word': '는', '+2word': '1'}, {'-2word': '여', '-1word': '야', 'word': '는', '+1word': '1', '+2word': '7'}, {'-2word': '야', '-1word': '는', 'word': '1', '+1word': '7', '+2word': '일'}, {'-2word': '는', '-1word': '1', 'word': '7', '+1word': '일', '+2word': '헌'}, {'-2word': '1', '-1word': '7', 'word': '일', '+1word': '헌', '+2word': '법'}, {'-2word': '7', '-1word': '일', 'word': '헌', '+1word': '법', '+2word': '재'}, {'-2word': '일', '-1word': '헌', 'word': '법', '+1word': '재', '+2word': '판'}, {'-2word': '헌', '-1word': '법', 'word': '재', '+1word': '판', '+2word': '소'}, {'-2word': '법', '-1word': '재', 'word': '판', '+1word': '소', '+2word': '의'}, {'-2word': '재', '-1word': '판', 'word': '소', '+1word': '의', '+2word': '위'}, {'-2word': '판', '-1word': '소', 'word': '의', '+1word': '위', '+2word': '헌'}, {'-2word': '소', '-1word': '의', 'word': '위', '+1word': '헌', '+2word': '결'}, {'-2word': '의', '-1word': '위', 'word': '헌', '+1word': '결', '+2word': 

In [120]:
# pip install sklearn-crfsuite

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [232]:
# Create a new CRF with max_iterations=100 and fit it on the feature dicts
# and tag sequences. This rebinds `crf`, replacing the model fitted earlier
# in the file.
crf = sklearn_crfsuite.CRF(max_iterations=100)
crf.fit(X_train, train_y)

CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
  averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

## 2번

In [235]:
# Predict tag sequences for the test set and compute the weighted F1 score
# over the 'B' and 'I' tags.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(test_y, y_pred, average='weighted', labels=['B','I'])

0.9253395852949543

## 3번

In [293]:
# NOTE: auto_space is defined in a cell that appears further down in this
# file; that cell has to be executed before this one.
result = auto_space("자질이부족합니다")
print (result)

result = auto_space("띄어쓰기가완벽하지는않네요")
print (result)

자질이 부족합니다
띄어 쓰기가 완벽하지는 않네요


In [289]:
def index2feature(sent, i, offset):
    """Encode the element at ``sent[i + offset]`` as a ``'<key>,<element>'`` string.

    Unlike the earlier definition in this file, the element is used as-is
    (no word/tag unpacking). The key is ``'word'`` for offset 0, and the
    offset followed by ``'word'`` otherwise, with an explicit ``'+'`` in
    front of positive offsets (``'-1word'``, ``'+2word'``).
    """
    word = sent[i + offset]
    if offset == 0:
        return 'word,{}'.format(word)
    # Negative offsets already print their own '-' sign.
    sign = '' if offset < 0 else '+'
    return '{}{}word,{}'.format(sign, offset, word)

def word2features(sent, i):
    """Collect the ``index2feature`` strings for a window around position ``i``.

    Offsets -2, -1, 0, +1, +2 are considered in that order; an offset is
    included only when its guard on ``i`` and ``len(sent)`` holds, and
    offset 0 is always included.

    Returns:
        list of the strings returned by ``index2feature``.
    """
    length = len(sent)
    # Indexing is kept so an out-of-range ``i`` still fails here.
    _current = sent[i]

    guarded_offsets = (
        (i > 1, -2),
        (i > 0, -1),
        (True, 0),
        (i < length - 1, 1),
        (i < length - 2, 2),
    )
    return [
        index2feature(sent, i, offset)
        for allowed, offset in guarded_offsets
        if allowed
    ]

            
def auto_space(sent):
    """Insert spaces into an unspaced string using the fitted global ``crf``.

    Every character of ``sent`` is turned into a feature dict (via the
    character-level ``word2features`` defined just above in this file), the
    model predicts one tag per character, and a space is inserted before
    every character tagged ``'B'`` except the first one.

    Args:
        sent: the unspaced input string.

    Returns:
        The input characters joined back together with the predicted spaces.
        An empty input returns ``""`` without calling the model.
    """
    if not sent:
        return ""

    chars = list(sent)

    # Feature strings use ',' as the key/value separator, so a literal comma
    # in the input used to produce 'word,,' and crash dict() with a
    # ValueError. Blank commas out for feature purposes only, mirroring the
    # comma stripping done by corpus2sent; the output keeps the original
    # characters.
    feature_chars = [ch.replace(",", "") for ch in chars]

    features = [
        dict(item.split(",", 1) for item in word2features(feature_chars, i))
        for i in range(len(feature_chars))
    ]

    # predict() takes a list of sequences; there is exactly one here.
    tags = crf.predict([features])[0]

    pieces = []
    for position, (ch, tag) in enumerate(zip(chars, tags)):
        if tag == 'B' and position > 0:
            pieces.append(" " + ch)
        else:
            pieces.append(ch)
    return "".join(pieces)