まずは、データセットを読み込む。

In [1]:
def load_data_and_labels(filename):
    """Read a one-token-per-line TSV corpus into sentences and tag sequences.

    Every data line has the form ``token<TAB>tag``. An empty line, or a line
    starting with ``-DOCSTART-``, closes the sentence collected so far.

    Args:
        filename: Path of the TSV file to read. The file is decoded as UTF-8.

    Returns:
        A ``(sents, labels)`` tuple of parallel lists: ``sents[k]`` is the
        list of tokens of the k-th sentence and ``labels[k]`` the list of
        tags for those tokens.

    Raises:
        ValueError: If a data line does not split into exactly two
            tab-separated fields.
    """
    sents, labels = [], []
    words, tags = [], []
    # Explicit encoding: do not depend on the platform's locale default
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            if not line or line.startswith('-DOCSTART-'):
                # Sentence boundary: store the sentence collected so far, if any
                if words:
                    sents.append(words)
                    labels.append(tags)
                    words, tags = [], []
            else:
                word, tag = line.split('\t')
                words.append(word)
                tags.append(tag)
    # Flush the final sentence; without this, a file that does not end with a
    # blank line would silently lose its last sentence.
    if words:
        sents.append(words)
        labels.append(tags)
    return sents, labels

# Load the corpus: sents holds the token lists, labels the matching tag lists
sents, labels = load_data_and_labels('data/dataset.tsv')

In [2]:
# Inspect the second sentence and its labels
print(sents[1])
print(labels[1])

['こ', 'の', '記', '事', 'で', 'は', '言', '語', '（', 'げ', 'ん', 'ご', '）', '、', '特', 'に', '自', '然', '言', '語', 'に', 'つ', 'い', 'て', '述', 'べ', 'る', '。']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-product/language/language_other', 'I-product/language/language_other', 'I-product/language/language_other', 'I-product/language/language_other', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


上記に示したように読み込んだデータセットでは文字単位でラベルがついている。今回作るベースラインでは単語単位で認識したいので、単語レベルにラベルを付け直すことにする。

タスクとしては以下の通り。
* 文字のリストを結合して文字列にする
* 文字列を形態素解析器で解析し、分かち書きする
* 分かち書きした単語のリストに対してラベルを付け直す。

In [3]:
# Join the characters of each sentence back into a single string
docs = [''.join(sent) for sent in sents]
docs[1]

'この記事では言語（げんご）、特に自然言語について述べる。'

In [7]:
import MeCab
# Module-level MeCab tagger; tokenize() reads it as the global name `t`
t = MeCab.Tagger()


def tokenize(sent):
    """Analyse ``sent`` with the module-level MeCab tagger ``t``.

    Args:
        sent: The text to analyse.

    Returns:
        A list of ``(surface, pos)`` tuples, where ``surface`` is the node's
        surface form and ``pos`` is the first comma-separated field of the
        node's feature string (the part of speech). The first and last nodes
        of the analysis are not included.
    """
    # Parsing an empty string first is the workaround for UnicodeDecodeError
    t.parse('')

    pairs = []
    current = t.parseToNode(sent)
    while current:
        fields = current.feature.split(',')
        pairs.append((current.surface, fields[0]))
        current = current.next

    # Drop the first and last nodes — presumably MeCab's BOS/EOS sentinels
    return pairs[1:-1]

# Run the morphological analyser over each document exactly once and derive
# both views from that single pass (previously every document was tokenized
# twice, once for the surfaces and once for the parts of speech).
analyzed_docs = [tokenize(doc) for doc in docs]
tokenized_docs = [[surface for surface, _pos in tokens] for tokens in analyzed_docs]
poses = [[pos for _surface, pos in tokens] for tokens in analyzed_docs]
# The combined (surface, pos) lists are no longer needed once split
del analyzed_docs

分かち書きまではできた。その後が面倒くさい。どうやってラベルを単語単位で付け直すか？文字と単語の対応でも取るか。
1. 形態素を1つ取り出す
2. 形態素を構成するラベルを文字列マッチングによって取り出す
3. ラベルを修正する

In [10]:
# Try a naive alignment on one sentence: walk the words in order and use the
# label of each word's first character as the word-level label.
doc = docs[1]
i = 0  # running offset into the character-level label list
for word in tokenized_docs[1]:
    j = len(word)  # number of characters this word spans
    print('{}\t{}'.format(word, labels[1][i: i+j][0]))
    i += j

この	O
記事	O
で	O
は	O
言語	O
（	O
げん	O
ご	O
）	O
、	O
特に	O
自然	B-product/language/language_other
言語	I-product/language/language_other
について	O
述べる	O
。	O


ラベルの修正でかなりいい加減な処理をしているが、ひとまずこれで良しとしよう。ではこれをすべての文書に対して行ってやる。

In [11]:
# Apply the same naive first-character alignment document by document; the
# trailing `break` stops after the first document so only it is printed.
for t_doc, label in zip(tokenized_docs, labels):
    i = 0  # running offset into this document's character-level labels
    for word in t_doc:
        j = len(word)
        # NOTE: i advances only by word lengths, so it falls behind whenever
        # the words do not account for every character of the original text.
        tag = label[i: i+j][0]
        print('{}\t{}'.format(word, tag))
        i += j
    break

アンパサンド	O
(,	O
&)	O
と	O
は	O
「	O
…	O
と	O
…	O
」	O
を	O
意味	O
する	O
記号	O
で	O
ある	B-product/title/position_vocation
。	O
ラテン語	O
の	I-product/language/language_other
の	I-product/language/language_other
合	I-product/language/language_other
字	O
で	O
、	O
Trebuchet	B-location/gpe/city
MS	I-product/weapon
フォント	I-product/weapon
で	I-product/weapon
は	O
、	O
と	O
表示	O
さ	O
れ	O
"	O
et	O
"	O
の	O
合	O
字	O
で	O
ある	O
こと	O
が	O
容易	O
に	O
わかる	O
。	O
ampersa	O
、	O
すなわち	O
"	O
and	O
per	O
se	O
and	O
"、	O
その	O
意味	O
は	O
"	O
and	O
[	O
the	O
symbol	O
which	O
]	O
by	O
itself	O
[	O
is	O
]	O
and	O
"	O
で	O
ある	O
。	O


むむっ、誤った対応付けを行っている。原因を分析するために、元の文字単位の対応付けを見てみる。

In [12]:
# Print each character of the first sentence next to its original label
for ch, label in zip(sents[0], labels[0]):
    print('{}\t{}'.format(ch, label))

ア	O
ン	O
パ	O
サ	O
ン	O
ド	O
 	O
(	O
,	O
 	O
&	O
)	O
 	O
と	O
は	O
「	O
…	O
と	O
…	O
」	O
を	O
意	O
味	O
す	O
る	O
記	B-product/title/position_vocation
号	I-product/title/position_vocation
で	O
あ	O
る	O
。	O
ラ	B-product/language/language_other
テ	I-product/language/language_other
ン	I-product/language/language_other
語	I-product/language/language_other
の	O
 	O
の	O
合	B-location/gpe/city
字	I-location/gpe/city
で	O
、	O
T	B-product/weapon
r	I-product/weapon
e	I-product/weapon
b	I-product/weapon
u	I-product/weapon
c	I-product/weapon
h	I-product/weapon
e	I-product/weapon
t	I-product/weapon
 	I-product/weapon
M	I-product/weapon
S	I-product/weapon
フ	O
ォ	O
ン	O
ト	O
で	O
は	O
、	O
と	O
表	O
示	O
さ	O
れ	O
 	O
"	O
e	O
t	O
"	O
 	O
の	O
合	O
字	O
で	O
あ	O
る	O
こ	O
と	O
が	O
容	O
易	O
に	O
わ	O
か	O
る	O
。	O
a	O
m	O
p	O
e	O
r	O
s	O
a	O
、	O
す	O
な	O
わ	O
ち	O
 	O
"	O
a	O
n	O
d	O
 	O
p	O
e	O
r	O
 	O
s	O
e	O
 	O
a	O
n	O
d	O
"	O
、	O
そ	O
の	O
意	O
味	O
は	O
"	O
a	O
n	O
d	O
 	O
[	O
t	O
h	O
e	O
 	O
s	O
y	O
m	O
b	O
o	O
l	O
 	O
w	O
h	O
i	O
c	O
h	O
]	O
 	O
b	O


どうやら、元の文字列にあった空白が形態素解析で消えたことで、対応付けにずれが生じたのが原因のようだ。単純に先頭から順に対応付けるのではなく、各単語が元の文字列のどこから始まるかを探して位置を補正する。

In [14]:
# Re-label every document at word level. For each word, locate its next
# occurrence in the original text at or after the current offset (this skips
# characters such as spaces that are missing from the tokenizer output), then
# use the label of the word's first character as the word's label.
tags = []
for t_doc, doc, label in zip(tokenized_docs, docs, labels):
    i = 0  # character offset into doc (and into its per-character labels)
    doc_tags = []
    for word in t_doc:
        # str.find scans forward from i without re-slicing the text each step
        start = doc.find(word, i)
        if start == -1:
            # Scanning one character at a time would never terminate here,
            # so fail loudly instead.
            raise ValueError(
                'token {!r} not found in document at or after offset {}'.format(word, i))
        doc_tags.append(label[start])
        i = start + len(word)
    tags.append(doc_tags)

In [17]:
# Rebind sents/labels to empty lists — presumably to release the raw
# character-level data now that tokenized_docs, poses and tags exist
sents, labels = [], []

In [33]:
# Bundle each document into a list of (word, part-of-speech, label) triples,
# keeping at most the first MAX_DOCS documents. Slicing replaces the manual
# counter/break, and the loop names no longer rebind the module-level `labels`.
MAX_DOCS = 10000
data = [
    list(zip(doc_words, doc_poses, doc_tags))
    for doc_words, doc_poses, doc_tags
    in zip(tokenized_docs[:MAX_DOCS], poses[:MAX_DOCS], tags[:MAX_DOCS])
]

In [34]:
# Display the (word, part-of-speech, label) triples of the first document
data[0]

[('アンパサンド', '名詞', 'O'),
 ('(,', '名詞', 'O'),
 ('&)', '名詞', 'O'),
 ('と', '助詞', 'O'),
 ('は', '助詞', 'O'),
 ('「', '記号', 'O'),
 ('…', '記号', 'O'),
 ('と', '助詞', 'O'),
 ('…', '記号', 'O'),
 ('」', '記号', 'O'),
 ('を', '助詞', 'O'),
 ('意味', '名詞', 'O'),
 ('する', '動詞', 'O'),
 ('記号', '名詞', 'B-product/title/position_vocation'),
 ('で', '助動詞', 'O'),
 ('ある', '助動詞', 'O'),
 ('。', '記号', 'O'),
 ('ラテン語', '名詞', 'B-product/language/language_other'),
 ('の', '助詞', 'O'),
 ('の', '助詞', 'O'),
 ('合', '名詞', 'B-location/gpe/city'),
 ('字', '名詞', 'I-location/gpe/city'),
 ('で', '助詞', 'O'),
 ('、', '記号', 'O'),
 ('Trebuchet', '名詞', 'B-product/weapon'),
 ('MS', '名詞', 'I-product/weapon'),
 ('フォント', '名詞', 'O'),
 ('で', '助詞', 'O'),
 ('は', '助詞', 'O'),
 ('、', '記号', 'O'),
 ('と', '助詞', 'O'),
 ('表示', '名詞', 'O'),
 ('さ', '動詞', 'O'),
 ('れ', '動詞', 'O'),
 ('"', '名詞', 'O'),
 ('et', '名詞', 'O'),
 ('"', '名詞', 'O'),
 ('の', '助詞', 'O'),
 ('合', '名詞', 'O'),
 ('字', '名詞', 'O'),
 ('で', '助動詞', 'O'),
 ('ある', '助動詞', 'O'),
 ('こと', '名詞', 'O'),
 ('が', '助詞', 'O'),


In [35]:
def word2features(doc, i):
    """Build the list of string features for the token at position ``i``.

    Args:
        doc: Sequence of tokens; element 0 of each token is the word and
            element 1 its part-of-speech tag.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: features of the token itself, then either
        features of the previous token or ``'BOS'``, then either features of
        the next token or ``'EOS'``.
    """
    current = doc[i]
    word, postag = current[0], current[1]
    last_index = len(doc) - 1

    # Features describing the token itself
    features = ['bias']
    features += [
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
    ]

    # Left context: mark the beginning of the document, or describe the
    # preceding token
    if i <= 0:
        features.append('BOS')
    else:
        previous = doc[i-1]
        prev_word, prev_postag = previous[0], previous[1]
        features += [
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
            '-1:word.isdigit=%s' % prev_word.isdigit(),
            '-1:postag=' + prev_postag,
        ]

    # Right context: mark the end of the document, or describe the
    # following token
    if i >= last_index:
        features.append('EOS')
    else:
        following = doc[i+1]
        next_word, next_postag = following[0], following[1]
        features += [
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
            '+1:word.isdigit=%s' % next_word.isdigit(),
            '+1:postag=' + next_postag,
        ]

    return features

In [36]:
from sklearn.model_selection import train_test_split

# Turn one document into its per-token feature lists
def extract_features(doc):
    """Return the ``word2features`` result for every position of ``doc``, in order."""
    return [word2features(doc, position) for position, _ in enumerate(doc)]

# Collect the label of every token in one document
def get_labels(doc):
    """Return the label of each ``(token, postag, label)`` triple in ``doc``, in order."""
    collected = []
    for _token, _postag, tag in doc:
        collected.append(tag)
    return collected

# Feature sequences (X) and label sequences (y), one entry per document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents as the test set.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run — confirm whether a fixed seed is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [37]:
import pycrfsuite
# Create the trainer with verbose output enabled
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0...

.1....2..

..3...

.4....

5....6.

...7...

.8....9

....10


Number of features: 300304
Seconds required: 3.262



L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 1255329.670059
Feature norm: 1.000000
Error norm: 1086621.245355
Active features: 299406
Line search trials: 1
Line search step: 0.000001
Seconds required for this iteration: 137.017



***** Iteration #2 *****
Loss: 972490.398686
Feature norm: 3.767060
Error norm: 270077.210111
Active features: 299136
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 206.104

***** Iteration #3 *****
Loss: 826168.655660
Feature norm: 3.235413
Error norm: 265801.816771
Active features: 286386
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.811

***** Iteration #4 *****
Loss: 576012.791648
Feature norm: 2.263571
Error norm: 99251.459715
Active features: 290478
Line search trials: 6
Line search step: 0.031250
Seconds required for this iteration: 404.000

***** Iteration #5 *****
Loss: 571203.997807
Feature norm: 2.163320
Error norm: 213328.282475
Active features: 297367
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 131.958



***** Iteration #6 *****
Loss: 539565.154627
Feature norm: 2.495464
Error norm: 50253.399666
Active features: 297609
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.410

***** Iteration #7 *****
Loss: 505613.391666
Feature norm: 3.150976
Error norm: 42158.384273
Active features: 298146
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.569



***** Iteration #8 *****
Loss: 440981.513386
Feature norm: 7.573255
Error norm: 208597.683597
Active features: 282771
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.428

***** Iteration #9 *****
Loss: 416171.943560
Feature norm: 7.344106
Error norm: 71996.312514
Active features: 288215
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 132.404



***** Iteration #10 *****
Loss: 409957.831158
Feature norm: 7.538054
Error norm: 29075.812597
Active features: 288615
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.058

***** Iteration #11 *****
Loss: 402650.456038
Feature norm: 7.913659
Error norm: 50358.758864
Active features: 287677
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.535

***** Iteration #12 *****
Loss: 391710.869263
Feature norm: 8.797099
Error norm: 92160.013051
Active features: 285687
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.962

***** Iteration #13 *****
Loss: 379295.233918
Feature norm: 9.578862
Error norm: 52399.495488
Active features: 281924
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 132.051



***** Iteration #14 *****
Loss: 371696.625744
Feature norm: 10.637282
Error norm: 86970.669827
Active features: 279034
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.749

***** Iteration #15 *****
Loss: 361045.977427
Feature norm: 11.941792
Error norm: 75955.093371
Active features: 277693
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.837

***** Iteration #16 *****
Loss: 352454.157905
Feature norm: 12.900655
Error norm: 53150.507172
Active features: 277155
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.693

***** Iteration #17 *****
Loss: 344384.878372
Feature norm: 14.339560
Error norm: 52850.691616
Active features: 276515
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.824

***** Iteration #18 *****
Loss: 336448.302638
Feature norm: 15.849092
Error norm: 53195.253636
Active features: 276215
Line search trials: 1
Line se

***** Iteration #19 *****
Loss: 328400.388219
Feature norm: 17.546635
Error norm: 66931.124957
Active features: 275600
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.996

***** Iteration #20 *****
Loss: 319296.931244
Feature norm: 19.044240
Error norm: 27519.794232
Active features: 275377
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.071

***** Iteration #21 *****
Loss: 308871.236601
Feature norm: 20.869392
Error norm: 34532.941629
Active features: 274665
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.888

***** Iteration #22 *****
Loss: 291471.349134
Feature norm: 24.312358
Error norm: 30452.974247
Active features: 272565
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.033



***** Iteration #23 *****
Loss: 274745.680628
Feature norm: 27.502898
Error norm: 32099.441571
Active features: 229815
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.062

***** Iteration #24 *****
Loss: 259479.326580
Feature norm: 30.615261
Error norm: 12363.350182
Active features: 218699
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.188

***** Iteration #25 *****
Loss: 244458.423501
Feature norm: 36.773004
Error norm: 56341.396611
Active features: 213666
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.283



***** Iteration #26 *****
Loss: 232626.357040
Feature norm: 40.011890
Error norm: 12247.328538
Active features: 212249
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.153

***** Iteration #27 *****
Loss: 221060.647361
Feature norm: 47.325765
Error norm: 48812.942714
Active features: 204633
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.479



***** Iteration #28 *****
Loss: 211915.742568
Feature norm: 50.549633
Error norm: 7996.229199
Active features: 204323
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.434

***** Iteration #29 *****
Loss: 199278.804089
Feature norm: 59.236839
Error norm: 13208.448295
Active features: 199767
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.328

***** Iteration #30 *****
Loss: 185059.598927
Feature norm: 71.575891
Error norm: 33447.446191
Active features: 197005
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.384



***** Iteration #31 *****
Loss: 174299.409627
Feature norm: 82.771869
Error norm: 26167.397290
Active features: 197020
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.178

***** Iteration #32 *****
Loss: 167311.797946
Feature norm: 88.677568
Error norm: 5751.680887
Active features: 198543
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.367



***** Iteration #33 *****
Loss: 155205.162703
Feature norm: 108.530475
Error norm: 12269.768311
Active features: 198032
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.164

***** Iteration #34 *****
Loss: 152728.747565
Feature norm: 122.325854
Error norm: 21591.423509
Active features: 196931
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.282

***** Iteration #35 *****
Loss: 146562.983998
Feature norm: 123.846807
Error norm: 14571.799977
Active features: 198356
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 133.865

***** Iteration #36 *****
Loss: 141106.534435
Feature norm: 126.534939
Error norm: 6164.916598
Active features: 198489
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.663

***** Iteration #37 *****
Loss: 137275.178191
Feature norm: 133.773800
Error norm: 26461.196381
Active features: 197395
Line search trials: 1
Li

***** Iteration #38 *****
Loss: 133939.919268
Feature norm: 141.074184
Error norm: 64592.557227
Active features: 197292
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.751

***** Iteration #39 *****
Loss: 128860.670864
Feature norm: 142.096135
Error norm: 8512.791618
Active features: 198774
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.110

***** Iteration #40 *****
Loss: 124445.135411
Feature norm: 147.270890
Error norm: 13852.047139
Active features: 197579
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.876



***** Iteration #41 *****
Loss: 115140.762614
Feature norm: 163.958146
Error norm: 16866.314647
Active features: 193954
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.154

***** Iteration #42 *****
Loss: 109733.826682
Feature norm: 172.956336
Error norm: 12641.899313
Active features: 194045
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.192

***** Iteration #43 *****
Loss: 102395.862362
Feature norm: 187.039828
Error norm: 4781.908708
Active features: 192797
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.212



***** Iteration #44 *****
Loss: 95236.495281
Feature norm: 216.942309
Error norm: 39330.066120
Active features: 189714
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.128

***** Iteration #45 *****
Loss: 90824.734414
Feature norm: 221.517732
Error norm: 4109.646222
Active features: 190412
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.027

***** Iteration #46 *****
Loss: 86591.092773
Feature norm: 234.103256
Error norm: 20852.311426
Active features: 187888
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.159

***** Iteration #47 *****
Loss: 80502.927561
Feature norm: 256.191269
Error norm: 10264.443327
Active features: 186180
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.971

***** Iteration #48 *****
Loss: 75136.628436
Feature norm: 272.902264
Error norm: 6428.217701
Active features: 184698
Line search trials: 1
Line sear

***** Iteration #50 *****
Loss: 65741.190539
Feature norm: 325.435725
Error norm: 13068.107764
Active features: 182346
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.660

***** Iteration #51 *****
Loss: 63617.118551
Feature norm: 335.677241
Error norm: 5211.909890
Active features: 182051
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.983



***** Iteration #52 *****
Loss: 59655.162571
Feature norm: 371.752240
Error norm: 27222.642016
Active features: 179826
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.025

***** Iteration #53 *****
Loss: 58291.214899
Feature norm: 366.688138
Error norm: 3705.028144
Active features: 180741
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.542



***** Iteration #54 *****
Loss: 56965.057110
Feature norm: 376.330179
Error norm: 12105.646686
Active features: 180429
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.539

***** Iteration #55 *****
Loss: 54955.954793
Feature norm: 389.097995
Error norm: 5000.679445
Active features: 179568
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.659



***** Iteration #56 *****
Loss: 52741.860756
Feature norm: 410.544907
Error norm: 4968.941057
Active features: 177482
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.211

***** Iteration #57 *****
Loss: 51474.944678
Feature norm: 425.240441
Error norm: 3469.732040
Active features: 175959
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.183

***** Iteration #58 *****
Loss: 50307.453011
Feature norm: 440.439725
Error norm: 4970.486058
Active features: 173900
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.120



***** Iteration #59 *****
Loss: 49579.775294
Feature norm: 447.520299
Error norm: 2148.701225
Active features: 172270
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.125

***** Iteration #60 *****
Loss: 48838.097497
Feature norm: 457.362628
Error norm: 2862.233670
Active features: 168563
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.087

***** Iteration #61 *****
Loss: 48251.248010
Feature norm: 465.529765
Error norm: 5373.643983
Active features: 164548
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.850



***** Iteration #62 *****
Loss: 47722.781095
Feature norm: 471.975938
Error norm: 1469.034476
Active features: 163716
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.616

***** Iteration #63 *****
Loss: 47208.601498
Feature norm: 477.271479
Error norm: 1369.929866
Active features: 162688
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 73.432

***** Iteration #64 *****
Loss: 46757.232878
Feature norm: 482.846177
Error norm: 7002.417787
Active features: 157763
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.386



***** Iteration #65 *****
Loss: 46352.922011
Feature norm: 485.469312
Error norm: 3024.365127
Active features: 157278
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.488

***** Iteration #66 *****
Loss: 45970.885857
Feature norm: 490.915884
Error norm: 4650.243170
Active features: 155931
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.277

***** Iteration #67 *****
Loss: 45568.937937
Feature norm: 494.606641
Error norm: 2391.713856
Active features: 154296
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.848



***** Iteration #68 *****
Loss: 45202.780979
Feature norm: 500.672777
Error norm: 5282.220394
Active features: 151973
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.412

***** Iteration #69 *****
Loss: 44881.688284
Feature norm: 503.454132
Error norm: 2883.325770
Active features: 149973
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.856

***** Iteration #70 *****
Loss: 44581.711265
Feature norm: 508.844773
Error norm: 4563.647554
Active features: 146583
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.113



***** Iteration #71 *****
Loss: 44330.786398
Feature norm: 510.968898
Error norm: 2411.831982
Active features: 146073
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.668

***** Iteration #72 *****
Loss: 44093.586343
Feature norm: 515.067874
Error norm: 3762.165773
Active features: 145042
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.456

***** Iteration #73 *****
Loss: 43877.324554
Feature norm: 516.835155
Error norm: 1762.722045
Active features: 144307
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.525



***** Iteration #74 *****
Loss: 43669.689448
Feature norm: 520.287613
Error norm: 3499.041449
Active features: 143253
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.971

***** Iteration #75 *****
Loss: 43489.221838
Feature norm: 521.623545
Error norm: 1521.717258
Active features: 142617
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.138

***** Iteration #76 *****
Loss: 43322.021954
Feature norm: 524.506960
Error norm: 3233.433599
Active features: 141729
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.084



***** Iteration #77 *****
Loss: 43166.423229
Feature norm: 525.683434
Error norm: 1323.918374
Active features: 141078
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.112

***** Iteration #78 *****
Loss: 43016.858663
Feature norm: 528.215238
Error norm: 3010.025575
Active features: 139620
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.104



***** Iteration #79 *****
Loss: 42875.022051
Feature norm: 529.243422
Error norm: 1478.104278
Active features: 139192
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.177

***** Iteration #80 *****
Loss: 42745.100118
Feature norm: 531.464757
Error norm: 2874.531587
Active features: 138648
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.196

***** Iteration #81 *****
Loss: 42624.166122
Feature norm: 532.393691
Error norm: 1666.279750
Active features: 138179
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.127



***** Iteration #82 *****
Loss: 42507.359446
Feature norm: 534.520900
Error norm: 2664.488784
Active features: 137649
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.173

***** Iteration #83 *****
Loss: 42409.224663
Feature norm: 535.265355
Error norm: 1640.050238
Active features: 137283
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.217



***** Iteration #84 *****
Loss: 42308.464644
Feature norm: 536.965924
Error norm: 1897.296338
Active features: 136922
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.648

***** Iteration #85 *****
Loss: 42215.472072
Feature norm: 537.823710
Error norm: 1502.130602
Active features: 136313
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.057



***** Iteration #86 *****
Loss: 42131.839427
Feature norm: 539.286334
Error norm: 1988.880395
Active features: 136011
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.277



***** Iteration #87 *****
Loss: 42056.105296
Feature norm: 539.861145
Error norm: 1236.563860
Active features: 135622
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.933

***** Iteration #88 *****
Loss: 41981.993534
Feature norm: 541.105075
Error norm: 1941.883799
Active features: 135085
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.482

***** Iteration #89 *****
Loss: 41917.620550
Feature norm: 541.480574
Error norm: 1199.551754
Active features: 134926
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.744

***** Iteration #90 *****
Loss: 41852.426345
Feature norm: 542.487137
Error norm: 1835.606097
Active features: 134542
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.123



***** Iteration #91 *****
Loss: 41796.278986
Feature norm: 542.790320
Error norm: 1347.192474
Active features: 134157
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.162

***** Iteration #92 *****
Loss: 41741.551494
Feature norm: 543.596721
Error norm: 1309.152898
Active features: 133833
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.131

***** Iteration #93 *****
Loss: 41692.173474
Feature norm: 543.878061
Error norm: 974.490876
Active features: 133524
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.326

***** Iteration #94 *****
Loss: 41642.466420
Feature norm: 544.561291
Error norm: 1445.629883
Active features: 133159
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.279



***** Iteration #95 *****
Loss: 41599.954195
Feature norm: 544.729383
Error norm: 1108.303185
Active features: 132929
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.713

***** Iteration #96 *****
Loss: 41555.107071
Feature norm: 545.356475
Error norm: 1271.482923
Active features: 132620
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.888



***** Iteration #97 *****
Loss: 41511.756573
Feature norm: 545.593204
Error norm: 1011.164370
Active features: 132239
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.147

***** Iteration #98 *****
Loss: 41473.465775
Feature norm: 546.273992
Error norm: 1676.145559
Active features: 132021
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.653

***** Iteration #99 *****
Loss: 41437.380340
Feature norm: 546.453752
Error norm: 1183.117191
Active features: 131752
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.321

***** Iteration #100 *****
Loss: 41400.914840
Feature norm: 547.172393
Error norm: 1566.112508
Active features: 131507
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.547

***** Iteration #101 *****
Loss: 41366.184541
Feature norm: 547.405072
Error norm: 1498.539475
Active features: 131203
Line search trials: 1
Line searc

***** Iteration #102 *****
Loss: 41333.787397
Feature norm: 548.096482
Error norm: 1548.099123
Active features: 131064
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.265

***** Iteration #103 *****
Loss: 41302.729428
Feature norm: 548.269635
Error norm: 1168.596641
Active features: 130866
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.112



***** Iteration #104 *****
Loss: 41270.869053
Feature norm: 548.873013
Error norm: 1412.258969
Active features: 130764
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.151

***** Iteration #105 *****
Loss: 41240.590799
Feature norm: 549.003963
Error norm: 1315.551027
Active features: 130529
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.240



***** Iteration #106 *****
Loss: 41212.082593
Feature norm: 549.604941
Error norm: 1475.180833
Active features: 130396
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.235

***** Iteration #107 *****
Loss: 41184.855992
Feature norm: 549.700902
Error norm: 1035.784518
Active features: 130254
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.418



***** Iteration #108 *****
Loss: 41156.921915
Feature norm: 550.137683
Error norm: 1031.359931
Active features: 130110
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.897

***** Iteration #109 *****
Loss: 41130.977604
Feature norm: 550.249536
Error norm: 1102.843195
Active features: 129947
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.128



***** Iteration #110 *****
Loss: 41106.121082
Feature norm: 550.643354
Error norm: 1031.751014
Active features: 129849
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.132

***** Iteration #111 *****
Loss: 41081.420411
Feature norm: 550.672710
Error norm: 913.741787
Active features: 129559
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.271

***** Iteration #112 *****
Loss: 41056.081168
Feature norm: 550.976961
Error norm: 1036.466638
Active features: 129325
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.386



***** Iteration #113 *****
Loss: 41034.674157
Feature norm: 550.914704
Error norm: 1055.603741
Active features: 129093
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.035

***** Iteration #114 *****
Loss: 41012.299308
Feature norm: 551.104461
Error norm: 718.636622
Active features: 128894
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.110



***** Iteration #115 *****
Loss: 40988.602800
Feature norm: 550.996420
Error norm: 806.212417
Active features: 128585
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.204

***** Iteration #116 *****
Loss: 40966.206151
Feature norm: 551.152826
Error norm: 931.857808
Active features: 128348
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.309



***** Iteration #117 *****
Loss: 40946.022179
Feature norm: 551.017174
Error norm: 740.779717
Active features: 128128
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.661



***** Iteration #118 *****
Loss: 40924.050051
Feature norm: 551.138588
Error norm: 854.427414
Active features: 127887
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.389

***** Iteration #119 *****
Loss: 40903.141207
Feature norm: 550.995103
Error norm: 978.279300
Active features: 127656
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.426



***** Iteration #120 *****
Loss: 40883.054212
Feature norm: 551.191500
Error norm: 939.953624
Active features: 127548
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.124



***** Iteration #121 *****
Loss: 40863.806279
Feature norm: 551.074788
Error norm: 884.634851
Active features: 127297
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.449

***** Iteration #122 *****
Loss: 40843.564344
Feature norm: 551.307498
Error norm: 1018.103243
Active features: 127135
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.060



***** Iteration #123 *****
Loss: 40824.774231
Feature norm: 551.189640
Error norm: 845.184222
Active features: 126901
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.239



***** Iteration #124 *****
Loss: 40806.111587
Feature norm: 551.438153
Error norm: 978.271196
Active features: 126766
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.392

***** Iteration #125 *****
Loss: 40787.734688
Feature norm: 551.353797
Error norm: 757.502836
Active features: 126543
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.899

***** Iteration #126 *****
Loss: 40769.989251
Feature norm: 551.628857
Error norm: 1082.477750
Active features: 126440
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.021

***** Iteration #127 *****
Loss: 40753.292392
Feature norm: 551.578974
Error norm: 871.042075
Active features: 126325
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.463



***** Iteration #128 *****
Loss: 40736.625196
Feature norm: 551.889359
Error norm: 1060.420796
Active features: 126246
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.703

***** Iteration #129 *****
Loss: 40720.170123
Feature norm: 551.847355
Error norm: 810.397290
Active features: 126049
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.588

***** Iteration #130 *****
Loss: 40703.314524
Feature norm: 552.179913
Error norm: 1064.158144
Active features: 125932
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 72.354



***** Iteration #131 *****
Loss: 40687.627562
Feature norm: 552.120901
Error norm: 842.403184
Active features: 125767
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.860

***** Iteration #132 *****
Loss: 40673.110701
Feature norm: 552.447616
Error norm: 1067.329060
Active features: 125644
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 74.703

***** Iteration #133 *****
Loss: 40659.106907
Feature norm: 552.373491
Error norm: 851.147097
Active features: 125478
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 72.695



***** Iteration #134 *****
Loss: 40645.877779
Feature norm: 552.693693
Error norm: 1219.272060
Active features: 125354
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.984

***** Iteration #135 *****
Loss: 40633.547288
Feature norm: 552.616634
Error norm: 1017.389689
Active features: 125151
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.766

***** Iteration #136 *****
Loss: 40621.424641
Feature norm: 552.945429
Error norm: 1018.060328
Active features: 125156
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.178

***** Iteration #137 *****
Loss: 40609.584060
Feature norm: 552.871699
Error norm: 883.389265
Active features: 125009
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.111



***** Iteration #138 *****
Loss: 40597.092755
Feature norm: 553.148722
Error norm: 969.013071
Active features: 124894
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.668



***** Iteration #139 *****
Loss: 40585.417714
Feature norm: 553.059114
Error norm: 1063.438424
Active features: 124718
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.218

***** Iteration #140 *****
Loss: 40573.852517
Feature norm: 553.377104
Error norm: 1078.240121
Active features: 124688
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.687

***** Iteration #141 *****
Loss: 40563.421495
Feature norm: 553.273029
Error norm: 983.977809
Active features: 124561
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.432



***** Iteration #142 *****
Loss: 40552.203401
Feature norm: 553.563861
Error norm: 853.701371
Active features: 124510
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.436



***** Iteration #143 *****
Loss: 40541.868036
Feature norm: 553.481462
Error norm: 984.208019
Active features: 124368
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.657

***** Iteration #144 *****
Loss: 40531.036459
Feature norm: 553.781177
Error norm: 941.127427
Active features: 124385
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 73.613

***** Iteration #145 *****
Loss: 40521.969128
Feature norm: 553.671692
Error norm: 1083.064439
Active features: 124244
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.937

***** Iteration #146 *****
Loss: 40512.027172
Feature norm: 553.961201
Error norm: 740.525221
Active features: 124260
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 73.160



***** Iteration #147 *****
Loss: 40502.867795
Feature norm: 553.883175
Error norm: 841.934389
Active features: 124093
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.138

***** Iteration #148 *****
Loss: 40493.005044
Feature norm: 554.162302
Error norm: 925.092738
Active features: 124044
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.087



***** Iteration #149 *****
Loss: 40484.659099
Feature norm: 554.062016
Error norm: 1105.597661
Active features: 123948
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.712

***** Iteration #150 *****
Loss: 40475.405489
Feature norm: 554.361373
Error norm: 906.966606
Active features: 123985
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.702



***** Iteration #151 *****
Loss: 40467.833559
Feature norm: 554.249002
Error norm: 987.115119
Active features: 123812
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.399



***** Iteration #152 *****
Loss: 40459.446946
Feature norm: 554.484676
Error norm: 755.054679
Active features: 123740
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.207

***** Iteration #153 *****
Loss: 40452.192836
Feature norm: 554.379024
Error norm: 1037.742894
Active features: 123526
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.743

***** Iteration #154 *****
Loss: 40444.173221
Feature norm: 554.643249
Error norm: 875.032486
Active features: 123544
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 85.129

***** Iteration #155 *****
Loss: 40437.259672
Feature norm: 554.511079
Error norm: 1008.926575
Active features: 123401
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 110.015



***** Iteration #156 *****
Loss: 40429.584875
Feature norm: 554.743250
Error norm: 751.589486
Active features: 123384
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 81.506

***** Iteration #157 *****
Loss: 40423.189977
Feature norm: 554.628223
Error norm: 1006.349237
Active features: 123222
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.703

***** Iteration #158 *****
Loss: 40416.190873
Feature norm: 554.884621
Error norm: 885.887459
Active features: 123180
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.301

***** Iteration #159 *****
Loss: 40410.536351
Feature norm: 554.769834
Error norm: 1052.398055
Active features: 123065
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 69.118



***** Iteration #160 *****
Loss: 40403.753362
Feature norm: 555.022429
Error norm: 778.282611
Active features: 123085
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 70.693

***** Iteration #161 *****
Loss: 40398.229103
Feature norm: 554.936125
Error norm: 910.480038
Active features: 122994
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.893

***** Iteration #162 *****
Loss: 40391.794686
Feature norm: 555.172530
Error norm: 858.070736
Active features: 122945
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 67.115

***** Iteration #163 *****
Loss: 40386.806370
Feature norm: 555.072138
Error norm: 1094.444052
Active features: 122821
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 68.092



***** Iteration #164 *****
Loss: 40380.526243
Feature norm: 555.359449
Error norm: 954.036988
Active features: 122870
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.848

***** Iteration #165 *****
Loss: 40375.525589
Feature norm: 555.259239
Error norm: 977.453779
Active features: 122807
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.997

***** Iteration #166 *****
Loss: 40369.912767
Feature norm: 555.510719
Error norm: 835.871627
Active features: 122778
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.131

***** Iteration #167 *****
Loss: 40364.945734
Feature norm: 555.435434
Error norm: 924.678554
Active features: 122665
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.099

***** Iteration #168 *****
Loss: 40359.683725
Feature norm: 555.711409
Error norm: 1058.685343
Active features: 122606
Line search trials: 1
Line search

***** Iteration #174 *****
Loss: 40330.870550
Feature norm: 556.320271
Error norm: 1208.778104
Active features: 122217
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.022

***** Iteration #175 *****
Loss: 40325.737085
Feature norm: 556.214770
Error norm: 967.193539
Active features: 122168
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.000

***** Iteration #176 *****
Loss: 40321.178177
Feature norm: 556.461823
Error norm: 938.967868
Active features: 122115
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.038

***** Iteration #177 *****
Loss: 40316.901305
Feature norm: 556.396439
Error norm: 927.761755
Active features: 122056
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.045

***** Iteration #178 *****
Loss: 40313.302373
Feature norm: 556.665474
Error norm: 1209.892097
Active features: 121995
Line search trials: 1
Line searc

***** Iteration #179 *****
Loss: 40308.684609
Feature norm: 556.576503
Error norm: 972.471624
Active features: 121928
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.320

***** Iteration #180 *****
Loss: 40304.986638
Feature norm: 556.841925
Error norm: 1114.009836
Active features: 121922
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.976

***** Iteration #181 *****
Loss: 40300.684235
Feature norm: 556.767631
Error norm: 846.253690
Active features: 121922
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.043

***** Iteration #182 *****
Loss: 40297.327585
Feature norm: 557.007652
Error norm: 1116.420316
Active features: 121931
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.239

***** Iteration #183 *****
Loss: 40293.554367
Feature norm: 556.929661
Error norm: 943.749752
Active features: 121873
Line search trials: 1
Line searc

***** Iteration #184 *****
Loss: 40290.661764
Feature norm: 557.198645
Error norm: 1236.575770
Active features: 121858
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.854

***** Iteration #185 *****
Loss: 40286.331139
Feature norm: 557.125377
Error norm: 805.746290
Active features: 121796
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.975

***** Iteration #186 *****
Loss: 40283.487334
Feature norm: 557.360049
Error norm: 1106.105787
Active features: 121778
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.965

***** Iteration #187 *****
Loss: 40279.641983
Feature norm: 557.310632
Error norm: 798.674181
Active features: 121713
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.033



***** Iteration #188 *****
Loss: 40277.390213
Feature norm: 557.559201
Error norm: 1235.159311
Active features: 121746
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.347

***** Iteration #189 *****
Loss: 40273.199321
Feature norm: 557.508567
Error norm: 775.009735
Active features: 121665
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 71.446



***** Iteration #190 *****
Loss: 40270.996716
Feature norm: 557.737941
Error norm: 1174.757836
Active features: 121670
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.882

***** Iteration #191 *****
Loss: 40267.090580
Feature norm: 557.696107
Error norm: 744.527540
Active features: 121589
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 64.864



***** Iteration #192 *****
Loss: 40265.055674
Feature norm: 557.918780
Error norm: 1161.531909
Active features: 121569
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 64.832

***** Iteration #193 *****
Loss: 40261.215526
Feature norm: 557.880683
Error norm: 726.395578
Active features: 121469
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 64.691



***** Iteration #194 *****
Loss: 40259.572596
Feature norm: 558.100369
Error norm: 1210.283378
Active features: 121451
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 64.756

***** Iteration #195 *****
Loss: 40255.693393
Feature norm: 558.066390
Error norm: 700.428679
Active features: 121362
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 64.896

***** Iteration #196 *****
Loss: 40253.965972
Feature norm: 558.279325
Error norm: 1156.868899
Active features: 121397
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.614



***** Iteration #197 *****
Loss: 40250.354119
Feature norm: 558.250999
Error norm: 635.150206
Active features: 121380
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.798

***** Iteration #198 *****
Loss: 40248.634020
Feature norm: 558.448835
Error norm: 1138.731480
Active features: 121392
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 66.352

***** Iteration #199 *****
Loss: 40245.225468
Feature norm: 558.420683
Error norm: 654.932772
Active features: 121390
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 64.875

***** Iteration #200 *****
Loss: 40244.075596
Feature norm: 558.618250
Error norm: 1242.795022
Active features: 121397
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 65.151



L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 14360.192

Storing the model
Number of active features: 121397 (300304)
Number of active attributes: 53181 (138631)
Number of active labels: 274 (274)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.332



In [38]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Inspect one test sentence: print every token next to its predicted label.
i = 12
# NOTE(review): assumes the second feature of each token is a "name=value"
# string whose value is the surface form -- confirm against the feature
# extractor. Split on the first "=" only, so a token that itself contains
# "=" is printed intact instead of being truncated.
for pred_label, features in zip(y_pred[i], X_test[i]):
    token = features[1].split("=", 1)[1]
    print('{}\t{}'.format(token, pred_label))

小池田	O
マヤ	O
（	O
こ	O
いけ	O
だ	O
ま	O
や	O
、	O
1969	B-timex/date
年	I-timex/date
5	B-timex/date
月	I-timex/date
4	I-timex/date
日	I-timex/date
-	O
）	O
は	O
、	O
日本	B-location/gpe/country
の	O
漫画	B-product/art/book
家	I-product/art/book
。	O
山口	B-location/gpe/province
県	I-location/gpe/province
光	B-location/gpe/city
市	I-location/gpe/city
虹	B-location/gpe/city
ヶ	I-location/gpe/city
丘	I-location/gpe/city
生まれ	O
、	O
大阪	B-location/gpe/province
府	I-location/gpe/province
出身	O
。	O
京都市立芸術大学	O
版画	O
科	O
卒業	O
。	O
女性	O
。	O


In [39]:
def get_entities(seq):
    """Gets entities from sequence.

    A chunk starts at a tag beginning with 'B' and extends over the
    immediately following tags that begin with 'I' and carry the same type.
    'I' tags that do not continue a chunk are ignored.

    Args:
        seq (list): sequence of labels such as 'B-PER', 'I-PER', 'O'.
            Any sequence (list or tuple) is accepted; it is not modified.

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end), where
            chunk_end is exclusive.

    Example:
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> print(get_entities(seq))
        [('PER', 0, 2), ('LOC', 3, 4)]
    """
    # Work on a copy with an 'O' sentinel appended, so a chunk that runs to
    # the end of the sequence is always terminated inside the scan below.
    seq = list(seq) + ['O']
    # Split on the first '-' only: the prefix is the B/I marker, everything
    # after it is the type, which may itself contain hyphens.
    types = [tag.split('-', 1)[-1] for tag in seq]

    chunks = []
    i = 0
    while i < len(seq):
        if not seq[i].startswith('B'):
            i += 1
            continue
        # The sentinel guarantees seq[j] exists and eventually stops the scan.
        j = i + 1
        while seq[j].startswith('I') and types[j] == types[i]:
            j += 1
        chunks.append((types[i], i, j))
        i = j
    return chunks


def f1_score(y_true, y_pred, sequence_lengths):
    """Evaluates the chunk-level f1 score.

    Each label sequence is cut to its given length and converted to a set
    of (type, start, end) chunks with get_entities. A predicted chunk is
    correct only if an identical chunk exists in the true labels. Counts
    are accumulated over all sequences before precision and recall are
    computed.

    Args:
        y_true (list): true labels, one label sequence per sentence.
        y_pred (list): predicted labels, one label sequence per sentence.
        sequence_lengths (list): number of leading labels to evaluate in
            each sentence.

    Returns:
        float: f1 score; 0.0 when no predicted chunk is correct.

    Example:
        >>> y_true = [['B-PER', 'I-PER', 'O', 'B-LOC']]
        >>> y_pred = [['B-PER', 'I-PER', 'O', 'B-PER']]
        >>> sequence_lengths = [4]
        >>> print(f1_score(y_true, y_pred, sequence_lengths))
        0.5
    """
    correct_preds, total_correct, total_preds = 0., 0., 0.
    for lab, lab_pred, length in zip(y_true, y_pred, sequence_lengths):
        true_chunks = set(get_entities(lab[:length]))
        pred_chunks = set(get_entities(lab_pred[:length]))

        correct_preds += len(true_chunks & pred_chunks)
        total_preds += len(pred_chunks)
        total_correct += len(true_chunks)

    # With no correct chunk, precision and recall are both zero (and the
    # totals may be zero as well), so return early instead of dividing.
    if correct_preds == 0:
        return 0.0

    p = correct_preds / total_preds
    r = correct_preds / total_correct
    return 2 * p * r / (p + r)

# Score every test sentence over its full length.
lengths = list(map(len, y_test))
f1_score(y_test, y_pred, lengths)

0.6474437410225035

In [40]:
# Count how many entries `tokenized_docs` holds.
len(tokenized_docs)

1078768