In [10]:
import numpy as np

### Open Words with BIO Notation Label

In [132]:
import json
def get_BIO_data(filename):
    with open(filename) as f:
        labeled = json.load(f)

    display(type(labeled))
    data = []
    for sentence in labeled:
        sen = []
        for item in sentence:
            tup = (item['token'],item['pos'],item['label'])
            sen.append(tup)
        data.append(sen)
    return data

def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    features = [
        'bias',
        'word.lower=' + word.lower(),
        'postag=' + postag
    ]
    # Fitur untuk kata non awal kalimat
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1
        ])
    else:
        features.append('BOS') # Tanda awal kalimat
    # Fitur untuk kata non akhir kalimat
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1
        ])
    else:
        features.append('EOS') #tanda akhir kalimat
    return features

def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

def get_labels(doc):
    return [label for (token, postag, label) in doc]

def get_split_index(X,n_split):
    kf = KFold(n_splits=n_split)
    train_split = []
    test_split = []
    for train_index, test_index in kf.split(X):
        train_split.append(train_index)
        test_split.append(test_index)
    return train_split,test_split

In [135]:
import pycrfsuite
def generate_model(X,y,train_split,test_split):
    trainer = pycrfsuite.Trainer(verbose=True)
    for i in range(len(train_split)):
        X_train = [X[j] for j in train_split[i]]
        y_train = [y[j] for j in train_split[i]]
        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({
            'c1': 0.1, #L1 penalty
            'c2': 0.01, #L2 penalty
            'max_iterations': 200,
            'feature.possible_transitions': True
        })
        trainer.train('crf.model_'+str(i))

In [136]:
data = get_BIO_data('reviews-aspect.json')
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
train_split, test_split = get_split_index(X,10)

#Generate Model with pycrfsuite
generate_model(X,y,train_split,test_split)

list

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5143
Seconds required: 0.015

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2965.432524
Feature norm: 1.000000
Error norm: 2045.092997
Active features: 5054
Line search trials: 1
Line search step: 0.000126
Seconds required for this iteration: 0.005

***** Iteration #2 *****
Loss: 2537.814600
Feature norm: 1.345533
Error norm: 717.591510
Active features: 4811
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #3 *****
Loss: 2433.836074
Feature norm: 1.471054
Error norm: 658.487876
Active features: 4984
Line search trials: 1
Line search step: 1.000000
Seconds required for this iterat

***** Iteration #74 *****
Loss: 423.797137
Feature norm: 61.771441
Error norm: 11.528746
Active features: 1482
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #75 *****
Loss: 423.654652
Feature norm: 61.736057
Error norm: 5.419477
Active features: 1482
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #76 *****
Loss: 423.540157
Feature norm: 61.773770
Error norm: 8.944973
Active features: 1489
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #77 *****
Loss: 423.422176
Feature norm: 61.761684
Error norm: 4.824096
Active features: 1486
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #78 *****
Loss: 423.329004
Feature norm: 61.797866
Error norm: 10.923947
Active features: 1486
Line search trials: 1
Line search step: 1.000000
Seconds required for this i

***** Iteration #147 *****
Loss: 420.281570
Feature norm: 62.830174
Error norm: 5.906312
Active features: 1377
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #148 *****
Loss: 420.280866
Feature norm: 62.844584
Error norm: 10.379672
Active features: 1375
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #149 *****
Loss: 420.225325
Feature norm: 62.841522
Error norm: 4.802472
Active features: 1370
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #150 *****
Loss: 420.209690
Feature norm: 62.847791
Error norm: 7.787228
Active features: 1368
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #151 *****
Loss: 420.184433
Feature norm: 62.844843
Error norm: 6.277330
Active features: 1369
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #197 *****
Loss: 419.391296
Feature norm: 62.765699
Error norm: 2.864041
Active features: 1364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #198 *****
Loss: 419.381434
Feature norm: 62.765037
Error norm: 4.660118
Active features: 1363
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.006

***** Iteration #199 *****
Loss: 419.369950
Feature norm: 62.760912
Error norm: 4.081389
Active features: 1361
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.005

***** Iteration #200 *****
Loss: 419.359413
Feature norm: 62.763702
Error norm: 5.272094
Active features: 1360
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.008

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 0.738

Storing the model
Number of active features: 1360 (5143)
Number of active attributes

***** Iteration #56 *****
Loss: 663.182602
Feature norm: 77.282932
Error norm: 19.824275
Active features: 1988
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #57 *****
Loss: 662.270239
Feature norm: 77.399817
Error norm: 22.543584
Active features: 1976
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #58 *****
Loss: 661.467432
Feature norm: 77.475897
Error norm: 22.974004
Active features: 1959
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #59 *****
Loss: 660.548589
Feature norm: 77.661862
Error norm: 18.751408
Active features: 1951
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #60 *****
Loss: 659.788375
Feature norm: 77.724766
Error norm: 14.587613
Active features: 1938
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #99 *****
Loss: 648.770598
Feature norm: 80.210797
Error norm: 20.670429
Active features: 1760
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #100 *****
Loss: 648.624368
Feature norm: 80.209202
Error norm: 7.759823
Active features: 1757
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #101 *****
Loss: 648.553373
Feature norm: 80.255288
Error norm: 20.236800
Active features: 1753
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #102 *****
Loss: 648.412378
Feature norm: 80.260785
Error norm: 10.800311
Active features: 1754
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #103 *****
Loss: 648.394463
Feature norm: 80.328697
Error norm: 24.535795
Active features: 1749
Line search trials: 1
Line search step: 1.000000
Seconds required for 

***** Iteration #175 *****
Loss: 643.994222
Feature norm: 81.627927
Error norm: 11.056089
Active features: 1681
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #176 *****
Loss: 643.943737
Feature norm: 81.623209
Error norm: 5.760284
Active features: 1680
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #177 *****
Loss: 643.942841
Feature norm: 81.638660
Error norm: 12.009177
Active features: 1677
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #178 *****
Loss: 643.886461
Feature norm: 81.634515
Error norm: 5.516443
Active features: 1676
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #179 *****
Loss: 643.861861
Feature norm: 81.643198
Error norm: 3.792795
Active features: 1679
Line search trials: 2
Line search step: 0.500000
Seconds required for t

***** Iteration #29 *****
Loss: 1015.850995
Feature norm: 74.107717
Error norm: 55.361757
Active features: 3132
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #30 *****
Loss: 994.700935
Feature norm: 75.695382
Error norm: 116.528491
Active features: 3095
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #31 *****
Loss: 971.351629
Feature norm: 76.865166
Error norm: 50.084073
Active features: 2998
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #32 *****
Loss: 953.809010
Feature norm: 78.485893
Error norm: 166.875293
Active features: 2777
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #33 *****
Loss: 939.021494
Feature norm: 79.020696
Error norm: 38.655084
Active features: 2769
Line search trials: 1
Line search step: 1.000000
Seconds required for 

***** Iteration #104 *****
Loss: 812.834174
Feature norm: 92.820952
Error norm: 21.244200
Active features: 1843
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #105 *****
Loss: 812.647773
Feature norm: 92.806859
Error norm: 11.834253
Active features: 1844
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #106 *****
Loss: 812.511106
Feature norm: 92.854302
Error norm: 15.752587
Active features: 1841
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #107 *****
Loss: 812.372277
Feature norm: 92.850216
Error norm: 16.567598
Active features: 1837
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #108 *****
Loss: 812.303599
Feature norm: 92.929707
Error norm: 26.427181
Active features: 1830
Line search trials: 1
Line search step: 1.000000
Seconds required fo

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5607
Seconds required: 0.047

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 12272.960905
Feature norm: 1.000000
Error norm: 8066.207804
Active features: 5555
Line search trials: 1
Line search step: 0.000031
Seconds required for this iteration: 0.020

***** Iteration #2 *****
Loss: 10628.659661
Feature norm: 1.335445
Error norm: 2909.567550
Active features: 5236
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #3 *****
Loss: 10211.845367
Feature norm: 1.458326
Error norm: 2710.326167
Active features: 5419
Line search trials: 1
Line search step: 1.000000
Seconds required for this i

***** Iteration #39 *****
Loss: 1094.154649
Feature norm: 89.780397
Error norm: 111.984077
Active features: 2579
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #40 *****
Loss: 1085.901175
Feature norm: 90.505828
Error norm: 69.119015
Active features: 2553
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #41 *****
Loss: 1077.549263
Feature norm: 90.926870
Error norm: 58.479594
Active features: 2541
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #42 *****
Loss: 1069.685588
Feature norm: 91.754384
Error norm: 122.198451
Active features: 2512
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #43 *****
Loss: 1064.124336
Feature norm: 91.908580
Error norm: 57.045765
Active features: 2407
Line search trials: 1
Line search step: 1.000000
Seconds required 

***** Iteration #115 *****
Loss: 961.462821
Feature norm: 100.414211
Error norm: 30.720972
Active features: 1895
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #116 *****
Loss: 961.142405
Feature norm: 100.439681
Error norm: 17.470402
Active features: 1894
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #117 *****
Loss: 960.995227
Feature norm: 100.574885
Error norm: 24.631992
Active features: 1889
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #118 *****
Loss: 960.775682
Feature norm: 100.592083
Error norm: 14.699145
Active features: 1891
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #119 *****
Loss: 960.644059
Feature norm: 100.681899
Error norm: 20.912418
Active features: 1887
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #164 *****
Loss: 955.946398
Feature norm: 102.124453
Error norm: 12.133027
Active features: 1835
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #165 *****
Loss: 955.940474
Feature norm: 102.148398
Error norm: 25.382512
Active features: 1831
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #166 *****
Loss: 955.760265
Feature norm: 102.129002
Error norm: 11.217329
Active features: 1826
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #167 *****
Loss: 955.740290
Feature norm: 102.150901
Error norm: 23.005147
Active features: 1824
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #168 *****
Loss: 955.589638
Feature norm: 102.134134
Error norm: 10.695518
Active features: 1824
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #18 *****
Loss: 4122.110641
Feature norm: 19.893036
Error norm: 1044.153553
Active features: 4464
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #19 *****
Loss: 3793.600631
Feature norm: 21.277031
Error norm: 335.004188
Active features: 4356
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #20 *****
Loss: 3581.575328
Feature norm: 23.648546
Error norm: 763.448792
Active features: 4239
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #21 *****
Loss: 3270.686057
Feature norm: 27.566243
Error norm: 392.809846
Active features: 4084
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #22 *****
Loss: 2925.965736
Feature norm: 33.705523
Error norm: 933.629109
Active features: 3923
Line search trials: 1
Line search step: 1.000000
Seconds requi

***** Iteration #86 *****
Loss: 1107.088528
Feature norm: 107.392145
Error norm: 20.891252
Active features: 2030
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #87 *****
Loss: 1106.137326
Feature norm: 107.310865
Error norm: 19.813749
Active features: 2017
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #88 *****
Loss: 1105.424833
Feature norm: 107.313311
Error norm: 22.231911
Active features: 2013
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #89 *****
Loss: 1104.675025
Feature norm: 107.231214
Error norm: 15.485934
Active features: 2001
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #90 *****
Loss: 1103.844018
Feature norm: 107.198277
Error norm: 26.719754
Active features: 2000
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #143 *****
Loss: 1088.242073
Feature norm: 107.854577
Error norm: 18.072828
Active features: 1898
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #144 *****
Loss: 1088.180486
Feature norm: 107.909352
Error norm: 32.423988
Active features: 1898
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #145 *****
Loss: 1087.952784
Feature norm: 107.907001
Error norm: 16.690523
Active features: 1893
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #146 *****
Loss: 1087.886049
Feature norm: 107.964369
Error norm: 30.799357
Active features: 1892
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #147 *****
Loss: 1087.690020
Feature norm: 107.959164
Error norm: 18.338382
Active features: 1889
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #12 *****
Loss: 6493.968101
Feature norm: 9.802756
Error norm: 1257.444079
Active features: 5532
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #13 *****
Loss: 6091.441999
Feature norm: 11.273048
Error norm: 898.158840
Active features: 5133
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #14 *****
Loss: 5689.451217
Feature norm: 13.710322
Error norm: 1405.072649
Active features: 5019
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #15 *****
Loss: 5322.188021
Feature norm: 15.444147
Error norm: 398.423058
Active features: 4868
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #16 *****
Loss: 4860.784821
Feature norm: 18.650108
Error norm: 688.068206
Active features: 4666
Line search trials: 1
Line search step: 1.000000
Seconds requi

***** Iteration #54 *****
Loss: 1290.225878
Feature norm: 106.609613
Error norm: 44.008309
Active features: 2328
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #55 *****
Loss: 1286.219789
Feature norm: 106.795765
Error norm: 55.711147
Active features: 2314
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #56 *****
Loss: 1283.262434
Feature norm: 107.097589
Error norm: 87.682489
Active features: 2300
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #57 *****
Loss: 1279.399907
Feature norm: 107.241758
Error norm: 78.292306
Active features: 2298
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #58 *****
Loss: 1276.286479
Feature norm: 107.390084
Error norm: 37.172708
Active features: 2299
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #101 *****
Loss: 1229.896685
Feature norm: 111.829335
Error norm: 37.121807
Active features: 2015
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #102 *****
Loss: 1229.537400
Feature norm: 112.006097
Error norm: 39.384008
Active features: 2020
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #103 *****
Loss: 1229.064152
Feature norm: 112.023068
Error norm: 31.352299
Active features: 2018
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #104 *****
Loss: 1228.746324
Feature norm: 112.172160
Error norm: 38.911286
Active features: 2017
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.054

***** Iteration #105 *****
Loss: 1228.293477
Feature norm: 112.211084
Error norm: 33.044326
Active features: 2014
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #141 *****
Loss: 1220.027674
Feature norm: 114.509829
Error norm: 24.164519
Active features: 1957
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #142 *****
Loss: 1219.845885
Feature norm: 114.597808
Error norm: 25.493941
Active features: 1957
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #143 *****
Loss: 1219.688698
Feature norm: 114.625337
Error norm: 23.590383
Active features: 1952
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #144 *****
Loss: 1219.522708
Feature norm: 114.708286
Error norm: 25.314282
Active features: 1955
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #145 *****
Loss: 1219.368062
Feature norm: 114.720227
Error norm: 22.809008
Active features: 1953
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #193 *****
Loss: 1213.100244
Feature norm: 115.121094
Error norm: 16.513120
Active features: 1908
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #194 *****
Loss: 1213.043555
Feature norm: 115.136598
Error norm: 23.958944
Active features: 1908
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #195 *****
Loss: 1212.928540
Feature norm: 115.117892
Error norm: 18.481505
Active features: 1905
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #196 *****
Loss: 1212.870551
Feature norm: 115.135187
Error norm: 24.928024
Active features: 1905
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #197 *****
Loss: 1212.739542
Feature norm: 115.116774
Error norm: 16.617444
Active features: 1903
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #30 *****
Loss: 1860.272351
Feature norm: 91.477444
Error norm: 93.040110
Active features: 3699
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #31 *****
Loss: 1819.946551
Feature norm: 92.841659
Error norm: 175.249375
Active features: 3406
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.036

***** Iteration #32 *****
Loss: 1767.687740
Feature norm: 95.136559
Error norm: 259.350389
Active features: 3253
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #33 *****
Loss: 1740.982417
Feature norm: 95.884408
Error norm: 414.730081
Active features: 3181
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #34 *****
Loss: 1704.137004
Feature norm: 97.205169
Error norm: 252.852562
Active features: 3151
Line search trials: 1
Line search step: 1.000000
Seconds require

***** Iteration #99 *****
Loss: 1355.192997
Feature norm: 115.909628
Error norm: 29.678277
Active features: 2054
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #100 *****
Loss: 1354.724449
Feature norm: 116.051249
Error norm: 25.691209
Active features: 2055
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #101 *****
Loss: 1354.359299
Feature norm: 116.153001
Error norm: 37.285767
Active features: 2051
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #102 *****
Loss: 1353.912065
Feature norm: 116.326460
Error norm: 33.509421
Active features: 2045
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #103 *****
Loss: 1353.455488
Feature norm: 116.377563
Error norm: 24.118260
Active features: 2042
Line search trials: 1
Line search step: 1.000000
Seconds re

***** Iteration #150 *****
Loss: 1342.106001
Feature norm: 118.699957
Error norm: 39.148709
Active features: 1996
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #151 *****
Loss: 1341.800192
Feature norm: 118.699052
Error norm: 18.981905
Active features: 1994
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #152 *****
Loss: 1341.769874
Feature norm: 118.758672
Error norm: 39.446311
Active features: 1996
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #153 *****
Loss: 1341.461689
Feature norm: 118.756192
Error norm: 18.912310
Active features: 1992
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #154 *****
Loss: 1341.447928
Feature norm: 118.807930
Error norm: 39.837822
Active features: 1987
Line search trials: 1
Line search step: 1.000000
Seconds r

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5607
Seconds required: 0.075

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 24686.391088
Feature norm: 1.000000
Error norm: 16102.408236
Active features: 5547
Line search trials: 1
Line search step: 0.000016
Seconds required for this iteration: 0.036

***** Iteration #2 *****
Loss: 21415.633281
Feature norm: 1.334106
Error norm: 5857.161527
Active features: 5246
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #3 *****
Loss: 20581.478393
Feature norm: 1.456625
Error norm: 5481.668539
Active features: 5407
Line search trials: 1
Line search step: 1.000000
Seconds required for this 

***** Iteration #52 *****
Loss: 1583.933445
Feature norm: 111.931441
Error norm: 74.166467
Active features: 2338
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #53 *****
Loss: 1577.862205
Feature norm: 112.296078
Error norm: 59.905949
Active features: 2328
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #54 *****
Loss: 1573.626994
Feature norm: 112.655348
Error norm: 56.491988
Active features: 2325
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #55 *****
Loss: 1569.207981
Feature norm: 112.936783
Error norm: 51.161744
Active features: 2335
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #56 *****
Loss: 1564.995180
Feature norm: 113.305975
Error norm: 98.069058
Active features: 2326
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #140 *****
Loss: 1464.517673
Feature norm: 123.056558
Error norm: 43.677656
Active features: 2011
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #141 *****
Loss: 1464.203999
Feature norm: 123.069257
Error norm: 26.885447
Active features: 2009
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #142 *****
Loss: 1464.051865
Feature norm: 123.160676
Error norm: 40.061627
Active features: 2007
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #143 *****
Loss: 1463.799091
Feature norm: 123.171547
Error norm: 29.879136
Active features: 2005
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #144 *****
Loss: 1463.650202
Feature norm: 123.262504
Error norm: 38.686222
Active features: 2003
Line search trials: 1
Line search step: 1.000000
Seconds r

0....1....2....3....4....5....6....7....8....9....10
Number of features: 5607
Seconds required: 0.084

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 27761.687108
Feature norm: 1.000000
Error norm: 18102.715062
Active features: 5537
Line search trials: 1
Line search step: 0.000014
Seconds required for this iteration: 0.048

***** Iteration #2 *****
Loss: 24084.590931
Feature norm: 1.334099
Error norm: 6591.908725
Active features: 5250
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #3 *****
Loss: 23145.414835
Feature norm: 1.456722
Error norm: 6170.770263
Active features: 5406
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #4 *****
Loss: 20030.703287
Feature norm: 1.810966
Error norm: 4770.718453
Active 

***** Iteration #51 *****
Loss: 1735.946597
Feature norm: 115.422853
Error norm: 104.666141
Active features: 2458
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #52 *****
Loss: 1728.126159
Feature norm: 116.108056
Error norm: 151.805145
Active features: 2448
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #53 *****
Loss: 1719.329907
Feature norm: 116.382159
Error norm: 140.951572
Active features: 2443
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #54 *****
Loss: 1711.322316
Feature norm: 116.805759
Error norm: 61.088929
Active features: 2435
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #55 *****
Loss: 1703.469643
Feature norm: 117.002389
Error norm: 114.529042
Active features: 2423
Line search trials: 1
Line search step: 1.000000
Seconds re

***** Iteration #136 *****
Loss: 1583.306442
Feature norm: 127.089255
Error norm: 42.630728
Active features: 2041
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #137 *****
Loss: 1582.880941
Feature norm: 127.104672
Error norm: 21.532114
Active features: 2040
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.035

***** Iteration #138 *****
Loss: 1582.794763
Feature norm: 127.175077
Error norm: 42.039815
Active features: 2036
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #139 *****
Loss: 1582.380204
Feature norm: 127.199374
Error norm: 27.432367
Active features: 2037
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #140 *****
Loss: 1582.341856
Feature norm: 127.288709
Error norm: 50.957675
Active features: 2035
Line search trials: 1
Line search step: 1.000000
Seconds r

0....1....2....3....4....5....6....7....8....9....10
Number of features: 5607
Seconds required: 0.085

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 30881.087112
Feature norm: 1.000000
Error norm: 20135.751497
Active features: 5504
Line search trials: 1
Line search step: 0.000013
Seconds required for this iteration: 0.051

***** Iteration #2 *****
Loss: 26791.639508
Feature norm: 1.334027
Error norm: 7332.367457
Active features: 5276
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #3 *****
Loss: 25746.440084
Feature norm: 1.456631
Error norm: 6863.796690
Active features: 5406
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #4 *****
Loss: 22277.498757
Feature norm: 1.811149
Error norm: 5301.347651
Active 

***** Iteration #65 *****
Loss: 1782.500695
Feature norm: 124.592630
Error norm: 54.511785
Active features: 2319
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #66 *****
Loss: 1778.713626
Feature norm: 124.927127
Error norm: 73.900612
Active features: 2316
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #67 *****
Loss: 1775.231191
Feature norm: 125.085328
Error norm: 78.822377
Active features: 2307
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #68 *****
Loss: 1771.798250
Feature norm: 125.406451
Error norm: 57.607050
Active features: 2298
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #69 *****
Loss: 1768.968409
Feature norm: 125.502819
Error norm: 41.921502
Active features: 2290
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #113 *****
Loss: 1708.125355
Feature norm: 127.354148
Error norm: 55.448918
Active features: 2106
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.032

***** Iteration #114 *****
Loss: 1707.552280
Feature norm: 127.378580
Error norm: 39.263655
Active features: 2103
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #115 *****
Loss: 1707.333969
Feature norm: 127.508979
Error norm: 67.172561
Active features: 2101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #116 *****
Loss: 1706.533371
Feature norm: 127.540722
Error norm: 35.997036
Active features: 2097
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #117 *****
Loss: 1706.297719
Feature norm: 127.651157
Error norm: 61.625149
Active features: 2097
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #182 *****
Loss: 1688.512211
Feature norm: 130.459788
Error norm: 18.275596
Active features: 2033
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #183 *****
Loss: 1688.475194
Feature norm: 130.474004
Error norm: 38.844611
Active features: 2034
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #184 *****
Loss: 1688.167489
Feature norm: 130.467016
Error norm: 15.685160
Active features: 2036
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #185 *****
Loss: 1688.138106
Feature norm: 130.486741
Error norm: 37.448524
Active features: 2032
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #186 *****
Loss: 1687.844268
Feature norm: 130.481258
Error norm: 18.729628
Active features: 2036
Line search trials: 1
Line search step: 1.000000
Seconds r

## Model Evaluation

In [131]:
from sklearn.metrics import classification_report

labels = {"B": 0, "I": 1,"O":2}
target_names = ['B', 'I','O']

for i in range(len(train_split)):
    print(i)
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model_'+str(i))
    X_test = [X[j] for j in test_split[i]]
    y_test = [y[j] for j in test_split[i]]
    y_pred = [tagger.tag(xseq) for xseq in X_test]
#     for iterr, X_test_item in enumerate(X_test):
#         for idx,word in enumerate(X_test_item):
#             print(word[1]+" - "+y_test[iterr][idx]+" - "+y_pred[iterr][idx])
    truth = np.array([labels[bio] for sentence in y_test for bio in sentence])
    prediction = np.array([labels[bio] for sentence in y_pred for bio in sentence])
    print(classification_report(truth, prediction, target_names=target_names))

0
              precision    recall  f1-score   support

           B       0.77      0.49      0.60        82
           I       0.55      0.12      0.20        48
           O       0.91      0.99      0.95       832

   micro avg       0.90      0.90      0.90       962
   macro avg       0.74      0.53      0.58       962
weighted avg       0.88      0.90      0.88       962

1
              precision    recall  f1-score   support

           B       0.89      0.89      0.89        45
           I       0.95      1.00      0.98        21
           O       0.99      0.99      0.99       722

   micro avg       0.99      0.99      0.99       788
   macro avg       0.95      0.96      0.95       788
weighted avg       0.99      0.99      0.99       788

2
              precision    recall  f1-score   support

           B       0.98      0.93      0.96        59
           I       1.00      1.00      1.00         9
           O       0.99      1.00      1.00       751

   micro avg  

In [129]:
i=1
tagger = pycrfsuite.Tagger()
tagger.open('crf.model_'+str(i))
X_test = [X[j] for j in test_split[i]]
y_test = [y[j] for j in test_split[i]]
y_pred = [tagger.tag(xseq) for xseq in X_test]
#     for iterr, X_test_item in enumerate(X_test):
#         for idx,word in enumerate(X_test_item):
#             print(word[1]+" - "+y_test[iterr][idx]+" - "+y_pred[iterr][idx])
truth = np.array([labels[bio] for sentence in y_test for bio in sentence])
prediction = np.array([labels[bio] for sentence in y_pred for bio in sentence])
print(prediction)
print(classification_report(truth, prediction, target_names=target_names))

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 0 1 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 0 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 0
 1 2 2 2 2 2 2 2 0 1 2 2 2 2 2 2 2 2 2 2 0 1 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2
 0 1 2 2 2 2 2 2 2 2 2 2 

In [126]:
import pycrfsuite

zzz =2

trainer = pycrfsuite.Trainer(verbose=True)
X_train = [X[j] for j in train_split[zzz]]
y_train = [y[j] for j in train_split[zzz]]
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 0.1, #L1 penalty
    'c2': 0.01, #L2 penalty
    'max_iterations': 200,
    'feature.possible_transitions': False
})
trainer.train('crf.model_coba')

from sklearn.metrics import classification_report

labels = {"B": 0, "I": 1,"O":2}
target_names = ['B', 'I','O']

tagger = pycrfsuite.Tagger()
tagger.open('crf.model_coba')
X_test = [X[j] for j in test_split[zzz]]
y_test = [y[j] for j in test_split[zzz]]
y_pred = [tagger.tag(xseq) for xseq in X_test]
for iterr, X_test_item in enumerate(X_test):
    for idx,word in enumerate(X_test_item):
        print(word[1]+" - "+y_test[iterr][idx]+" - "+y_pred[iterr][idx])
truth = np.array([labels[bio] for sentence in y_test for bio in sentence])
prediction = np.array([labels[bio] for sentence in y_pred for bio in sentence])
print(prediction)
print(classification_report(truth, prediction, target_names=target_names))

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5300
Seconds required: 0.018

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 3109.226570
Feature norm: 1.000000
Error norm: 2000.439133
Active features: 5194
Line search trials: 1
Line search step: 0.000125
Seconds required for this iteration: 0.005

***** Iteration #2 *****
Loss: 2706.487574
Feature norm: 1.330529
Error norm: 718.390491
Active features: 4953
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #3 *****
Loss: 2605.169355
Feature norm: 1.450302
Error norm: 672.463576
Active features: 5128
Line search trials: 1
Line search step: 1.000000
Seconds required for this iterat

***** Iteration #61 *****
Loss: 442.292369
Feature norm: 62.154169
Error norm: 8.188260
Active features: 1671
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #62 *****
Loss: 441.999424
Feature norm: 62.207495
Error norm: 5.891971
Active features: 1666
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #63 *****
Loss: 441.758990
Feature norm: 62.335156
Error norm: 22.999366
Active features: 1655
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #64 *****
Loss: 441.435182
Feature norm: 62.379931
Error norm: 15.442752
Active features: 1653
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #65 *****
Loss: 441.146085
Feature norm: 62.462147
Error norm: 12.303275
Active features: 1640
Line search trials: 1
Line search step: 1.000000
Seconds required for this 

***** Iteration #145 *****
Loss: 433.923725
Feature norm: 65.038754
Error norm: 11.599513
Active features: 1458
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #146 *****
Loss: 433.846789
Feature norm: 65.023137
Error norm: 6.795264
Active features: 1457
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #147 *****
Loss: 433.822736
Feature norm: 65.042432
Error norm: 9.452244
Active features: 1462
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #148 *****
Loss: 433.766875
Feature norm: 65.030712
Error norm: 5.841939
Active features: 1462
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #149 *****
Loss: 433.730385
Feature norm: 65.045274
Error norm: 7.595557
Active features: 1464
Line search trials: 1
Line search step: 1.000000
Seconds required for th

word.lower=came - O - O
word.lower=here - O - O
word.lower=especially - O - O
word.lower=to - O - O
word.lower=enjoy - O - O
word.lower=their - O - O
word.lower=freaky - O - O
word.lower=shake - B - O
word.lower=. - O - O
word.lower=i - O - O
word.lower=came - O - O
word.lower=during - O - O
word.lower=brunch - O - O
word.lower=time - O - O
word.lower=on - O - O
word.lower=saturday - O - O
word.lower=so - O - O
word.lower=they - B - O
word.lower=are - O - O
word.lower=pretty - O - O
word.lower=busy - O - O
word.lower=and - O - O
word.lower=i - O - O
word.lower=had - O - O
word.lower=to - O - O
word.lower=put - O - O
word.lower=my - O - O
word.lower=name - O - O
word.lower=in - O - O
word.lower=the - O - O
word.lower=waiting - O - O
word.lower=list - O - O
word.lower=. - O - O
word.lower=i - O - O
word.lower=ordered - O - O
word.lower=their - O - O
word.lower=original - O - B
word.lower=cronut - O - I
word.lower=shake - O - O
word.lower=. - O - O
word.lower=the - O - B
word.lower=cronut