In [1]:
#imports
from datasets import load_dataset
from thai2transformers.metrics import classification_metrics
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [2]:
#parameters
class Args:
    """Notebook-wide settings, read by the cells below as attributes of `args`."""

    # --- data source ---
    # dataset id handed to `load_dataset`
    dataset_name_or_path = 'thainer'
    # local corpus directory; only forwarded to `load_dataset` when the dataset is 'lst20'
    data_dir = '~/Downloads/LST20_Corpus'

    # --- columns ---
    # column holding the token sequence of each example
    feature_col = 'tokens'
    # column holding the per-token tag ids used as the target
    label_col = 'pos_tags'

    # --- experiment ---
    # seed used for the dataset splits and the lst20 subsample
    seed = 2020
    # NOTE(review): not referenced in the visible cells — presumably used for model selection later
    metric_for_best_model = 'f1_macro'


args = Args()

In [3]:
# Only the 'lst20' dataset is loaded with an explicit `data_dir`;
# every other dataset id is loaded by name alone.
dataset = (
    load_dataset(args.dataset_name_or_path, data_dir=args.data_dir)
    if args.dataset_name_or_path == 'lst20'
    else load_dataset(args.dataset_name_or_path)
)
dataset

Reusing dataset thainer (/Users/admin/.cache/huggingface/datasets/thainer/thainer/1.3.0/e0a86672e5ad057c1093708597cdda3671a76e9b053d210a32205406726cca92)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 6348
    })
})

In [4]:
# 'thainer' is loaded with a single 'train' split (see the DatasetDict printed above),
# so validation and test sets are carved out of it here. Other datasets are used as loaded.
if args.dataset_name_or_path == 'thainer' and args.label_col in ('ner_tags', 'pos_tags'):
    if args.label_col == 'ner_tags':
        # tag ids 13 and 26 are folded into id 27 before splitting
        dataset = dataset.map(
            lambda examples: {'ner_tags': [27 if i in [13,26] else i for i in examples[args.label_col]]}
        )

    # 80% train, then the held-out 20% is halved into validation and test (same seed for both splits)
    train_valtest_split = dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=args.seed)
    val_test_split = train_valtest_split['test'].train_test_split(test_size=0.5, shuffle=True, seed=args.seed)
    dataset['train'] = train_valtest_split['train']
    dataset['validation'] = val_test_split['train']
    dataset['test'] = val_test_split['test']

    tag_labels = dataset['train'].features[args.label_col].feature.names
    if args.label_col == 'ner_tags':
        # drop the names of the two ids that were remapped above
        tag_labels = [name for i, name in enumerate(tag_labels) if i not in [13,26]]
else:
    tag_labels = dataset['train'].features[args.label_col].feature.names
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 5078
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 635
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 635
    })
})

In [6]:
# For 'thainer', drop test examples whose mBERT sub-word length reaches `max_length`.
# NOTE(review): presumably this keeps the test set identical to the one a 512-position
# transformer can score (510 + 2 special tokens) — confirm.
if args.dataset_name_or_path == 'thainer':
    from transformers import AutoTokenizer
    mbert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

    def pre_tokenize(token, space_token):
        """Return `token` with every literal space replaced by `space_token`."""
        token = token.replace(' ', space_token)
        return token

    def is_not_too_long(example,
                        max_length=510):
        """Return True when the example tokenizes to fewer than `max_length` sub-words.

        Each entry of `example[args.feature_col]` is pre-tokenized (spaces -> '<_>')
        and tokenized separately with `mbert_tokenizer`; the sub-word counts are summed.
        """
        # Only the count is needed, so add up lengths instead of concatenating
        # the per-token lists with sum(..., []), which is quadratic.
        n_subwords = sum(
            len(mbert_tokenizer.tokenize(pre_tokenize(token, space_token='<_>')))
            for token in example[args.feature_col]
        )
        return n_subwords < max_length

    dataset['test'] = dataset['test'].filter(is_not_too_long)
dataset['test']

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Dataset({
    features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
    num_rows: 621
})

In [7]:
%%time
#get sentence forms
def generate_sents(dataset, idx, feature_col=None, label_col=None):
    """Return example `idx` of `dataset` as a list of (feature, label) pairs.

    Args:
        dataset: indexable collection whose items map column names to sequences.
        idx: position of the example to convert.
        feature_col: column holding the features; defaults to `args.feature_col`.
        label_col: column holding the labels; defaults to `args.label_col`.

    Returns:
        One `(feature, label)` tuple per feature, in order.

    Raises:
        IndexError: if the label sequence is shorter than the feature sequence.
    """
    if feature_col is None:
        feature_col = args.feature_col
    if label_col is None:
        label_col = args.label_col
    # Fetch the row once: indexing the dataset twice repeats the lookup for every sentence.
    row = dataset[idx]
    features = row[feature_col]
    labels = row[label_col]
    return [(feature, labels[pos]) for pos, feature in enumerate(features)]

# Convert every example of each split into its list-of-(token, label) form.
train_sents, valid_sents, test_sents = (
    [generate_sents(dataset[split], row) for row in range(len(dataset[split]))]
    for split in ('train', 'validation', 'test')
)
len(train_sents), len(valid_sents), len(test_sents)

CPU times: user 1.21 s, sys: 33.3 ms, total: 1.24 s
Wall time: 1.26 s


(5078, 635, 621)

In [8]:
#generate x,y
def extract_features(doc, window=3, max_n_gram=3):
    """Build string features for every word of `doc` from its surrounding n-grams.

    The document is padded with `window` copies of 'xxpad' on both sides. For each
    original word the feature list starts with 'bias' and then contains one
    'word_{n}_{start}_{end}={w1|w2|...}' entry per n-gram inside the window, where
    start/end are offsets relative to the current word.

    Args:
        doc: list of word strings.
        window: number of neighbours considered on each side.
        max_n_gram: largest n-gram size (additionally capped at 2*window + 1).

    Returns:
        A list with one feature list per word of `doc`.
    """
    padding = ['xxpad'] * window
    padded = padding + doc + padding
    size_limit = min(max_n_gram + 1, 2 + window * 2)  # exclusive upper bound on n

    doc_features = []
    for center in range(window, len(padded) - window):
        word_features = ['bias']
        for size in range(1, size_limit):
            # slide an n-gram of this size across the window around `center`
            for offset in range(-window, window + 2 - size):
                first = center + offset
                gram = '|'.join(padded[first:first + size])
                word_features.append(f'word_{size}_{offset}_{offset + size}={gram}')
        doc_features.append(word_features)
    return doc_features

def generate_xy(all_tuples, window=2, max_n_gram=2):
    """Turn sentences of (word, label) pairs into CRF features and string targets.

    Args:
        all_tuples: iterable of sentences, each a sequence of `(word, label)` pairs.
        window: context window forwarded to `extract_features` (default 2, as before).
        max_n_gram: largest n-gram size forwarded to `extract_features` (default 2, as before).

    Returns:
        `(x, y)`: `x` holds one feature list per word per sentence, `y` holds the
        labels converted with `str`, in the same order.
    """
    # Single pass over `all_tuples`, so a one-shot iterable is handled correctly.
    x_pre, y = [], []
    for sent in all_tuples:
        words, targets = [], []
        for (w, l) in sent:
            words.append(w)
            targets.append(str(l))
        x_pre.append(words)
        y.append(targets)
    #features
    x = [extract_features(x_, window=window, max_n_gram=max_n_gram) for x_ in tqdm(x_pre)]
    return x, y


x_train, y_train = generate_xy(train_sents)
if args.dataset_name_or_path=='lst20':
    import random
    # Train the grid search on a seeded subsample of at most 10000 sentences.
    # One index draw is applied to both lists, so features and labels stay aligned
    # by construction instead of relying on two identically seeded draws.
    random.seed(args.seed)
    sample_idx = random.sample(range(len(x_train)), min(10000, len(x_train)))
    x_train_small = [x_train[i] for i in sample_idx]
    y_train_small = [y_train[i] for i in sample_idx]
else:
    x_train_small = x_train
    y_train_small = y_train
x_valid, y_valid = generate_xy(valid_sents)
x_test, y_test = generate_xy(test_sents)

HBox(children=(FloatProgress(value=0.0, max=5078.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=635.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=621.0), HTML(value='')))




In [9]:
import pycrfsuite
from sklearn.metrics import classification_report

def train_crf(model_name, c1, c2, x_train, y_train, max_iterations=500):
    """Train a CRF with pycrfsuite and write it to '{model_name}_{c1}_{c2}.model'.

    Args:
        model_name: prefix of the model file name.
        c1: value for the trainer's 'c1' parameter (L1 regularization coefficient in CRFsuite).
        c2: value for the trainer's 'c2' parameter (L2 regularization coefficient in CRFsuite).
        x_train: per-sentence feature sequences.
        y_train: per-sentence label sequences, paired with `x_train` by position.
        max_iterations: cap on optimizer iterations.

    Returns:
        The path of the model file that was written.
    """
    trainer = pycrfsuite.Trainer(verbose=True)

    # zip() has no length, so pass the pair count to tqdm for a determinate bar.
    try:
        n_sequences = min(len(x_train), len(y_train))
    except TypeError:
        n_sequences = None  # unsized iterables: fall back to an indeterminate bar
    for xseq, yseq in tqdm(zip(x_train, y_train), total=n_sequences):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': c1,
        'c2': c2,
        'max_iterations': max_iterations,
        'feature.possible_transitions': True,
        'feature.minfreq': 3.0,
    })

    model_path = f'{model_name}_{c1}_{c2}.model'
    trainer.train(model_path)
    return model_path
    
def evaluate_crf(model_path, features, labels, tag_labels):
    """Tag `features` with a saved CRF model and score the predictions against `labels`.

    Args:
        model_path: path of a model file written by pycrfsuite.
        features: per-sentence feature sequences.
        labels: per-sentence gold tags; each tag must be convertible with `int`.
        tag_labels: display names passed to `classification_report` as `target_names`.

    Returns:
        `(report, f1_micro, f1_macro)`: the text classification report (4 digits)
        and the micro- and macro-averaged F1 over all tokens.
    """
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    try:
        y_pred = [tagger.tag(xseq) for xseq in tqdm(features, total=len(features))]
    finally:
        # release the model handle; this function is called once per grid point
        tagger.close()
    # flatten sentences into token-level integer tag ids
    preds = [int(tag) for row in y_pred for tag in row]
    labs = [int(tag) for row in labels for tag in row]
    # NOTE(review): no `labels=` is passed, so the report's rows come from the tag ids
    # found in labs/preds; `tag_labels` must line up with that set — confirm every tag occurs.
    report = classification_report(labs, preds, target_names=tag_labels, digits=4)
    f1_micro = f1_score(labs, preds, average='micro')
    f1_macro = f1_score(labs, preds, average='macro')
    return report, f1_micro, f1_macro

In [129]:
# Grid search over the c1/c2 trainer parameters: train one model per pair on the
# (possibly subsampled) training set and score it on the validation set.
hyperparams = []
for c1 in tqdm([0.,0.5,1.]):
    for c2 in tqdm([0.,0.5,1.]):
        # file name that train_crf writes for this (c1, c2) pair
        model_path = f'{args.dataset_name_or_path}_{c1}_{c2}.model'
        train_crf(args.dataset_name_or_path,c1,c2,x_train_small,y_train_small)
        report, f1_micro, f1_macro = evaluate_crf(model_path, x_valid, y_valid, tag_labels)
        print(report)
        d = dict(c1=c1, c2=c2, f1_micro=f1_micro, f1_macro=f1_macro, report=report)
        hyperparams.append(d)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 6.156

L-BFGS optimization
c1: 0.000000
c2: 0.000000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954751.579745
Feature norm: 1.000000
Error norm: 133359.655261
Active features: 148550
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.116

***** Iteration #2 *****
Loss: 818810.936978
Feature norm: 6.335856
Error norm: 185094.626285
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.709

***** Iteration #3 *****
Loss: 707805.971600
Feature norm: 6.235234
Error norm: 100584.678693
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #40 *****
Loss: 110339.957968
Feature norm: 140.095932
Error norm: 9048.291841
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.683

***** Iteration #41 *****
Loss: 107245.914148
Feature norm: 140.897737
Error norm: 3964.862757
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.664

***** Iteration #42 *****
Loss: 105267.391110
Feature norm: 139.635622
Error norm: 5094.316228
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.762

***** Iteration #43 *****
Loss: 103026.548544
Feature norm: 139.764259
Error norm: 4397.294429
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.816

***** Iteration #44 *****
Loss: 98753.318737
Feature norm: 143.202312
Error norm: 3470.789917
Active features: 148711
Line search trials: 1
Line search 

***** Iteration #79 *****
Loss: 52547.019556
Feature norm: 264.195633
Error norm: 4526.487048
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.587

***** Iteration #80 *****
Loss: 51985.204789
Feature norm: 263.864932
Error norm: 1699.600923
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.638

***** Iteration #81 *****
Loss: 51670.880523
Feature norm: 264.135788
Error norm: 979.774213
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.563

***** Iteration #82 *****
Loss: 51235.865903
Feature norm: 265.414936
Error norm: 1491.865646
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.600

***** Iteration #83 *****
Loss: 50583.252569
Feature norm: 267.868815
Error norm: 1832.598801
Active features: 148711
Line search trials: 1
Line search step:

***** Iteration #122 *****
Loss: 33388.232815
Feature norm: 358.544984
Error norm: 624.453284
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.596

***** Iteration #123 *****
Loss: 32942.520949
Feature norm: 361.931911
Error norm: 1057.679551
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #124 *****
Loss: 32789.965845
Feature norm: 363.177128
Error norm: 1131.155124
Active features: 148711
Line search trials: 2
Line search step: 0.319703
Seconds required for this iteration: 1.115

***** Iteration #125 *****
Loss: 32603.930356
Feature norm: 363.651104
Error norm: 853.775621
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.546

***** Iteration #126 *****
Loss: 32250.845073
Feature norm: 364.883783
Error norm: 614.775765
Active features: 148711
Line search trials: 1
Line search st

***** Iteration #162 *****
Loss: 25445.388002
Feature norm: 409.800830
Error norm: 405.105023
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.549

***** Iteration #163 *****
Loss: 25260.846724
Feature norm: 411.393084
Error norm: 598.034954
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.550

***** Iteration #164 *****
Loss: 25189.030945
Feature norm: 412.334357
Error norm: 560.551088
Active features: 148711
Line search trials: 2
Line search step: 0.431692
Seconds required for this iteration: 1.093

***** Iteration #165 *****
Loss: 25085.340215
Feature norm: 413.176424
Error norm: 393.891723
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.541

***** Iteration #166 *****
Loss: 24977.391926
Feature norm: 414.134056
Error norm: 400.909368
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #202 *****
Loss: 20237.307050
Feature norm: 478.494766
Error norm: 554.579082
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.583

***** Iteration #203 *****
Loss: 20039.778779
Feature norm: 482.131380
Error norm: 607.982451
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #204 *****
Loss: 19906.821947
Feature norm: 490.801962
Error norm: 755.791220
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.586

***** Iteration #205 *****
Loss: 19758.616851
Feature norm: 489.788585
Error norm: 451.823466
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.558

***** Iteration #206 *****
Loss: 19676.093779
Feature norm: 489.572770
Error norm: 356.139159
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #241 *****
Loss: 16573.959501
Feature norm: 559.409047
Error norm: 263.570180
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.589

***** Iteration #242 *****
Loss: 16501.499062
Feature norm: 560.255210
Error norm: 362.672527
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.595

***** Iteration #243 *****
Loss: 16402.505364
Feature norm: 562.632957
Error norm: 424.207980
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.580

***** Iteration #244 *****
Loss: 16375.575237
Feature norm: 568.281164
Error norm: 911.629070
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.588

***** Iteration #245 *****
Loss: 16268.091470
Feature norm: 567.441605
Error norm: 254.718730
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #280 *****
Loss: 13893.144271
Feature norm: 663.184284
Error norm: 807.348847
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.517

***** Iteration #281 *****
Loss: 13819.743314
Feature norm: 663.526677
Error norm: 216.991336
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.540

***** Iteration #282 *****
Loss: 13796.566480
Feature norm: 664.712874
Error norm: 187.034116
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.566

***** Iteration #283 *****
Loss: 13741.990778
Feature norm: 668.385137
Error norm: 279.584392
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.560

***** Iteration #284 *****
Loss: 13666.655475
Feature norm: 673.561910
Error norm: 310.445678
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #319 *****
Loss: 11464.775965
Feature norm: 883.249714
Error norm: 307.573589
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.556

***** Iteration #320 *****
Loss: 11446.149663
Feature norm: 901.433434
Error norm: 558.319847
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.559

***** Iteration #321 *****
Loss: 11355.010460
Feature norm: 898.077320
Error norm: 181.968688
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.561

***** Iteration #322 *****
Loss: 11321.054419
Feature norm: 898.354394
Error norm: 162.219796
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.557

***** Iteration #323 *****
Loss: 11269.658508
Feature norm: 902.275347
Error norm: 209.816654
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #359 *****
Loss: 9527.049194
Feature norm: 1137.570267
Error norm: 156.907041
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.550

***** Iteration #360 *****
Loss: 9498.243951
Feature norm: 1139.914140
Error norm: 188.866411
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.544

***** Iteration #361 *****
Loss: 9468.824568
Feature norm: 1141.794407
Error norm: 287.323719
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.549

***** Iteration #362 *****
Loss: 9433.781127
Feature norm: 1143.876546
Error norm: 182.800059
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.550

***** Iteration #363 *****
Loss: 9382.035827
Feature norm: 1150.651633
Error norm: 149.002718
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #398 *****
Loss: 8185.313744
Feature norm: 1351.310874
Error norm: 246.539201
Active features: 148711
Line search trials: 2
Line search step: 0.272444
Seconds required for this iteration: 1.195

***** Iteration #399 *****
Loss: 8152.974236
Feature norm: 1359.029998
Error norm: 110.830182
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.587

***** Iteration #400 *****
Loss: 8137.613605
Feature norm: 1362.520936
Error norm: 95.001370
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.567

***** Iteration #401 *****
Loss: 8113.645202
Feature norm: 1367.871650
Error norm: 161.010991
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.557

***** Iteration #402 *****
Loss: 8089.211570
Feature norm: 1370.789650
Error norm: 111.966564
Active features: 148711
Line search trials: 1
Line search step:

***** Iteration #438 *****
Loss: 7336.828009
Feature norm: 1526.901596
Error norm: 152.415640
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.573

***** Iteration #439 *****
Loss: 7322.495585
Feature norm: 1529.526079
Error norm: 72.134989
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.597

***** Iteration #440 *****
Loss: 7310.783512
Feature norm: 1534.210079
Error norm: 101.349509
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.608

***** Iteration #441 *****
Loss: 7297.424978
Feature norm: 1539.022199
Error norm: 97.803033
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.668

***** Iteration #442 *****
Loss: 7279.657352
Feature norm: 1551.897855
Error norm: 163.827739
Active features: 148711
Line search trials: 1
Line search step: 

***** Iteration #479 *****
Loss: 6761.785202
Feature norm: 1648.391950
Error norm: 59.300725
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.589

***** Iteration #480 *****
Loss: 6752.134907
Feature norm: 1649.687148
Error norm: 96.924827
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.586

***** Iteration #481 *****
Loss: 6745.622897
Feature norm: 1652.027501
Error norm: 149.504517
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.590

***** Iteration #482 *****
Loss: 6735.915001
Feature norm: 1651.783900
Error norm: 62.838911
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.598

***** Iteration #483 *****
Loss: 6730.239737
Feature norm: 1651.731644
Error norm: 60.648771
Active features: 148711
Line search trials: 1
Line search step: 1.

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9506    0.9479    0.9492     70861
          VV     0.9172    0.9359    0.9265     47521
          PU     0.9994    0.9988    0.9991     45313
          CC     0.9229    0.9314    0.9271     18822
          PS     0.9123    0.9054    0.9089     13646
          AX     0.8781    0.8975    0.8877      7750
          AV     0.8197    0.7187    0.7659      7735
          FX     0.9895    0.9905    0.9900      8559
          NU     0.9260    0.9078    0.9168      7346
          AJ     0.7783    0.7307    0.7538      4979
          CL     0.7684    0.7888    0.7785      3940
          PR     0.7474    0.8369    0.7896      2238
          NG     0.9978    0.9978    0.9978      1795
          PA     0.7479    0.8656    0.8024       305
          XX     0.0357    0.0222    0.0274        45
          IJ     0.2500    0.2000    0.2222         5

    accuracy                         0.9347    240860
   macro avg     0.7901  

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 8.563

L-BFGS optimization
c1: 0.000000
c2: 0.500000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954752.079745
Feature norm: 1.000000
Error norm: 133358.660504
Active features: 148550
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.596

***** Iteration #2 *****
Loss: 818816.296870
Feature norm: 6.335600
Error norm: 185089.902463
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.594

***** Iteration #3 *****
Loss: 707816.761699
Feature norm: 6.234809
Error norm: 100576.636624
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #39 *****
Loss: 120198.737782
Feature norm: 125.295504
Error norm: 4632.536755
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.701

***** Iteration #40 *****
Loss: 117681.516408
Feature norm: 126.422354
Error norm: 3846.030335
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.692

***** Iteration #41 *****
Loss: 115692.623366
Feature norm: 129.092241
Error norm: 9683.867019
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.731

***** Iteration #42 *****
Loss: 113437.269148
Feature norm: 129.222613
Error norm: 5487.063303
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.672

***** Iteration #43 *****
Loss: 111593.135265
Feature norm: 129.409069
Error norm: 2906.681702
Active features: 148711
Line search trials: 1
Line search

***** Iteration #82 *****
Loss: 71173.775239
Feature norm: 177.197937
Error norm: 1361.786007
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.587

***** Iteration #83 *****
Loss: 70794.901570
Feature norm: 177.431195
Error norm: 994.074695
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.595

***** Iteration #84 *****
Loss: 70424.732874
Feature norm: 177.699154
Error norm: 1378.800559
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.576

***** Iteration #85 *****
Loss: 70008.345267
Feature norm: 178.811302
Error norm: 2460.711316
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.614

***** Iteration #86 *****
Loss: 69511.683947
Feature norm: 179.010514
Error norm: 1146.638802
Active features: 148711
Line search trials: 1
Line search step:

***** Iteration #123 *****
Loss: 61951.953917
Feature norm: 200.609014
Error norm: 912.564133
Active features: 148711
Line search trials: 2
Line search step: 0.402284
Seconds required for this iteration: 1.177

***** Iteration #124 *****
Loss: 61903.011555
Feature norm: 200.548077
Error norm: 748.457306
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.595

***** Iteration #125 *****
Loss: 61731.152046
Feature norm: 200.868465
Error norm: 463.357560
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.596

***** Iteration #126 *****
Loss: 61626.622919
Feature norm: 201.119842
Error norm: 555.068992
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.586

***** Iteration #127 *****
Loss: 61430.371329
Feature norm: 202.172338
Error norm: 991.481765
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #163 *****
Loss: 59344.866263
Feature norm: 204.871606
Error norm: 454.122591
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.581

***** Iteration #164 *****
Loss: 59320.948580
Feature norm: 204.849292
Error norm: 349.963171
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.563

***** Iteration #165 *****
Loss: 59285.971428
Feature norm: 204.766183
Error norm: 344.728485
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.575

***** Iteration #166 *****
Loss: 59270.524561
Feature norm: 204.459867
Error norm: 1096.117809
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.632

***** Iteration #167 *****
Loss: 59225.029814
Feature norm: 204.469941
Error norm: 350.393057
Active features: 148711
Line search trials: 1
Line search ste

***** Iteration #202 *****
Loss: 58579.404737
Feature norm: 200.785505
Error norm: 376.321674
Active features: 148711
Line search trials: 2
Line search step: 0.248926
Seconds required for this iteration: 1.180

***** Iteration #203 *****
Loss: 58570.745198
Feature norm: 200.752810
Error norm: 240.018841
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.570

***** Iteration #204 *****
Loss: 58561.317851
Feature norm: 200.713025
Error norm: 150.906996
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.577

***** Iteration #205 *****
Loss: 58551.922003
Feature norm: 200.654084
Error norm: 181.693270
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.555

***** Iteration #206 *****
Loss: 58543.736165
Feature norm: 200.602443
Error norm: 181.142824
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #247 *****
Loss: 58392.248275
Feature norm: 200.026431
Error norm: 81.286587
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.572

***** Iteration #248 *****
Loss: 58391.748739
Feature norm: 200.027940
Error norm: 58.085693
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.575

***** Iteration #249 *****
Loss: 58390.960799
Feature norm: 200.030656
Error norm: 71.386156
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.571

***** Iteration #250 *****
Loss: 58389.603416
Feature norm: 200.035299
Error norm: 78.688358
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.587

***** Iteration #251 *****
Loss: 58388.656803
Feature norm: 200.046033
Error norm: 185.337099
Active features: 148711
Line search trials: 1
Line search step: 1.

***** Iteration #286 *****
Loss: 58353.202832
Feature norm: 200.268170
Error norm: 46.246240
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.695

***** Iteration #287 *****
Loss: 58352.536986
Feature norm: 200.268567
Error norm: 47.637505
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.657

***** Iteration #288 *****
Loss: 58352.290981
Feature norm: 200.271271
Error norm: 77.039183
Active features: 148711
Line search trials: 2
Line search step: 0.242190
Seconds required for this iteration: 1.155

***** Iteration #289 *****
Loss: 58351.960202
Feature norm: 200.272513
Error norm: 55.759024
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.580

***** Iteration #290 *****
Loss: 58351.307055
Feature norm: 200.280184
Error norm: 32.620310
Active features: 148711
Line search trials: 1
Line search step: 1.0

***** Iteration #328 *****
Loss: 58340.757904
Feature norm: 200.431479
Error norm: 23.331838
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.580

***** Iteration #329 *****
Loss: 58340.659076
Feature norm: 200.437791
Error norm: 50.449252
Active features: 148711
Line search trials: 2
Line search step: 0.463883
Seconds required for this iteration: 1.271

***** Iteration #330 *****
Loss: 58340.484697
Feature norm: 200.443030
Error norm: 27.820266
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.606

***** Iteration #331 *****
Loss: 58340.367822
Feature norm: 200.445757
Error norm: 18.820301
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.647

***** Iteration #332 *****
Loss: 58340.213870
Feature norm: 200.450783
Error norm: 25.484223
Active features: 148711
Line search trials: 1
Line search step: 1.0

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9482    0.9749    0.9614     70861
          VV     0.9409    0.9473    0.9441     47521
          PU     1.0000    0.9987    0.9993     45313
          CC     0.9334    0.9561    0.9446     18822
          PS     0.9407    0.9286    0.9346     13646
          AX     0.9246    0.9159    0.9202      7750
          AV     0.8922    0.7524    0.8164      7735
          FX     0.9957    0.9921    0.9939      8559
          NU     0.9705    0.9122    0.9404      7346
          AJ     0.8490    0.7881    0.8174      4979
          CL     0.8595    0.7871    0.8217      3940
          PR     0.8158    0.8092    0.8125      2238
          NG     1.0000    0.9978    0.9989      1795
          PA     0.9140    0.8361    0.8733       305
          XX     0.1667    0.0222    0.0392        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9508    240860
   macro avg     0.8219  

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.587

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954752.579745
Feature norm: 1.000000
Error norm: 133357.665747
Active features: 148550
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.240

***** Iteration #2 *****
Loss: 818821.655569
Feature norm: 6.335344
Error norm: 185085.178476
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.655

***** Iteration #3 *****
Loss: 707827.550625
Feature norm: 6.234385
Error norm: 100568.595094
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #42 *****
Loss: 121894.649296
Feature norm: 119.322758
Error norm: 8348.513153
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #43 *****
Loss: 119626.590990
Feature norm: 122.767899
Error norm: 3893.874693
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.590

***** Iteration #44 *****
Loss: 118600.647278
Feature norm: 122.704317
Error norm: 3034.546182
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.589

***** Iteration #45 *****
Loss: 116231.135389
Feature norm: 122.869339
Error norm: 3031.097684
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.595

***** Iteration #46 *****
Loss: 114327.148977
Feature norm: 123.692047
Error norm: 3192.508493
Active features: 148711
Line search trials: 1
Line search

***** Iteration #84 *****
Loss: 84120.343906
Feature norm: 164.503765
Error norm: 1519.908914
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.566

***** Iteration #85 *****
Loss: 84026.576968
Feature norm: 164.949711
Error norm: 4632.652752
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.578

***** Iteration #86 *****
Loss: 83524.923412
Feature norm: 165.048274
Error norm: 1222.798776
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.564

***** Iteration #87 *****
Loss: 83396.627227
Feature norm: 165.109953
Error norm: 1161.025130
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.583

***** Iteration #88 *****
Loss: 83147.712397
Feature norm: 165.259781
Error norm: 1270.723868
Active features: 148711
Line search trials: 1
Line search step

***** Iteration #123 *****
Loss: 76122.951375
Feature norm: 164.429974
Error norm: 1017.470953
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.657

***** Iteration #124 *****
Loss: 76017.745945
Feature norm: 164.301243
Error norm: 616.739919
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.717

***** Iteration #125 *****
Loss: 75901.409896
Feature norm: 164.169335
Error norm: 551.969805
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.600

***** Iteration #126 *****
Loss: 75813.452333
Feature norm: 164.003619
Error norm: 529.836948
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #127 *****
Loss: 75737.487739
Feature norm: 163.649797
Error norm: 1713.234444
Active features: 148711
Line search trials: 1
Line search st

***** Iteration #167 *****
Loss: 74480.394030
Feature norm: 160.332730
Error norm: 492.049448
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #168 *****
Loss: 74464.814434
Feature norm: 160.323162
Error norm: 206.079969
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.614

***** Iteration #169 *****
Loss: 74458.705523
Feature norm: 160.344717
Error norm: 145.654859
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.599

***** Iteration #170 *****
Loss: 74451.829823
Feature norm: 160.370601
Error norm: 173.408728
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.686

***** Iteration #171 *****
Loss: 74448.524978
Feature norm: 160.371269
Error norm: 401.016017
Active features: 148711
Line search trials: 2
Line search step

***** Iteration #211 *****
Loss: 74295.700617
Feature norm: 160.547222
Error norm: 56.546711
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.610

***** Iteration #212 *****
Loss: 74294.785506
Feature norm: 160.549304
Error norm: 123.347096
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.555

***** Iteration #213 *****
Loss: 74294.004188
Feature norm: 160.551899
Error norm: 80.696753
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.571

***** Iteration #214 *****
Loss: 74293.444778
Feature norm: 160.552685
Error norm: 49.478382
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.582

***** Iteration #215 *****
Loss: 74292.665153
Feature norm: 160.554426
Error norm: 49.366887
Active features: 148711
Line search trials: 1
Line search step: 1.

***** Iteration #251 *****
Loss: 74267.930930
Feature norm: 160.710951
Error norm: 51.871494
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.621

***** Iteration #252 *****
Loss: 74267.519630
Feature norm: 160.717239
Error norm: 42.463821
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.636

***** Iteration #253 *****
Loss: 74267.018710
Feature norm: 160.727380
Error norm: 49.607936
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.573

***** Iteration #254 *****
Loss: 74266.825497
Feature norm: 160.733554
Error norm: 85.029160
Active features: 148711
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #255 *****
Loss: 74266.611397
Feature norm: 160.735017
Error norm: 39.155150
Active features: 148711
Line search trials: 1
Line search step: 1.0

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9419    0.9747    0.9580     70861
          VV     0.9388    0.9424    0.9406     47521
          PU     1.0000    0.9983    0.9991     45313
          CC     0.9317    0.9552    0.9433     18822
          PS     0.9414    0.9292    0.9353     13646
          AX     0.9285    0.9135    0.9210      7750
          AV     0.8927    0.7509    0.8157      7735
          FX     0.9959    0.9887    0.9923      8559
          NU     0.9727    0.9074    0.9389      7346
          AJ     0.8540    0.7813    0.8160      4979
          CL     0.8592    0.7777    0.8164      3940
          PR     0.8227    0.7962    0.8093      2238
          NG     1.0000    0.9978    0.9989      1795
          PA     0.9170    0.7967    0.8526       305
          XX     0.0000    0.0000    0.0000        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9488    240860
   macro avg     0.8123  

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.856

L-BFGS optimization
c1: 0.500000
c2: 0.000000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954762.488445
Feature norm: 1.000000
Error norm: 133347.783352
Active features: 146013
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.190

***** Iteration #2 *****
Loss: 819765.103239
Feature norm: 6.331973
Error norm: 184734.191897
Active features: 144136
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.717

***** Iteration #3 *****
Loss: 709997.441733
Feature norm: 6.197161
Error norm: 102291.659705
Active features: 141011
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #38 *****
Loss: 53160.445674
Feature norm: 269.833048
Error norm: 307.306407
Active features: 56721
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.611

***** Iteration #39 *****
Loss: 52448.440773
Feature norm: 276.784437
Error norm: 769.388231
Active features: 53982
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.589

***** Iteration #40 *****
Loss: 52042.389313
Feature norm: 281.890786
Error norm: 1355.186317
Active features: 52723
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.587

***** Iteration #41 *****
Loss: 51550.463919
Feature norm: 285.354119
Error norm: 378.901880
Active features: 51619
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.540

***** Iteration #42 *****
Loss: 51127.252158
Feature norm: 289.225631
Error norm: 488.597249
Active features: 50088
Line search trials: 1
Line search step: 1.00000

***** Iteration #78 *****
Loss: 47191.973569
Feature norm: 335.161988
Error norm: 263.214481
Active features: 36238
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.662

***** Iteration #79 *****
Loss: 47162.735301
Feature norm: 335.671603
Error norm: 296.377574
Active features: 36110
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.664

***** Iteration #80 *****
Loss: 47140.510951
Feature norm: 336.165033
Error norm: 382.135938
Active features: 36026
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.707

***** Iteration #81 *****
Loss: 47118.203699
Feature norm: 336.652688
Error norm: 297.286921
Active features: 35904
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.756

***** Iteration #82 *****
Loss: 47099.042671
Feature norm: 337.274220
Error norm: 307.862324
Active features: 35769
Line search trials: 1
Line search step: 1.000000

***** Iteration #123 *****
Loss: 46567.727901
Feature norm: 349.589174
Error norm: 142.675143
Active features: 32499
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.665

***** Iteration #124 *****
Loss: 46560.290013
Feature norm: 349.721329
Error norm: 118.934539
Active features: 32435
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.602

***** Iteration #125 *****
Loss: 46552.433377
Feature norm: 349.816682
Error norm: 103.086214
Active features: 32384
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.600

***** Iteration #126 *****
Loss: 46544.972142
Feature norm: 349.938578
Error norm: 111.746926
Active features: 32313
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #127 *****
Loss: 46537.315384
Feature norm: 350.024692
Error norm: 93.283030
Active features: 32274
Line search trials: 1
Line search step: 1.00

***** Iteration #167 *****
Loss: 46384.864414
Feature norm: 352.961567
Error norm: 109.183427
Active features: 30339
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.652

***** Iteration #168 *****
Loss: 46383.112032
Feature norm: 353.032559
Error norm: 88.053588
Active features: 30325
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.698

***** Iteration #169 *****
Loss: 46381.889869
Feature norm: 353.066625
Error norm: 114.699679
Active features: 30277
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.694

***** Iteration #170 *****
Loss: 46379.986948
Feature norm: 353.124432
Error norm: 81.206338
Active features: 30252
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.659

***** Iteration #171 *****
Loss: 46378.794166
Feature norm: 353.160976
Error norm: 91.849835
Active features: 30251
Line search trials: 1
Line search step: 1.0000

***** Iteration #210 *****
Loss: 46343.019481
Feature norm: 354.189792
Error norm: 46.742415
Active features: 29794
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.569

***** Iteration #211 *****
Loss: 46342.575598
Feature norm: 354.203146
Error norm: 73.308225
Active features: 29785
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.566

***** Iteration #212 *****
Loss: 46341.984641
Feature norm: 354.233757
Error norm: 52.392073
Active features: 29765
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.572

***** Iteration #213 *****
Loss: 46341.570823
Feature norm: 354.245964
Error norm: 87.454284
Active features: 29765
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.569

***** Iteration #214 *****
Loss: 46340.668846
Feature norm: 354.270367
Error norm: 41.162867
Active features: 29766
Line search trials: 1
Line search step: 1.000000

***** Iteration #251 *****
Loss: 46325.164754
Feature norm: 354.795064
Error norm: 40.866405
Active features: 29405
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.832

***** Iteration #252 *****
Loss: 46324.902581
Feature norm: 354.815638
Error norm: 46.090541
Active features: 29399
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.688

***** Iteration #253 *****
Loss: 46324.609368
Feature norm: 354.825006
Error norm: 52.053358
Active features: 29370
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.585

***** Iteration #254 *****
Loss: 46324.299913
Feature norm: 354.846171
Error norm: 52.358481
Active features: 29364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.546

***** Iteration #255 *****
Loss: 46323.986783
Feature norm: 354.855392
Error norm: 33.771452
Active features: 29358
Line search trials: 1
Line search step: 1.000000

***** Iteration #292 *****
Loss: 46316.453199
Feature norm: 355.244877
Error norm: 30.266206
Active features: 29126
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.607

***** Iteration #293 *****
Loss: 46316.311668
Feature norm: 355.252243
Error norm: 38.439097
Active features: 29122
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.569

***** Iteration #294 *****
Loss: 46316.164793
Feature norm: 355.267665
Error norm: 33.055784
Active features: 29120
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #295 *****
Loss: 46316.025750
Feature norm: 355.275588
Error norm: 28.382702
Active features: 29122
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.609

***** Iteration #296 *****
Loss: 46315.923207
Feature norm: 355.289309
Error norm: 25.820274
Active features: 29120
Line search trials: 1
Line search step: 1.000000

***** Iteration #333 *****
Loss: 46310.750847
Feature norm: 355.786599
Error norm: 36.090907
Active features: 28925
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.611

***** Iteration #334 *****
Loss: 46310.594153
Feature norm: 355.801647
Error norm: 29.662199
Active features: 28920
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.600

***** Iteration #335 *****
Loss: 46310.516340
Feature norm: 355.812327
Error norm: 40.076747
Active features: 28908
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.608

***** Iteration #336 *****
Loss: 46310.330472
Feature norm: 355.827505
Error norm: 31.484387
Active features: 28910
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.615

***** Iteration #337 *****
Loss: 46310.251867
Feature norm: 355.837615
Error norm: 35.439977
Active features: 28905
Line search trials: 1
Line search step: 1.000000

***** Iteration #374 *****
Loss: 46306.796623
Feature norm: 356.287547
Error norm: 24.102491
Active features: 28778
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.566

***** Iteration #375 *****
Loss: 46306.724157
Feature norm: 356.295445
Error norm: 31.711275
Active features: 28775
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.537

***** Iteration #376 *****
Loss: 46306.613046
Feature norm: 356.312527
Error norm: 30.557719
Active features: 28771
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.520

***** Iteration #377 *****
Loss: 46306.534329
Feature norm: 356.320080
Error norm: 20.830697
Active features: 28762
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.516

***** Iteration #378 *****
Loss: 46306.470300
Feature norm: 356.338095
Error norm: 23.630754
Active features: 28758
Line search trials: 1
Line search step: 1.000000

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9527    0.9751    0.9638     70861
          VV     0.9446    0.9477    0.9462     47521
          PU     1.0000    0.9991    0.9995     45313
          CC     0.9346    0.9563    0.9453     18822
          PS     0.9354    0.9302    0.9328     13646
          AX     0.9191    0.9177    0.9184      7750
          AV     0.8904    0.7560    0.8177      7735
          FX     0.9959    0.9922    0.9940      8559
          NU     0.9641    0.9107    0.9366      7346
          AJ     0.8493    0.7901    0.8186      4979
          CL     0.8498    0.7997    0.8240      3940
          PR     0.8060    0.8374    0.8214      2238
          NG     0.9994    0.9989    0.9992      1795
          PA     0.8738    0.8852    0.8795       305
          XX     0.0909    0.0222    0.0357        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9519    240860
   macro avg     0.8129  

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.527

L-BFGS optimization
c1: 0.500000
c2: 0.500000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954762.988445
Feature norm: 1.000000
Error norm: 133346.788595
Active features: 146013
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.179

***** Iteration #2 *****
Loss: 819770.408179
Feature norm: 6.331717
Error norm: 184729.485593
Active features: 144136
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.592

***** Iteration #3 *****
Loss: 710008.441432
Feature norm: 6.196760
Error norm: 102282.888710
Active features: 141011
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #44 *****
Loss: 74010.327702
Feature norm: 173.076485
Error norm: 483.175247
Active features: 59789
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.586

***** Iteration #45 *****
Loss: 73794.416031
Feature norm: 173.654758
Error norm: 890.862667
Active features: 58382
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.573

***** Iteration #46 *****
Loss: 73635.242730
Feature norm: 174.081618
Error norm: 335.702360
Active features: 57524
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.570

***** Iteration #47 *****
Loss: 73449.122136
Feature norm: 174.668978
Error norm: 420.300597
Active features: 56515
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.568

***** Iteration #48 *****
Loss: 73355.196867
Feature norm: 175.006484
Error norm: 691.134426
Active features: 56055
Line search trials: 2
Line search step: 0.500000

***** Iteration #89 *****
Loss: 72318.103698
Feature norm: 178.976820
Error norm: 113.862289
Active features: 47697
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #90 *****
Loss: 72315.121963
Feature norm: 178.976993
Error norm: 52.145719
Active features: 47624
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.580

***** Iteration #91 *****
Loss: 72312.700850
Feature norm: 178.971487
Error norm: 91.093638
Active features: 47576
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.585

***** Iteration #92 *****
Loss: 72309.822858
Feature norm: 178.971548
Error norm: 70.103627
Active features: 47514
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.574

***** Iteration #93 *****
Loss: 72307.134162
Feature norm: 178.962810
Error norm: 85.079617
Active features: 47437
Line search trials: 1
Line search step: 1.000000
Sec

***** Iteration #130 *****
Loss: 72268.164691
Feature norm: 179.077612
Error norm: 71.809482
Active features: 46876
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.597

***** Iteration #131 *****
Loss: 72268.017132
Feature norm: 179.083197
Error norm: 112.883011
Active features: 46872
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.582

***** Iteration #132 *****
Loss: 72267.416492
Feature norm: 179.089572
Error norm: 69.537261
Active features: 46864
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.581

***** Iteration #133 *****
Loss: 72267.242249
Feature norm: 179.094994
Error norm: 107.985693
Active features: 46855
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.577

***** Iteration #134 *****
Loss: 72266.664526
Feature norm: 179.100443
Error norm: 66.737791
Active features: 46850
Line search trials: 1
Line search step: 1.0000

***** Iteration #171 *****
Loss: 72257.510211
Feature norm: 179.247558
Error norm: 91.110114
Active features: 46709
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.576

***** Iteration #172 *****
Loss: 72257.196062
Feature norm: 179.250612
Error norm: 49.972544
Active features: 46703
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.598

***** Iteration #173 *****
Loss: 72257.102054
Feature norm: 179.251618
Error norm: 80.377734
Active features: 46695
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.599

***** Iteration #174 *****
Loss: 72256.873376
Feature norm: 179.254024
Error norm: 49.275347
Active features: 46698
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.573

***** Iteration #175 *****
Loss: 72256.794594
Feature norm: 179.254766
Error norm: 77.476635
Active features: 46704
Line search trials: 1
Line search step: 1.000000

***** Iteration #213 *****
Loss: 72251.443446
Feature norm: 179.272835
Error norm: 58.219716
Active features: 46616
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.577

***** Iteration #214 *****
Loss: 72251.346239
Feature norm: 179.273072
Error norm: 59.082107
Active features: 46614
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.587

***** Iteration #215 *****
Loss: 72251.229264
Feature norm: 179.272058
Error norm: 63.917208
Active features: 46612
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.581

***** Iteration #216 *****
Loss: 72251.123974
Feature norm: 179.272399
Error norm: 56.416873
Active features: 46607
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.773

***** Iteration #217 *****
Loss: 72251.051336
Feature norm: 179.271254
Error norm: 74.107155
Active features: 46609
Line search trials: 1
Line search step: 1.000000

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9450    0.9756    0.9600     70861
          VV     0.9429    0.9443    0.9436     47521
          PU     1.0000    0.9987    0.9993     45313
          CC     0.9318    0.9569    0.9442     18822
          PS     0.9403    0.9322    0.9363     13646
          AX     0.9293    0.9142    0.9217      7750
          AV     0.8944    0.7579    0.8205      7735
          FX     0.9958    0.9888    0.9923      8559
          NU     0.9723    0.9068    0.9384      7346
          AJ     0.8528    0.7911    0.8208      4979
          CL     0.8574    0.7797    0.8167      3940
          PR     0.8212    0.8088    0.8149      2238
          NG     1.0000    0.9978    0.9989      1795
          PA     0.9194    0.8230    0.8685       305
          XX     0.0000    0.0000    0.0000        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9504    240860
   macro avg     0.8127  

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.316

L-BFGS optimization
c1: 0.500000
c2: 1.000000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954763.488445
Feature norm: 1.000000
Error norm: 133345.793838
Active features: 146013
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.139

***** Iteration #2 *****
Loss: 819775.711931
Feature norm: 6.331462
Error norm: 184724.779126
Active features: 144136
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.556

***** Iteration #3 *****
Loss: 710019.440424
Feature norm: 6.196359
Error norm: 102274.118984
Active features: 141011
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #42 *****
Loss: 86734.667303
Feature norm: 146.570977
Error norm: 393.405561
Active features: 64797
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.564

***** Iteration #43 *****
Loss: 86599.322264
Feature norm: 146.889801
Error norm: 630.293410
Active features: 64020
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.563

***** Iteration #44 *****
Loss: 86441.216415
Feature norm: 147.211002
Error norm: 640.885118
Active features: 63217
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.553

***** Iteration #45 *****
Loss: 86355.529963
Feature norm: 147.791090
Error norm: 2320.206761
Active features: 61163
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.564

***** Iteration #46 *****
Loss: 86237.264730
Feature norm: 147.877108
Error norm: 285.336185
Active features: 61219
Line search trials: 1
Line search step: 1.00000

***** Iteration #90 *****
Loss: 85504.863711
Feature norm: 149.277118
Error norm: 101.857360
Active features: 53986
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.571

***** Iteration #91 *****
Loss: 85502.945325
Feature norm: 149.284162
Error norm: 89.122711
Active features: 53950
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #92 *****
Loss: 85501.164077
Feature norm: 149.283914
Error norm: 123.426240
Active features: 53931
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.556

***** Iteration #93 *****
Loss: 85499.621347
Feature norm: 149.293584
Error norm: 112.641708
Active features: 53909
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.563

***** Iteration #94 *****
Loss: 85498.146739
Feature norm: 149.290292
Error norm: 136.247536
Active features: 53871
Line search trials: 1
Line search step: 1.000000


***** Iteration #137 *****
Loss: 85475.696364
Feature norm: 149.427868
Error norm: 85.336798
Active features: 53608
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.574

***** Iteration #138 *****
Loss: 85475.507774
Feature norm: 149.428074
Error norm: 130.256228
Active features: 53607
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.572

***** Iteration #139 *****
Loss: 85475.062568
Feature norm: 149.431663
Error norm: 77.930662
Active features: 53607
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.548

***** Iteration #140 *****
Loss: 85474.948700
Feature norm: 149.431945
Error norm: 136.349450
Active features: 53595
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #141 *****
Loss: 85474.441452
Feature norm: 149.435036
Error norm: 72.814850
Active features: 53593
Line search trials: 1
Line search step: 1.0000

***** Iteration #177 *****
Loss: 85465.881033
Feature norm: 149.453864
Error norm: 65.934629
Active features: 53514
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.568

***** Iteration #178 *****
Loss: 85465.865407
Feature norm: 149.453014
Error norm: 124.484456
Active features: 53510
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.665

***** Iteration #179 *****
Loss: 85465.505144
Feature norm: 149.453666
Error norm: 62.128833
Active features: 53506
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.567

***** Iteration #180 *****
Loss: 85465.316294
Feature norm: 149.453253
Error norm: 37.363284
Active features: 53507
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.133

***** Iteration #181 *****
Loss: 85465.223645
Feature norm: 149.452724
Error norm: 92.456636
Active features: 53499
Line search trials: 1
Line search step: 1.00000

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))




  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          NN     0.9385    0.9749    0.9564     70861
          VV     0.9398    0.9390    0.9394     47521
          PU     1.0000    0.9984    0.9992     45313
          CC     0.9312    0.9555    0.9432     18822
          PS     0.9403    0.9310    0.9356     13646
          AX     0.9295    0.9134    0.9214      7750
          AV     0.8923    0.7523    0.8164      7735
          FX     0.9959    0.9887    0.9923      8559
          NU     0.9746    0.8997    0.9357      7346
          AJ     0.8540    0.7825    0.8167      4979
          CL     0.8637    0.7718    0.8152      3940
          PR     0.8273    0.7962    0.8115      2238
          NG     1.0000    0.9978    0.9989      1795
          PA     0.9278    0.8000    0.8592       305
          XX     0.0000    0.0000    0.0000        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9481    240860
   macro avg     0.8134   

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.569

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954773.133321
Feature norm: 1.000000
Error norm: 133336.224424
Active features: 143404
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.112

***** Iteration #2 *****
Loss: 819847.666727
Feature norm: 6.331061
Error norm: 184719.924926
Active features: 141197
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.569

***** Iteration #3 *****
Loss: 710096.904853
Feature norm: 6.195549
Error norm: 102257.023798
Active features: 139370
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #38 *****
Loss: 66244.947289
Feature norm: 225.316235
Error norm: 1355.875715
Active features: 37589
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.616

***** Iteration #39 *****
Loss: 66042.766669
Feature norm: 227.647447
Error norm: 2848.349377
Active features: 36337
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.843

***** Iteration #40 *****
Loss: 65552.170129
Feature norm: 227.732850
Error norm: 908.360250
Active features: 36700
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.703

***** Iteration #41 *****
Loss: 65308.640074
Feature norm: 228.559744
Error norm: 319.678954
Active features: 36286
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.575

***** Iteration #42 *****
Loss: 64743.832884
Feature norm: 232.702204
Error norm: 1040.467229
Active features: 34395
Line search trials: 1
Line search step: 1.000

***** Iteration #83 *****
Loss: 61142.381939
Feature norm: 260.626770
Error norm: 289.091375
Active features: 22220
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.549

***** Iteration #84 *****
Loss: 61127.724363
Feature norm: 260.973018
Error norm: 280.611787
Active features: 22167
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.549

***** Iteration #85 *****
Loss: 61111.115700
Feature norm: 261.313347
Error norm: 290.822239
Active features: 22102
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.554

***** Iteration #86 *****
Loss: 61092.804194
Feature norm: 261.650705
Error norm: 296.510813
Active features: 22040
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.549

***** Iteration #87 *****
Loss: 61078.640567
Feature norm: 261.958565
Error norm: 327.767746
Active features: 21976
Line search trials: 1
Line search step: 1.000000

***** Iteration #136 *****
Loss: 60623.818151
Feature norm: 269.803943
Error norm: 60.195664
Active features: 19801
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.575

***** Iteration #137 *****
Loss: 60618.850976
Feature norm: 269.927160
Error norm: 235.165803
Active features: 19705
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #138 *****
Loss: 60613.463104
Feature norm: 269.984417
Error norm: 141.134387
Active features: 19706
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.582

***** Iteration #139 *****
Loss: 60609.525714
Feature norm: 270.058730
Error norm: 86.060969
Active features: 19667
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #140 *****
Loss: 60605.853809
Feature norm: 270.110418
Error norm: 83.774390
Active features: 19610
Line search trials: 1
Line search step: 1.0000

***** Iteration #177 *****
Loss: 60534.323780
Feature norm: 272.020811
Error norm: 104.927286
Active features: 19161
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.545

***** Iteration #178 *****
Loss: 60533.427941
Feature norm: 272.033439
Error norm: 73.186419
Active features: 19151
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #179 *****
Loss: 60532.542669
Feature norm: 272.058153
Error norm: 71.316371
Active features: 19143
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.550

***** Iteration #180 *****
Loss: 60531.900579
Feature norm: 272.070444
Error norm: 94.188433
Active features: 19128
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #181 *****
Loss: 60530.983165
Feature norm: 272.098118
Error norm: 82.537534
Active features: 19124
Line search trials: 1
Line search step: 1.00000

***** Iteration #220 *****
Loss: 60509.520206
Feature norm: 272.763577
Error norm: 92.431877
Active features: 18880
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.555

***** Iteration #221 *****
Loss: 60508.866206
Feature norm: 272.781103
Error norm: 54.267150
Active features: 18880
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.572

***** Iteration #222 *****
Loss: 60508.735256
Feature norm: 272.798799
Error norm: 92.783562
Active features: 18882
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.551

***** Iteration #223 *****
Loss: 60508.024751
Feature norm: 272.817040
Error norm: 46.594851
Active features: 18872
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.565

***** Iteration #224 *****
Loss: 60507.938440
Feature norm: 272.835305
Error norm: 95.522101
Active features: 18861
Line search trials: 1
Line search step: 1.000000

***** Iteration #266 *****
Loss: 60496.040307
Feature norm: 273.457084
Error norm: 50.208646
Active features: 18720
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.578

***** Iteration #267 *****
Loss: 60495.795028
Feature norm: 273.473594
Error norm: 43.911755
Active features: 18714
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.556

***** Iteration #268 *****
Loss: 60495.608862
Feature norm: 273.483441
Error norm: 44.121289
Active features: 18712
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.544

***** Iteration #269 *****
Loss: 60495.412140
Feature norm: 273.496870
Error norm: 45.986120
Active features: 18705
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.580

***** Iteration #270 *****
Loss: 60495.227824
Feature norm: 273.507727
Error norm: 40.860370
Active features: 18709
Line search trials: 1
Line search step: 1.000000

***** Iteration #308 *****
Loss: 60490.253894
Feature norm: 273.886145
Error norm: 50.797942
Active features: 18568
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.560

***** Iteration #309 *****
Loss: 60490.061877
Feature norm: 273.894097
Error norm: 26.933347
Active features: 18563
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #310 *****
Loss: 60490.057084
Feature norm: 273.904710
Error norm: 54.446766
Active features: 18566
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.545

***** Iteration #311 *****
Loss: 60489.842964
Feature norm: 273.912941
Error norm: 24.608003
Active features: 18567
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.546

***** Iteration #312 *****
Loss: 60489.749739
Feature norm: 273.917942
Error norm: 16.646382
Active features: 18565
Line search trials: 2
Line search step: 0.500000

***** Iteration #350 *****
Loss: 60486.729711
Feature norm: 274.201664
Error norm: 39.894947
Active features: 18468
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.569

***** Iteration #351 *****
Loss: 60486.618313
Feature norm: 274.210134
Error norm: 22.539123
Active features: 18466
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.572

***** Iteration #352 *****
Loss: 60486.605617
Feature norm: 274.216142
Error norm: 39.806960
Active features: 18463
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #353 *****
Loss: 60486.495574
Feature norm: 274.224216
Error norm: 21.532635
Active features: 18459
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.566

***** Iteration #354 *****
Loss: 60486.481780
Feature norm: 274.231152
Error norm: 39.506824
Active features: 18458
Line search trials: 1
Line search step: 1.000000

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9493    0.9769    0.9629     70861
          VV     0.9483    0.9451    0.9467     47521
          PU     1.0000    0.9991    0.9995     45313
          CC     0.9304    0.9583    0.9442     18822
          PS     0.9366    0.9329    0.9348     13646
          AX     0.9303    0.9160    0.9231      7750
          AV     0.8951    0.7620    0.8232      7735
          FX     0.9955    0.9891    0.9923      8559
          NU     0.9688    0.9055    0.9361      7346
          AJ     0.8564    0.8082    0.8316      4979
          CL     0.8594    0.7957    0.8263      3940
          PR     0.8114    0.8307    0.8209      2238
          NG     0.9994    0.9989    0.9992      1795
          PA     0.8953    0.8689    0.8819       305
          XX     0.0909    0.0222    0.0357        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9523    240860
   macro avg     0.8167  

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.788

L-BFGS optimization
c1: 1.000000
c2: 0.500000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954773.633321
Feature norm: 1.000000
Error norm: 133335.229667
Active features: 143404
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.089

***** Iteration #2 *****
Loss: 819852.965159
Feature norm: 6.330806
Error norm: 184715.218606
Active features: 141197
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.551

***** Iteration #3 *****
Loss: 710107.897273
Feature norm: 6.195149
Error norm: 102248.255890
Active features: 139370
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #40 *****
Loss: 84008.570387
Feature norm: 158.397687
Error norm: 730.313332
Active features: 42045
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.785

***** Iteration #41 *****
Loss: 83546.167666
Feature norm: 159.789286
Error norm: 861.700083
Active features: 40607
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.997

***** Iteration #42 *****
Loss: 83279.594165
Feature norm: 160.431509
Error norm: 686.898570
Active features: 39757
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.953

***** Iteration #43 *****
Loss: 83005.127034
Feature norm: 161.419892
Error norm: 1057.052694
Active features: 38669
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.669

***** Iteration #44 *****
Loss: 82809.264840
Feature norm: 162.101433
Error norm: 512.423974
Active features: 37998
Line search trials: 1
Line search step: 1.00000

***** Iteration #81 *****
Loss: 81282.469835
Feature norm: 168.416476
Error norm: 139.412126
Active features: 28678
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.656

***** Iteration #82 *****
Loss: 81276.663136
Feature norm: 168.443156
Error norm: 191.626499
Active features: 28623
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.753

***** Iteration #83 *****
Loss: 81269.367754
Feature norm: 168.455662
Error norm: 191.519309
Active features: 28538
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.621

***** Iteration #84 *****
Loss: 81263.336916
Feature norm: 168.482485
Error norm: 238.687780
Active features: 28500
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.622

***** Iteration #85 *****
Loss: 81256.321243
Feature norm: 168.486756
Error norm: 161.125038
Active features: 28484
Line search trials: 1
Line search step: 1.000000

***** Iteration #121 *****
Loss: 81168.911868
Feature norm: 168.483641
Error norm: 102.597813
Active features: 27701
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.583

***** Iteration #122 *****
Loss: 81168.352172
Feature norm: 168.489800
Error norm: 125.040900
Active features: 27709
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.552

***** Iteration #123 *****
Loss: 81167.574176
Feature norm: 168.486380
Error norm: 114.874362
Active features: 27706
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.570

***** Iteration #124 *****
Loss: 81166.936520
Feature norm: 168.491905
Error norm: 98.791039
Active features: 27705
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.616

***** Iteration #125 *****
Loss: 81166.296535
Feature norm: 168.489704
Error norm: 101.969242
Active features: 27702
Line search trials: 1
Line search step: 1.00

***** Iteration #162 *****
Loss: 81151.428534
Feature norm: 168.565651
Error norm: 67.668902
Active features: 27585
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.602

***** Iteration #163 *****
Loss: 81151.379025
Feature norm: 168.566972
Error norm: 129.274206
Active features: 27577
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.584

***** Iteration #164 *****
Loss: 81150.885775
Feature norm: 168.569809
Error norm: 58.497690
Active features: 27577
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.579

***** Iteration #165 *****
Loss: 81150.838216
Feature norm: 168.571317
Error norm: 122.779559
Active features: 27580
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.572

***** Iteration #166 *****
Loss: 81150.403009
Feature norm: 168.573465
Error norm: 57.647262
Active features: 27579
Line search trials: 1
Line search step: 1.0000

***** Iteration #202 *****
Loss: 81143.716631
Feature norm: 168.635437
Error norm: 46.941959
Active features: 27533
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.577

***** Iteration #203 *****
Loss: 81143.692210
Feature norm: 168.635510
Error norm: 97.605305
Active features: 27529
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.571

***** Iteration #204 *****
Loss: 81143.429228
Feature norm: 168.637395
Error norm: 41.232040
Active features: 27528
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.575

***** Iteration #205 *****
Loss: 81143.314039
Feature norm: 168.637415
Error norm: 34.504072
Active features: 27529
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.392

***** Iteration #206 *****
Loss: 81143.240871
Feature norm: 168.637784
Error norm: 81.065647
Active features: 27520
Line search trials: 1
Line search step: 1.000000

***** Iteration #243 *****
Loss: 81138.986355
Feature norm: 168.652492
Error norm: 82.160768
Active features: 27468
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.108

***** Iteration #244 *****
Loss: 81138.802287
Feature norm: 168.653161
Error norm: 29.039919
Active features: 27467
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.101

***** Iteration #245 *****
Loss: 81138.756459
Feature norm: 168.652889
Error norm: 59.857280
Active features: 27463
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.099

***** Iteration #246 *****
Loss: 81138.650852
Feature norm: 168.653211
Error norm: 27.427762
Active features: 27468
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.115

***** Iteration #247 *****
Loss: 81138.605340
Feature norm: 168.653114
Error norm: 59.180795
Active features: 27461
Line search trials: 2
Line search step: 0.500000

HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9409    0.9757    0.9580     70861
          VV     0.9432    0.9396    0.9414     47521
          PU     1.0000    0.9985    0.9992     45313
          CC     0.9300    0.9563    0.9430     18822
          PS     0.9404    0.9327    0.9365     13646
          AX     0.9294    0.9137    0.9215      7750
          AV     0.8946    0.7572    0.8202      7735
          FX     0.9958    0.9891    0.9924      8559
          NU     0.9735    0.8995    0.9351      7346
          AJ     0.8529    0.7933    0.8221      4979
          CL     0.8605    0.7749    0.8154      3940
          PR     0.8184    0.8056    0.8120      2238
          NG     1.0000    0.9983    0.9992      1795
          PA     0.9291    0.8164    0.8691       305
          XX     0.0000    0.0000    0.0000        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9492    240860
   macro avg     0.8130  

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 148711
Seconds required: 5.617

L-BFGS optimization
c1: 1.000000
c2: 1.000000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 954774.133321
Feature norm: 1.000000
Error norm: 133334.234910
Active features: 143404
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 1.161

***** Iteration #2 *****
Loss: 819858.262405
Feature norm: 6.330550
Error norm: 184710.512123
Active features: 141197
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.593

***** Iteration #3 *****
Loss: 710118.990523
Feature norm: 6.194721
Error norm: 102239.364494
Active features: 139370
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #40 *****
Loss: 94736.672708
Feature norm: 138.680679
Error norm: 578.717319
Active features: 44298
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.569

***** Iteration #41 *****
Loss: 94665.453902
Feature norm: 138.668363
Error norm: 1375.519201
Active features: 43449
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.136

***** Iteration #42 *****
Loss: 94522.474212
Feature norm: 138.944982
Error norm: 884.729649
Active features: 42442
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.581

***** Iteration #43 *****
Loss: 94380.771066
Feature norm: 139.285075
Error norm: 462.976735
Active features: 41219
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.574

***** Iteration #44 *****
Loss: 94179.750270
Feature norm: 139.891417
Error norm: 874.162228
Active features: 39472
Line search trials: 1
Line search step: 1.00000

***** Iteration #85 *****
Loss: 93139.602233
Feature norm: 143.308045
Error norm: 301.181108
Active features: 32460
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.567

***** Iteration #86 *****
Loss: 93135.478821
Feature norm: 143.288108
Error norm: 149.826461
Active features: 32432
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.561

***** Iteration #87 *****
Loss: 93132.572381
Feature norm: 143.279743
Error norm: 157.913591
Active features: 32426
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.579

***** Iteration #88 *****
Loss: 93129.599865
Feature norm: 143.264744
Error norm: 136.408999
Active features: 32412
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.576

***** Iteration #89 *****
Loss: 93126.619252
Feature norm: 143.248607
Error norm: 225.215448
Active features: 32370
Line search trials: 1
Line search step: 1.000000

***** Iteration #131 *****
Loss: 93082.175677
Feature norm: 143.107795
Error norm: 98.815281
Active features: 31977
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.556

***** Iteration #132 *****
Loss: 93081.876055
Feature norm: 143.104162
Error norm: 131.932654
Active features: 31975
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.559

***** Iteration #133 *****
Loss: 93081.340305
Feature norm: 143.104069
Error norm: 92.321866
Active features: 31965
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.557

***** Iteration #134 *****
Loss: 93081.054577
Feature norm: 143.101820
Error norm: 128.573584
Active features: 31958
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.571

***** Iteration #135 *****
Loss: 93080.534514
Feature norm: 143.101462
Error norm: 86.109003
Active features: 31962
Line search trials: 1
Line search step: 1.0000

***** Iteration #171 *****
Loss: 93070.824249
Feature norm: 143.089774
Error norm: 55.997995
Active features: 31898
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.561

***** Iteration #172 *****
Loss: 93070.786308
Feature norm: 143.090103
Error norm: 91.142347
Active features: 31898
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.642

***** Iteration #173 *****
Loss: 93070.525400
Feature norm: 143.091613
Error norm: 46.649073
Active features: 31899
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.660

***** Iteration #174 *****
Loss: 93070.484982
Feature norm: 143.092258
Error norm: 88.211726
Active features: 31893
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.603

***** Iteration #175 *****
Loss: 93070.260850
Feature norm: 143.093526
Error norm: 47.402633
Active features: 31886
Line search trials: 1
Line search step: 1.000000

***** Iteration #211 *****
Loss: 93065.517209
Feature norm: 143.104994
Error norm: 68.959601
Active features: 31816
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.571

L-BFGS terminated with the stopping criteria
Total seconds required for training: 126.692

Storing the model
Number of active features: 31816 (148711)
Number of active attributes: 21457 (588372)
Number of active labels: 16 (16)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.186



HBox(children=(FloatProgress(value=0.0, max=6094.0), HTML(value='')))


              precision    recall  f1-score   support

          NN     0.9345    0.9747    0.9542     70861
          VV     0.9394    0.9343    0.9368     47521
          PU     1.0000    0.9983    0.9991     45313
          CC     0.9279    0.9559    0.9417     18822
          PS     0.9404    0.9288    0.9346     13646
          AX     0.9300    0.9119    0.9208      7750
          AV     0.8934    0.7545    0.8181      7735
          FX     0.9959    0.9891    0.9925      8559
          NU     0.9744    0.8904    0.9305      7346
          AJ     0.8548    0.7817    0.8166      4979
          CL     0.8628    0.7662    0.8117      3940
          PR     0.8350    0.7958    0.8149      2238
          NG     1.0000    0.9978    0.9989      1795
          PA     0.9346    0.7967    0.8602       305
          XX     0.0000    0.0000    0.0000        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9466    240860
   macro avg     0.8139  

In [130]:
# Rank every evaluated (c1, c2) configuration, best first, and keep the top row
# as a plain dict for the cells below. The ranking column comes from the config
# cell (args.metric_for_best_model, 'f1_macro' by default) instead of being
# hard-coded, so changing the configured metric actually changes the selection.
hyperparams_df = (
    pd.DataFrame(hyperparams)
    .sort_values(args.metric_for_best_model, ascending=False)
    .reset_index(drop=True)
)
# After reset_index, row 0 is the best-scoring configuration.
best_hyperparams = hyperparams_df.iloc[0].to_dict()
hyperparams_df

Unnamed: 0,c1,c2,f1_micro,f1_macro,report
0,1.0,0.0,0.952271,0.803645,precision recall f1-score ...
1,0.5,0.0,0.951856,0.80205,precision recall f1-score ...
2,0.0,0.5,0.950801,0.801114,precision recall f1-score ...
3,0.5,0.5,0.950444,0.798502,precision recall f1-score ...
4,1.0,0.5,0.949195,0.797815,precision recall f1-score ...
5,0.5,1.0,0.94809,0.796299,precision recall f1-score ...
6,0.0,1.0,0.948829,0.796092,precision recall f1-score ...
7,1.0,1.0,0.946645,0.795668,precision recall f1-score ...
8,0.0,0.0,0.934692,0.790179,precision recall f1-score ...


In [131]:
# Show the classification report stored with the top-ranked hyperparameter row.
# print() is used so the report string's line breaks render as a table.
print(best_hyperparams['report'])

              precision    recall  f1-score   support

          NN     0.9493    0.9769    0.9629     70861
          VV     0.9483    0.9451    0.9467     47521
          PU     1.0000    0.9991    0.9995     45313
          CC     0.9304    0.9583    0.9442     18822
          PS     0.9366    0.9329    0.9348     13646
          AX     0.9303    0.9160    0.9231      7750
          AV     0.8951    0.7620    0.8232      7735
          FX     0.9955    0.9891    0.9923      8559
          NU     0.9688    0.9055    0.9361      7346
          AJ     0.8564    0.8082    0.8316      4979
          CL     0.8594    0.7957    0.8263      3940
          PR     0.8114    0.8307    0.8209      2238
          NG     0.9994    0.9989    0.9992      1795
          PA     0.8953    0.8689    0.8819       305
          XX     0.0909    0.0222    0.0357        45
          IJ     0.0000    0.0000    0.0000         5

    accuracy                         0.9523    240860
   macro avg     0.8167   

In [20]:
#final model
# Train once more through train_crf, using the c1/c2 values of the best-ranked
# configuration and a model name tagged with the dataset and label column.
c1 = best_hyperparams['c1']
c2 = best_hyperparams['c2']
train_crf(
    f'{args.dataset_name_or_path}_{args.label_col}_best',
    c1,
    c2,
    x_train,
    y_train,
)

In [14]:
#debug
# NOTE(review): c1/c2 are fixed here instead of being read from best_hyperparams;
# confirm a model file with these exact values exists before relying on this cell.
c1, c2 = 0.5, 0.0

# Evaluate the saved "best" model on the test split and print its report.
report, f1_micro, f1_macro = evaluate_crf(
    f'{args.dataset_name_or_path}_{args.label_col}_best_{c1}_{c2}.model',
    x_test,
    y_test,
    # test set of lst20 does not have E_TTL, so for lst20 NER the final entry
    # of tag_labels (presumably E_TTL) is left out of the evaluated labels
    tag_labels[:-1]
    if (args.dataset_name_or_path, args.label_col) == ('lst20', 'ner_tags')
    else tag_labels,
)
print(report)

HBox(children=(FloatProgress(value=0.0, max=621.0), HTML(value='')))


              precision    recall  f1-score   support

         ADJ     0.8787    0.7263    0.7953       369
         ADP     0.9487    0.9322    0.9404      1032
         ADV     0.8333    0.6818    0.7500       374
         AUX     0.9645    0.9685    0.9665       953
       CCONJ     0.9760    0.9613    0.9686       465
         DET     0.9302    0.8542    0.8906       343
        NOUN     0.9261    0.9711    0.9481      8410
         NUM     0.9753    0.9180    0.9458       732
        PART     0.9923    0.9556    0.9736       270
        PRON     0.9452    0.9139    0.9293       151
       PROPN     0.9085    0.8121    0.8576       660
       PUNCT     0.9974    0.9987    0.9981      3893
       SCONJ     0.9664    0.9485    0.9574      1243
        VERB     0.9280    0.9042    0.9159      3205

    accuracy                         0.9450     22100
   macro avg     0.9408    0.8962    0.9169     22100
weighted avg     0.9447    0.9450    0.9442     22100

