In [1]:
import matplotlib
import copy
import logging
#import torch

#from tqdm.autonotebook import tqdm

from data.data_loader import Dataset
from data.germeval2017 import germeval2017_dataset

from misc.preferences import PREFERENCES
#from misc.visualizer import *
from misc.run_configuration import get_default_params, randomize_params
from misc import utils

from optimizer import get_default_optimizer
from criterion import NllLoss, LossCombiner

from models.transformer.encoder import TransformerEncoder
from models.softmax_output import SoftmaxOutputLayerWithCommentWiseClass
from models.transformer_tagger import TransformerTagger
from models.jointAspectTagger import JointAspectTagger
from models.transformer.train import Trainer
import pprint

In [2]:
PREFERENCES.defaults(
    data_root='./data/germeval2017',
    data_train='train_v1.4.tsv',    
    data_validation='dev_v1.4.tsv',
    data_test='test_TIMESTAMP1.tsv',
    early_stopping='highest_5_F1'
)
def load(hp, logger):
    dataset = Dataset(
        'germeval',
        logger,
        hp,
        source_index=0,
        target_vocab_index=2,
        data_path=PREFERENCES.data_root,
        train_file=PREFERENCES.data_train,
        valid_file=PREFERENCES.data_validation,
        test_file=PREFERENCES.data_test,
        file_format='.tsv',
        init_token=None,
        eos_token=None
    )
    dataset.load_data(germeval2017_dataset, verbose=False)
    return dataset

In [3]:
def load_model(dataset, hp, experiment_name):
    loss = LossCombiner(4, dataset.class_weights, NllLoss)
    transformer = TransformerEncoder(dataset.source_embedding,
                                     hyperparameters=hp)
    model = JointAspectTagger(transformer, hp, 4, 20, dataset.target_names)
    optimizer = get_default_optimizer(model, hp)
    trainer = Trainer(
                        model,
                        loss,
                        optimizer,
                        hp,
                        dataset,
                        experiment_name,
                        enable_tensorboard=False,
                        verbose=False)
    return trainer

In [4]:
experiment_name = 'ConvHyperParameterSearch'
use_cuda = True

In [5]:
# get general logger just for search
experiment_name = utils.create_loggers(experiment_name=experiment_name)
logger = logging.getLogger(__name__)
logger.info('Run hyper parameter random grid search for experiment with name ' + experiment_name)

Log path is  /data/home/felix/ABSA-Transformer/logs/ConvHyperParameterSearch/20190224/3


In [6]:
utils.get_current_git_commit()
logger.info('Current commit: ' + utils.get_current_git_commit())

In [7]:
num_optim_iterations = 5
logger.info('num_optim_iterations: ' + str(num_optim_iterations))

In [8]:
random_grid_search_ranges = {
    'batch_size': (10, 20),
    'num_encoder_blocks': (2, 6),
    'pointwise_layer_size': (32, 4000),
    'clip_comments_to': (80, 120),
    'learning_rate': (0, 1e-2),
    #'learning_rate_factor': (1e-3, 4),
    #'learning_rate_warmup': (1000, 10000),
    #'optim_adam_beta1': (0.5, 0.99),
    #'optim_adam_beta2': (0.5, 0.99),
    'dropout_rate': (0, 0.8),
    'output_conv_num_filters': (5, 300),
    'output_conv_kernel_size': (1, 6),
    'transformer_config': {
        'transformer_heads': [1, 2, 3, 4, 5, 6, 10, 12, 15, 20]
    }
}
logger.info(pprint.pformat(random_grid_search_ranges, indent=2))

In [9]:
default_hp = get_default_params(use_cuda)
default_hp.num_epochs = 25
default_hp.seed = None
default_hp.output_layer_type = 'sum'

logger.info(default_hp)
print(default_hp)

+------------------------------------+
|          Hyperparameters           |
+-------------------------+----------+
|        Parameter        |  Value   |
+-------------------------+----------+
|        batch_size       |    12    |
|        model_size       |   300    |
|    learning_rate_type   |   noam   |
|      learning_rate      |    0     |
|   learning_rate_warmup  |   4800   |
|   learning_rate_factor  |    2     |
|     optim_adam_beta1    |   0.9    |
|     optim_adam_beta2    |   0.98   |
|      early_stopping     |    5     |
|         use_cuda        |   True   |
|       n_enc_blocks      |    3     |
|         n_heads         |    6     |
|           d_k           |    50    |
|           d_v           |    50    |
|       dropout_rate      |   0.1    |
|   pointwise_layer_size  |   2048   |
|    output_layer_type    |   sum    |
| output_conv_num_filters |   300    |
| output_conv_kernel_size |    5     |
|    output_conv_stride   |    1     |
|   output_conv_padding  

In [10]:
dataset_logger = logging.getLogger('data_loader')
best_f1 = 0.0
best_model = None
best_hp = None
best_iteration = -1
for optim_iteration in range(num_optim_iterations):
        
    print(f'\n\n+-------------------------------------------------+\nOptim Iteration: {optim_iteration}\n\n')
    logger.info(f'\n\n=================================\nOptim Iteration: {optim_iteration}\n=================================')
    
    # generate iteration hyper parameters
    hp = randomize_params(default_hp, random_grid_search_ranges)
        
    logger.info('New Params:')
    logger.info(hp)
    print(hp)
    
    logger.debug('Load dataset')
    dataset = load(hp, dataset_logger)
    logger.debug('dataset loaded')
    logger.debug('Load model')
    trainer = load_model(dataset, hp, experiment_name)
    logger.debug('model loaded')
    
    logger.debug('Begin training')
    model = None
    try:
        result = trainer.train(use_cuda=hp.use_cuda, perform_evaluation=False)
        model = result['model']
    except Exception as err:
        logger.exception("Could not complete iteration " + str(optim_iteration))
        print(f'Could not complete iteration {optim_iteration} because of {str(err)}')
        continue
        
    # perform evaluation and log results
    result = None
    try:
        result = trainer.perform_final_evaluation(use_test_set=False, verbose=False)
    except Exception as err:
        logger.exception("Could not complete iteration evaluation for it " + str(optim_iteration))
        print(f'Could not complete iteration {optim_iteration} evaluation because of {str(err)}')
        continue
    
    it_f1 = result[1][1]
    if best_f1 < it_f1:
        best_f1 = it_f1
        best_model = model
        best_hp = copy.copy(hp)
        best_iteration = optim_iteration
        print('+-------------------------------------------------+')
        print(f'Best Valid Result: {best_f1}')
        print('+-------------------------------------------------+')
    else:
        print(f'\nValid Result: {best_f1}\n')    
    print('\n\n###################################################\n')
    
print('Best iteration: ' + str(best_iteration))
print('Best f1: ' + str(best_f1))
print('Best HP:')
print(best_hp)



+-------------------------------------------------+
Optim Iteration: 0


+-------------------------------------------------+
|                 Hyperparameters                 |
+-------------------------+-----------------------+
|        Parameter        |         Value         |
+-------------------------+-----------------------+
|        batch_size       |           20          |
|        model_size       |          300          |
|    learning_rate_type   |          noam         |
|      learning_rate      | 0.0008692870511049277 |
|   learning_rate_warmup  |          4800         |
|   learning_rate_factor  |           2           |
|     optim_adam_beta1    |          0.9          |
|     optim_adam_beta2    |          0.98         |
|      early_stopping     |           5           |
|         use_cuda        |          True         |
|       n_enc_blocks      |           5           |
|         n_heads         |           10          |
|           d_k           |           30 

                                           

pre_training - DEBUG - 20 initialized
pre_training - DEBUG - Initilize parameters with nn.init.xavier_uniform_
pre_training - DEBUG - Tagger initialized


Epoch 1: 100%|██████████| 853/853 [02:42<00:00,  5.83it/s]

# EP	# IT	tr loss		val loss	f1		acc		duration / total time
1	853	186.32		[32m170.61		[32m0.204[0m		0.757		2.71m - 2.7m / 0.0m


Epoch 2: 100%|██████████| 853/853 [05:18<00:00,  5.83it/s]

2	1706	124.75		[32m87.47		[32m0.235[0m		0.862		2.59m - 5.3m / 67.8m


Epoch 4:   0%|          | 0/853 [07:55<02:26,  5.83it/s]  

3	2559	101.37		[32m87.54		[37m0.223[0m		0.829		2.61m - 7.9m / 64.8m


Epoch 4: 100%|██████████| 853/853 [10:31<00:00,  5.83it/s]

4	3412	74.52		[32m49.62		[32m0.238[0m		0.889		2.61m - 10.5m / 65.4m


Epoch 6:   0%|          | 0/853 [13:07<02:26,  5.83it/s]  

5	4265	47.54		[32m43.01		[37m0.218[0m		0.828		2.59m - 13.1m / 65.3m


Epoch 7:   0%|          | 0/853 [15:42<02:26,  5.83it/s]  

6	5118	28.63		[32m23.03		[37m0.193[0m		0.701		2.58m - 15.7m / 64.9m


Epoch 8:   0%|          | 0/853 [18:16<02:26,  5.83it/s]  

7	5971	18.39		[32m14.15		[37m0.232[0m		0.869		2.57m - 18.3m / 64.8m


Epoch 8: 100%|██████████| 853/853 [20:50<00:00,  5.83it/s]

8	6824	13.91		[32m12.09		[32m0.240[0m		0.903		2.57m - 20.8m / 64.5m


Epoch 10:   0%|          | 0/853 [23:24<02:26,  5.83it/s] 

9	7677	12.57		[32m11.09		[37m0.238[0m		0.922		2.57m - 23.4m / 64.5m


Epoch 11:   0%|          | 0/853 [25:58<02:26,  5.83it/s]  

10	8530	12.10		[32m11.34		[37m0.236[0m		0.909		2.57m - 26.0m / 64.5m


Epoch 11: 100%|██████████| 853/853 [28:32<00:00,  5.83it/s]

11	9383	12.08		[32m10.70		[32m0.246[0m		0.957		2.57m - 28.5m / 64.5m


Epoch 13:   0%|          | 0/853 [31:06<02:26,  5.83it/s]  

12	10236	11.95		[32m10.98		[37m0.239[0m		0.923		2.56m - 31.1m / 64.5m


Epoch 14:   0%|          | 0/853 [33:40<02:26,  5.83it/s]  

13	11089	11.82		[32m10.82		[37m0.244[0m		0.953		2.56m - 33.7m / 64.4m


Epoch 15:   0%|          | 0/853 [36:14<02:26,  5.83it/s]  

14	11942	11.75		[32m10.64		[37m0.244[0m		0.959		2.56m - 36.2m / 64.4m


Epoch 16:   0%|          | 0/853 [38:48<02:26,  5.83it/s]  

15	12795	11.70		[32m10.87		[37m0.244[0m		0.949		2.56m - 38.8m / 64.4m


Epoch 16: 100%|██████████| 853/853 [41:22<00:00,  5.83it/s]


16	13648	11.65		[32m10.94		[37m0.233[0m		0.909		2.56m - 41.4m / 64.4m
+-------------------------------------------------+
Best Valid Result: 0.23287411927246504
+-------------------------------------------------+


###################################################



+-------------------------------------------------+
Optim Iteration: 1


+------------------------------------------------+
|                Hyperparameters                 |
+-------------------------+----------------------+
|        Parameter        |        Value         |
+-------------------------+----------------------+
|        batch_size       |          14          |
|        model_size       |         300          |
|    learning_rate_type   |         noam         |
|      learning_rate      | 0.00792115236517247  |
|   learning_rate_warmup  |         4800         |
|   learning_rate_factor  |          2           |
|     optim_adam_beta1    |         0.9          |
|     optim_adam_beta2    |         0.98  

                                           

pre_training - DEBUG - 20 initialized
pre_training - DEBUG - Initilize parameters with nn.init.xavier_uniform_
pre_training - DEBUG - Tagger initialized


Epoch 1: 100%|██████████| 1218/1218 [01:50<00:00, 11.88it/s]

# EP	# IT	tr loss		val loss	f1		acc		duration / total time
1	1218	172.48		[32m104.79		[32m0.224[0m		0.755		1.84m - 1.8m / 0.0m


Epoch 2: 100%|██████████| 1218/1218 [03:41<00:00, 11.88it/s]

2	2436	109.09		[32m73.97		[32m0.242[0m		0.845		1.84m - 3.7m / 46.1m


Epoch 4:   0%|          | 1/1218 [05:32<01:42, 11.88it/s]   

3	3654	83.55		[32m85.20		[37m0.242[0m		0.888		1.85m - 5.5m / 46.0m


Epoch 4: 100%|██████████| 1218/1218 [07:22<00:00, 11.88it/s]

4	4872	57.32		[32m37.30		[32m0.258[0m		0.902		1.83m - 7.4m / 46.1m


Epoch 5: 100%|██████████| 1218/1218 [09:12<00:00, 11.88it/s]

5	6090	23.14		[32m12.17		[32m0.264[0m		0.888		1.84m - 9.2m / 45.9m


Epoch 6: 100%|██████████| 1218/1218 [11:02<00:00, 11.88it/s]

6	7308	8.57		[32m8.99		[32m0.280[0m		0.930		1.83m - 11.0m / 46.0m


Epoch 8:   0%|          | 1/1218 [12:53<01:42, 11.88it/s]   

7	8526	7.52		[37m9.35		[37m0.280[0m		0.919		1.84m - 12.9m / 45.9m


Epoch 8: 100%|██████████| 1218/1218 [14:43<00:00, 11.88it/s]

8	9744	6.82		[37m9.00		[32m0.288[0m		0.936		1.84m - 14.7m / 45.9m


Epoch 9: 100%|██████████| 1218/1218 [16:35<00:00, 11.88it/s]

9	10962	6.23		[37m9.66		[32m0.294[0m		0.929		1.85m - 16.6m / 46.1m


Epoch 10: 100%|██████████| 1218/1218 [18:25<00:00, 11.88it/s]

10	12180	5.88		[37m9.55		[32m0.301[0m		0.935		1.84m - 18.4m / 46.2m


Epoch 12:   0%|          | 1/1218 [20:15<01:42, 11.88it/s]   

11	13398	5.55		[37m10.08		[37m0.290[0m		0.922		1.83m - 20.3m / 46.0m


Epoch 13:   0%|          | 1/1218 [22:06<01:42, 11.88it/s]   

12	14616	5.30		[37m11.44		[37m0.293[0m		0.907		1.83m - 22.1m / 45.9m


Epoch 14:   0%|          | 1/1218 [23:55<01:42, 11.88it/s]   

13	15834	4.99		[37m10.87		[37m0.298[0m		0.918		1.83m - 23.9m / 45.9m


Epoch 14: 100%|██████████| 1218/1218 [25:45<00:00, 11.88it/s]

14	17052	4.76		[37m11.96		[32m0.301[0m		0.937		1.83m - 25.8m / 45.9m


Epoch 16:   0%|          | 1/1218 [27:35<01:42, 11.88it/s]   

15	18270	4.66		[37m10.92		[37m0.300[0m		0.930		1.83m - 27.6m / 45.9m


Epoch 17:   0%|          | 1/1218 [29:25<01:42, 11.88it/s]   

16	19488	4.41		[37m11.83		[37m0.297[0m		0.928		1.83m - 29.4m / 45.9m


Epoch 17: 100%|██████████| 1218/1218 [31:15<00:00, 11.88it/s]

17	20706	4.26		[37m11.98		[32m0.302[0m		0.925		1.84m - 31.3m / 45.9m


Epoch 19:   0%|          | 1/1218 [33:06<01:42, 11.88it/s]   

18	21924	4.08		[37m12.53		[37m0.295[0m		0.919		1.83m - 33.1m / 46.0m


Epoch 20:   0%|          | 1/1218 [34:55<01:42, 11.88it/s]   

19	23142	3.96		[37m12.37		[37m0.293[0m		0.909		1.83m - 34.9m / 45.9m


Epoch 21:   0%|          | 1/1218 [36:45<01:42, 11.88it/s]   

20	24360	3.76		[37m12.99		[37m0.289[0m		0.914		1.83m - 36.8m / 45.9m


Epoch 22:   0%|          | 1/1218 [38:37<01:42, 11.88it/s]   

21	25578	3.77		[37m12.42		[37m0.295[0m		0.910		1.86m - 38.6m / 45.9m


Epoch 22: 100%|██████████| 1218/1218 [40:27<00:00, 11.88it/s]


22	26796	3.61		[37m12.86		[37m0.291[0m		0.923		1.84m - 40.5m / 46.0m
+-------------------------------------------------+
Best Valid Result: 0.29419235245030106
+-------------------------------------------------+


###################################################



+-------------------------------------------------+
Optim Iteration: 2


+-----------------------------------------------+
|                Hyperparameters                |
+-------------------------+---------------------+
|        Parameter        |        Value        |
+-------------------------+---------------------+
|        batch_size       |          20         |
|        model_size       |         300         |
|    learning_rate_type   |         noam        |
|      learning_rate      | 0.00283351164028252 |
|   learning_rate_warmup  |         4800        |
|   learning_rate_factor  |          2          |
|     optim_adam_beta1    |         0.9         |
|     optim_adam_beta2    |         0.98        |
|    

                                           

pre_training - DEBUG - 20 initialized
pre_training - DEBUG - Initilize parameters with nn.init.xavier_uniform_
pre_training - DEBUG - Tagger initialized


Epoch 1: 100%|██████████| 853/853 [02:58<00:00,  5.04it/s]

# EP	# IT	tr loss		val loss	f1		acc		duration / total time
1	853	204.62		[32m137.11		[32m0.219[0m		0.785		2.98m - 3.0m / 0.0m


Epoch 2: 100%|██████████| 853/853 [05:57<00:00,  5.04it/s]

2	1706	149.78		[32m125.58		[32m0.236[0m		0.828		2.97m - 6.0m / 74.5m


Epoch 3: 100%|██████████| 853/853 [08:55<00:00,  5.04it/s]

3	2559	126.56		[32m113.63		[32m0.237[0m		0.847		2.97m - 8.9m / 74.2m


Epoch 4: 100%|██████████| 853/853 [11:54<00:00,  5.04it/s]

4	3412	99.47		[32m69.70		[32m0.243[0m		0.859		2.97m - 11.9m / 74.3m


Epoch 6:   0%|          | 0/853 [14:52<02:49,  5.04it/s]  

5	4265	62.92		[32m51.58		[37m0.242[0m		0.842		2.97m - 14.9m / 74.2m


Epoch 6: 100%|██████████| 853/853 [17:50<00:00,  5.04it/s]

6	5118	33.01		[32m21.97		[32m0.244[0m		0.846		2.97m - 17.8m / 74.3m


Epoch 7: 100%|██████████| 853/853 [20:49<00:00,  5.04it/s]

7	5971	17.35		[32m13.12		[32m0.268[0m		0.924		2.97m - 20.8m / 74.3m


Epoch 9:   0%|          | 0/853 [23:47<02:49,  5.04it/s]  

8	6824	10.68		[32m13.23		[37m0.256[0m		0.837		2.97m - 23.8m / 74.3m


Epoch 10:   0%|          | 0/853 [26:45<02:49,  5.04it/s] 

9	7677	8.86		[37m13.87		[37m0.265[0m		0.844		2.97m - 26.8m / 74.2m


Epoch 11:   0%|          | 0/853 [29:44<02:49,  5.04it/s]  

10	8530	8.18		[37m12.27		[37m0.264[0m		0.880		2.97m - 29.7m / 74.3m


Epoch 11: 100%|██████████| 853/853 [32:42<00:00,  5.04it/s]

11	9383	7.66		[37m12.32		[32m0.271[0m		0.911		2.97m - 32.7m / 74.3m


Epoch 13:   0%|          | 0/853 [35:41<02:49,  5.04it/s]  

12	10236	7.29		[37m14.53		[37m0.268[0m		0.861		2.97m - 35.7m / 74.3m


Epoch 13: 100%|██████████| 853/853 [38:39<00:00,  5.04it/s]

13	11089	7.00		[37m12.80		[32m0.272[0m		0.903		2.98m - 38.7m / 74.3m


Epoch 14: 100%|██████████| 853/853 [41:38<00:00,  5.04it/s]

14	11942	6.58		[37m13.03		[32m0.273[0m		0.881		2.97m - 41.6m / 74.4m


Epoch 16:   0%|          | 0/853 [44:37<02:49,  5.04it/s]  

15	12795	6.26		[37m14.52		[37m0.273[0m		0.862		2.98m - 44.6m / 74.4m


Epoch 16: 100%|██████████| 853/853 [47:36<00:00,  5.04it/s]

16	13648	5.94		[37m13.33		[32m0.279[0m		0.875		2.98m - 47.6m / 74.4m


Epoch 17: 100%|██████████| 853/853 [50:35<00:00,  5.04it/s]

17	14501	5.78		[37m13.96		[32m0.280[0m		0.883		2.98m - 50.6m / 74.4m


Epoch 18: 100%|██████████| 853/853 [53:33<00:00,  5.04it/s]

18	15354	5.53		[37m14.38		[32m0.285[0m		0.891		2.98m - 53.6m / 74.4m


Epoch 20:   0%|          | 0/853 [56:32<02:49,  5.04it/s]  

19	16207	5.45		[37m15.04		[37m0.280[0m		0.890		2.98m - 56.5m / 74.4m


Epoch 21:   0%|          | 0/853 [59:31<02:49,  5.04it/s]  

20	17060	5.15		[37m15.48		[37m0.281[0m		0.888		2.98m - 59.5m / 74.4m


Epoch 22:   0%|          | 0/853 [1:02:30<02:49,  5.04it/s]  

21	17913	5.18		[37m13.33		[37m0.275[0m		0.854		2.98m - 62.5m / 74.4m


Epoch 23:   0%|          | 0/853 [1:05:29<02:49,  5.04it/s]  

22	18766	5.05		[37m15.82		[37m0.282[0m		0.882		2.98m - 65.5m / 74.4m


Epoch 23: 100%|██████████| 853/853 [1:08:27<00:00,  5.04it/s]


23	19619	4.83		[37m15.41		[37m0.270[0m		0.858		2.98m - 68.5m / 74.4m

Valid Result: 0.29419235245030106



###################################################



+-------------------------------------------------+
Optim Iteration: 3


+-------------------------------------------------+
|                 Hyperparameters                 |
+-------------------------+-----------------------+
|        Parameter        |         Value         |
+-------------------------+-----------------------+
|        batch_size       |           20          |
|        model_size       |          300          |
|    learning_rate_type   |          noam         |
|      learning_rate      | 0.0017900384991516572 |
|   learning_rate_warmup  |          4800         |
|   learning_rate_factor  |           2           |
|     optim_adam_beta1    |          0.9          |
|     optim_adam_beta2    |          0.98         |
|      early_stopping     |           5           |
|         use_cuda        |       

                                           

pre_training - DEBUG - 20 initialized
pre_training - DEBUG - Initilize parameters with nn.init.xavier_uniform_
pre_training - DEBUG - Tagger initialized


Epoch 1: 100%|██████████| 853/853 [03:45<00:00,  4.04it/s]

# EP	# IT	tr loss		val loss	f1		acc		duration / total time
1	853	264.30		[32m151.88		[32m0.235[0m		0.886		3.76m - 3.8m / 0.0m


Epoch 3:   0%|          | 0/853 [07:31<03:30,  4.04it/s]  

2	1706	167.69		[32m139.21		[37m0.233[0m		0.847		3.75m - 7.5m / 94.0m


Epoch 4:   0%|          | 0/853 [11:15<03:30,  4.04it/s]  

3	2559	136.46		[32m131.60		[37m0.225[0m		0.823		3.75m - 11.3m / 93.8m


Epoch 5:   0%|          | 0/853 [15:00<03:30,  4.04it/s]  

4	3412	104.41		[32m69.69		[37m0.235[0m		0.868		3.75m - 15.0m / 93.7m


Epoch 6:   0%|          | 0/853 [18:45<03:30,  4.04it/s]  

5	4265	68.23		[32m75.81		[37m0.215[0m		0.820		3.75m - 18.8m / 93.7m


Epoch 6: 100%|██████████| 853/853 [22:30<00:00,  4.04it/s]


6	5118	38.87		[32m28.66		[37m0.203[0m		0.720		3.75m - 22.5m / 93.7m

Valid Result: 0.29419235245030106



###################################################



+-------------------------------------------------+
Optim Iteration: 4


+------------------------------------------------+
|                Hyperparameters                 |
+-------------------------+----------------------+
|        Parameter        |        Value         |
+-------------------------+----------------------+
|        batch_size       |          11          |
|        model_size       |         300          |
|    learning_rate_type   |         noam         |
|      learning_rate      | 0.000738475015191512 |
|   learning_rate_warmup  |         4800         |
|   learning_rate_factor  |          2           |
|     optim_adam_beta1    |         0.9          |
|     optim_adam_beta2    |         0.98         |
|      early_stopping     |          5           |
|         use_cuda        |         True         

                                           

pre_training - DEBUG - 20 initialized
pre_training - DEBUG - Initilize parameters with nn.init.xavier_uniform_
pre_training - DEBUG - Tagger initialized


Epoch 1: 100%|██████████| 1550/1550 [01:54<00:00, 14.42it/s]

# EP	# IT	tr loss		val loss	f1		acc		duration / total time
1	1550	148.76		[32m111.95		[32m0.235[0m		0.818		1.91m - 1.9m / 0.0m


Epoch 2: 100%|██████████| 1550/1550 [03:48<00:00, 34.29s/it]

2	3100	79.86		[32m56.16		[32m0.242[0m		0.845		1.90m - 3.8m / 47.7m


Epoch 3: 100%|██████████| 1550/1550 [05:42<00:00, 34.29s/it]  

3	4650	40.31		[32m22.05		[32m0.259[0m		0.865		1.90m - 5.7m / 47.4m


Epoch 4: 100%|██████████| 1550/1550 [07:36<00:00, 34.29s/it]  

4	6200	14.27		[32m10.07		[32m0.269[0m		0.946		1.90m - 7.6m / 47.5m


Epoch 5: 100%|██████████| 1550/1550 [09:31<00:00, 34.29s/it]  

5	7750	6.96		[32m8.35		[32m0.300[0m		0.945		1.90m - 9.5m / 47.5m


Epoch 7:   0%|          | 2/1550 [11:25<14:44:34, 34.29s/it]  

6	9300	5.64		[37m8.86		[37m0.296[0m		0.936		1.90m - 11.4m / 47.5m


Epoch 7: 100%|██████████| 1550/1550 [13:19<00:00, 34.29s/it]  

7	10850	4.93		[37m8.32		[32m0.306[0m		0.932		1.90m - 13.3m / 47.5m


Epoch 9:   0%|          | 2/1550 [15:13<14:44:34, 34.29s/it]  

8	12400	4.35		[37m8.95		[37m0.305[0m		0.929		1.90m - 15.2m / 47.6m


Epoch 9: 100%|██████████| 1550/1550 [17:07<00:00, 34.29s/it]  

9	13950	3.99		[37m9.71		[32m0.313[0m		0.930		1.90m - 17.1m / 47.5m


Epoch 10: 100%|██████████| 1550/1550 [19:01<00:00, 34.29s/it]  

10	15500	3.58		[37m10.80		[32m0.314[0m		0.930		1.90m - 19.0m / 47.5m


Epoch 11: 100%|██████████| 1550/1550 [20:55<00:00, 34.29s/it]  

11	17050	3.26		[37m10.41		[32m0.324[0m		0.939		1.90m - 20.9m / 47.5m


Epoch 13:   0%|          | 1/1550 [22:50<14:45:08, 34.29s/it]  

12	18600	2.96		[37m11.74		[37m0.314[0m		0.929		1.90m - 22.8m / 47.5m


Epoch 14:   0%|          | 1/1550 [24:44<14:45:08, 34.29s/it]  

13	20150	2.76		[37m12.74		[37m0.319[0m		0.924		1.90m - 24.7m / 47.6m


Epoch 15:   0%|          | 2/1550 [26:38<14:44:34, 34.29s/it]  

14	21700	2.55		[37m13.18		[37m0.303[0m		0.913		1.90m - 26.6m / 47.5m


Epoch 16:   0%|          | 2/1550 [28:32<14:44:34, 34.29s/it]  

15	23250	2.39		[37m12.60		[37m0.321[0m		0.931		1.90m - 28.5m / 47.6m


Epoch 16: 100%|██████████| 1550/1550 [30:26<00:00, 34.29s/it]  


16	24800	2.25		[37m13.68		[37m0.312[0m		0.894		1.90m - 30.4m / 47.5m
+-------------------------------------------------+
Best Valid Result: 0.3137767038764019
+-------------------------------------------------+


###################################################

Best iteration: 4
Best f1: 0.3137767038764019
Best HP:
+------------------------------------------------+
|                Hyperparameters                 |
+-------------------------+----------------------+
|        Parameter        |        Value         |
+-------------------------+----------------------+
|        batch_size       |          11          |
|        model_size       |         300          |
|    learning_rate_type   |         noam         |
|      learning_rate      | 0.000738475015191512 |
|   learning_rate_warmup  |         4800         |
|   learning_rate_factor  |          2           |
|     optim_adam_beta1    |         0.9          |
|     optim_adam_beta2    |         0.98         |
|      early_