In [14]:
import numpy as np

from data.conll import conll2003_dataset, extract_samples
from misc.preferences import PREFERENCES
from misc.visualizer import *
from misc.hyperparameters import get_default_params
from optimizer import get_default_optimizer
from misc import utils
from models.transformer.encoder import TransformerEncoder
from models.softmax_output import SoftmaxOutputLayer, OutputLayer
from models.transformer_tagger import TransformerTagger
from models.transformer.train import Trainer
from criterion import NllLoss

In [15]:
# Label for this run; it is passed to utils.create_loggers and to the Trainer further down.
experiment_name = "Normal training"

In [16]:
# Hand PREFERENCES.defaults the dataset location (root directory plus the
# train / validation / test file names) and an early-stopping setting.
PREFERENCES.defaults(
    data_root='./data/conll2003',
    data_train='eng.train.txt',
    data_validation='eng.testa.txt',
    data_test='eng.testb.txt',
    early_stopping='highest_5_F1'
)

# Start from the project's default hyper-parameters and override three fields for this run.
hyper_parameters = get_default_params()
hyper_parameters.model_size = 300
hyper_parameters.batch_size = 80
# NOTE(review): PREFERENCES above also receives an `early_stopping` value
# ('highest_5_F1'); how it relates to this -1 is not visible here -- confirm.
hyper_parameters.early_stopping = -1
# `experiment_name` is rebound to whatever create_loggers returns, so re-running
# this cell feeds the previous return value back in.
# NOTE(review): the recorded training output shows every log line three times,
# which would be consistent with handlers being registered on each run of this
# cell -- confirm in utils.create_loggers.
experiment_name = utils.create_loggers(experiment_name=experiment_name)

Log path is  /Users/felix/Documents/Repositories/TUM/ABSA-Transformer/logs/Normal training


In [17]:
# Load the dataset bundle for the 'ner' task with use_cuda=False. File locations
# come from PREFERENCES and the batch size from the hyper-parameters. Later cells
# index the result by 'examples', 'vocabs', 'embeddings', 'iters' and 'dummy_input'.
conll2003 = conll2003_dataset(
    'ner',
    hyper_parameters.batch_size,
    root=PREFERENCES.data_root,
    train_file=PREFERENCES.data_train,
    validation_file=PREFERENCES.data_validation,
    test_file=PREFERENCES.data_test,
    use_cuda=False,
)



In [18]:
# Extract samples from the loaded examples and print them; the recorded output
# shows one "token - tag" pair per line.
samples = extract_samples(conll2003['examples'])
# print_samples is not imported by name in this notebook; presumably it arrives
# through `from misc.visualizer import *` -- confirm.
print_samples(samples)

-docstart- - O

#######################

eu - I-ORG
rejects - O
german - I-MISC
call - O
to - O
boycott - O
british - I-MISC
lamb - O
. - O

#######################

peter - I-PER
blackburn - I-PER

#######################

-docstart- - O

#######################

cricket - O
- - O
leicestershire - I-ORG
take - O
over - O
at - O
top - O
after - O
innings - O
victory - O
. - O

#######################

london - I-LOC
1996-08-30 - O

#######################

-docstart- - O

#######################

soccer - O
- - O
japan - I-LOC
get - O
lucky - O
win - O
, - O
china - I-PER
in - O
surprise - O
defeat - O
. - O

#######################

nadim - I-PER
ladki - I-PER

#######################



In [19]:
# Take the first entry of the dataset's vocab list and record its length.
# target_size is later passed to NllLoss and to SoftmaxOutputLayer, so this is
# presumably the tag (label) vocabulary -- confirm against conll2003_dataset.
target_vocab = conll2003['vocabs'][0]
target_size = len(target_vocab)

In [20]:
# Loss object, built with the size of the target vocabulary.
loss = NllLoss(target_size)

# Encoder over the first embedding entry of the dataset bundle.
# NOTE(review): with the current values n_head * d_k == model_size (3 * 100 == 300);
# whether the encoder requires that is not visible here -- re-check d_k / d_v
# whenever hyper_parameters.model_size is changed.
transformer = TransformerEncoder(
    conll2003['embeddings'][0],
    n_enc_blocks=2,
    n_head=3,
    d_model=hyper_parameters.model_size,
    d_k=100,
    d_v=100,
)

# Output layer mapping model_size features to target_size outputs.
tagging_softmax = SoftmaxOutputLayer(hyper_parameters.model_size, target_size)

# The tagger combines the encoder and the output layer.
model = TransformerTagger(transformer, tagging_softmax)


In [21]:
# predict now to see model in initial state
#test_sample_iter = iterate_with_sample_data(conll2003['iters'][1], 200)
#df = predict_some_examples_to_df(model, test_sample_iter, num_samples=50)
#print(df)

In [22]:
# Default optimizer for the model, configured from the hyper-parameters.
optimizer = get_default_optimizer(model, hyper_parameters)

# Wire model, loss, optimizer, data iterators and the experiment name into the
# trainer; it logs every 50th iteration and has tensorboard enabled.
trainer = Trainer(
    model,
    loss,
    optimizer,
    hyper_parameters,
    conll2003['iters'],
    experiment_name,
    log_every_xth_iteration=50,
    enable_tensorboard=True,
    dummy_input=conll2003['dummy_input'],
)

pre_training - INFO - TransformerTagger (
  (encoder): TransformerEncoder(
    (src_embeddings): Embedding(26059, 300)
    (positional_encoding): PositionalEncoding2(
      (dropout): Dropout(p=0.1)
    )
    (encoder_blocks): ModuleList(
      (0): EncoderBlock(
        (self_attention_layer): MultiHeadedSelfAttentionLayer(
          (query_projections): Linear(in_features=300, out_features=300, bias=False)
          (key_projections): Linear(in_features=300, out_features=300, bias=False)
          (value_projections): Linear(in_features=300, out_features=300, bias=False)
          (attention_layer): ScaledDotProductAttentionLayer(
            (dropout): Dropout(p=0.1)
          )
          (layer_norm): LayerNorm()
          (w_0): Linear(in_features=300, out_features=300, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (feed_forward_layer): PointWiseFCLayer(
          (layer_norm): LayerNorm()
          (w_1): Linear(in_features=300, out_features=2048, bias=True)
 

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1              [-1, 42, 300]       7,817,700
           Dropout-2              [-1, 42, 300]               0
PositionalEncoding2-3              [-1, 42, 300]               0
            Linear-4              [-1, 42, 300]          90,000
            Linear-5              [-1, 42, 300]          90,000
            Linear-6              [-1, 42, 300]          90,000
           Dropout-7               [-1, 42, 42]               0
ScaledDotProductAttentionLayer-8              [-1, 42, 100]               0
            Linear-9              [-1, 42, 300]          90,000
          Dropout-10              [-1, 42, 300]               0
        LayerNorm-11              [-1, 42, 300]               0
MultiHeadedSelfAttentionLayer-12              [-1, 42, 300]               0
           Linear-13             [-1, 42, 2048]         616,448
           Lin

In [23]:
# Start training. The argument 10 is presumably the number of epochs (the
# progress bars in the recorded output are labelled "Epoch 0", "Epoch 1") -- confirm.
# NOTE(review): the recorded run ended in a KeyboardInterrupt during epoch 1, so
# `result` was never bound in that session; the cells that read it are unexecuted.
result = trainer.train(10)

pre_training - DEBUG - train without cuda support
pre_training - DEBUG - train without cuda support
pre_training - DEBUG - train without cuda support
pre_training - INFO - 188 Iterations per epoch with batch size of 80
pre_training - INFO - 188 Iterations per epoch with batch size of 80
pre_training - INFO - 188 Iterations per epoch with batch size of 80
pre_training - INFO - START training.
pre_training - INFO - START training.
pre_training - INFO - START training.





HBox(children=(IntProgress(value=0, description='Epoch 0', max=188, style=ProgressStyle(description_width='ini…

# EP	# IT	tr loss		val loss	f1
# EP	# IT	tr loss		val loss	f1
# EP	# IT	tr loss		val loss	f1
0	50	9.800		9.599		0.043
0	100	8.213		8.654		0.043
0	150	7.022		7.285		0.615
0	50	9.800		9.599		0.043
0	100	8.213		8.654		0.043
0	150	7.022		7.285		0.615
0	50	9.800		9.599		0.043
0	100	8.213		8.654		0.043
0	150	7.022		7.285		0.615
0	188	7.805		6.066		0.622
0	188	7.805		6.066		0.622
0	188	7.805		6.066		0.622


HBox(children=(IntProgress(value=0, description='Epoch 1', max=188, style=ProgressStyle(description_width='ini…

KeyboardInterrupt: 

In [16]:
# Scratch check of NumPy axis semantics: two 2x2 matrices stacked into one
# array of shape (2, 2, 2).
# Built directly from nested lists; wrapping each matrix in its own np.array
# first adds nothing. NumPy is imported in the import cell at the top of the
# notebook rather than here, so all dependencies are visible in one place.
a = np.array([
    [[1, 1], [1, 1]],
    [[2, 2], [-2, -3]],
])
print(a)

[[[ 1  1]
  [ 1  1]]

 [[ 2  2]
  [-2 -3]]]


In [18]:
# Reduce over the first axis, i.e. add the two 2x2 matrices element-wise.
np.sum(a, axis=0)

array([[ 3,  3],
       [-1, -2]])

In [None]:
#df = predict_some_examples_to_df(model, test_sample_iter)
#print(df)

In [None]:
#print(result)

In [None]:
# Unpack the two-element 'result_train' entry of the dict returned by
# trainer.train into the names tr_loss and tr_f1.
tr_loss, tr_f1 = result['result_train']
