In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..', '..', '..', '..')))
from nlp.bert.bert_model import BertModel
import spacy

lang = 'el'
# =======================================================
# model_dirname examples:
# bert_base_greek_uncased
# bert_base_multilingual_uncased
# =======================================================
model_dirname = 'bert_base_greek_uncased'
# =======================================================
# data_dirname examples:
# (nl) demorgen2000
# (el) spacyner2018
# =======================================================
data_dirname = 'spacyner2018'


pretrained_model_path = \
    os.path.abspath(os.path.join('..', '..', '..', 'bert', 'pretrained', model_dirname))
pretrained_model_path = \
    os.path.abspath(os.path.join('..', '..', 'resources', lang, 'models', 'bert_base_greek_uncased_ft'))
data_path = \
    os.path.abspath(os.path.join('..', '..', 'resources', lang, 'data', data_dirname, 'train.txt'))
output_path = \
    os.path.abspath(os.path.join('..', '..', 'training', 'bert', 'output'))
spacy_path = \
    os.path.abspath(os.path.join('..', '..', 'resources', lang, 'models', 'spacy_default'))
print('pretrained model path: ', pretrained_model_path)
print('data path: ', data_path)
print('output path: ', output_path)
print('spacy path: ', spacy_path)

pretrained model path:  /home/bo/workspace/online-segmentation/nlp/ner/resources/el/models/bert_base_greek_uncased_ft
data path:  /home/bo/workspace/online-segmentation/nlp/ner/resources/el/data/spacyner2018/train.txt
output path:  /home/bo/workspace/online-segmentation/nlp/ner/training/bert/output
spacy path:  /home/bo/workspace/online-segmentation/nlp/ner/resources/el/models/spacy_default


In [2]:
# to use uncased tokenizer
tokenizer_args = {
    'do_lower_case': True,
    'strip_accents': True,
    'keep_accents': False
}
bert_model = BertModel(pretrained_model_path, bert_type='bert', 
                       tokenizer_args=tokenizer_args, num_epochs=10, 
                       learning_rate=1e-6,
                       batch_size=32)

model_path:  /home/bo/workspace/online-segmentation/nlp/ner/resources/el/models/bert_base_greek_uncased_ft
tokenizer_args:  {'do_lower_case': True, 'strip_accents': True, 'keep_accents': False, 'use_fast': False}
entity_labels:  ['B-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'O', 'I-LOC', 'I-PER', 'B-MISC', 'I-MISC']
bert_type:  bert
num_epochs:  10
batch_size:  32
max_seq_length:  128
learning_rate:  1e-06
adam_epsilon:  1e-08
device:  cuda


In [None]:
_global_step, _tr_loss = bert_model.train(data_path, output_path)
print(f'training finished, global_step={_global_step}, tr_loss={_tr_loss}')

In [6]:
# verify training
nlp = spacy.load(spacy_path)
# load the newly trained model
new_model_path = os.path.abspath(os.path.join('output', 'bert-checkpoint-510'))
# new_model_path = '/home/bo/workspace/online-segmentation/ner/evaluation/nl/models/bert_base_dutch_cased'
new_model = BertModel(new_model_path, bert_type='bert')

model_path:  /home/bo/workspace/online-segmentation/nlp/ner/training/bert/output/bert-checkpoint-510
tokenizer_args:  {'do_lower_case': False, 'strip_accents': True, 'keep_accents': False, 'use_fast': False}
entity_labels:  ['B-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'O', 'I-LOC', 'I-PER', 'B-MISC', 'I-MISC']
bert_type:  bert
num_epochs:  1
batch_size:  32
max_seq_length:  128
device:  cuda


In [7]:
# from parliamentary data
s = 'Hij bezocht Amsterdam.'
s = 'Mark Rutte is een Nederlands politicus.'
s = 'Μπορεί να έχασαν όλες τις εκπλήξεις και τις συγκινήσεις που μας χάρισε το Παγκόσμιο Κύπελλο στη Ρωσία μέχρι τώρα , αλλά οι δύο τελικοί δεν χάνονται με τίποτα! '
s = 'Η Αθήνα, Ελλάδα είναι ένα όμορφο μέρος.'
s = 'Σε δήλωσή του , ο κ. Μίχαλος εξέφρασε την οδύνη του ΕΒΕΑ για τις ανθρώπινες ζωές που χάθηκαν , τονίζοντας πως η τραγωδία επισκιάζει τα πάντα.'
words = [t.text for t in nlp(s.lower())]
print('words')
print(words)
preds = new_model.ner(words)
print('predictions:')
print(preds)

words
['σε', 'δήλωσή', 'του', ',', 'ο', 'κ.', 'μίχαλος', 'εξέφρασε', 'την', 'οδύνη', 'του', 'εβεα', 'για', 'τις', 'ανθρώπινες', 'ζωές', 'που', 'χάθηκαν', ',', 'τονίζοντας', 'πως', 'η', 'τραγωδία', 'επισκιάζει', 'τα', 'πάντα', '.']
predictions:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [1]:
from torch.utils.tensorboard import SummaryWriter

In [2]:
import numpy as np

writer = SummaryWriter()

for n_iter in range(100):
    writer.add_scalar('Loss/train', np.random.random(), n_iter)
    writer.add_scalar('Loss/test', np.random.random(), n_iter)
    writer.add_scalar('Accuracy/train', np.random.random(), n_iter)
    writer.add_scalar('Accuracy/test', np.random.random(), n_iter)