
Merge branch 'master' into GH-2120-imporved-tensorboard-logging
alanakbik committed Mar 23, 2021
2 parents 38eb8e6 + 8b93b40 commit d53e013
Showing 6 changed files with 159 additions and 30 deletions.
24 changes: 16 additions & 8 deletions flair/datasets/document_classification.py
@@ -745,21 +745,25 @@ def __init__(self,
        base_path: Path = Path(base_path)

        # this dataset name
-       dataset_name = self.__class__.__name__.lower() + '_v2'
-
-       if rebalance_corpus:
-           dataset_name = dataset_name + '-rebalanced'
+       dataset_name = self.__class__.__name__.lower() + '_v4'

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        imdb_acl_path = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

+       if rebalance_corpus:
+           dataset_name = dataset_name + '-rebalanced'
+           data_folder = base_path / dataset_name
        data_path = Path(flair.cache_root) / "datasets" / dataset_name
-       data_file = data_path / "train.txt"
-       if not data_file.is_file():
+       train_data_file = data_path / "train.txt"
+       test_data_file = data_path / "test.txt"
+
+       if not train_data_file.is_file() or (not rebalance_corpus and not test_data_file.is_file()):
+           for file_path in [train_data_file, test_data_file]:
+               if file_path.is_file():
+                   os.remove(file_path)
+
            cached_path(imdb_acl_path, Path("datasets") / dataset_name)
            import tarfile

@@ -783,7 +787,11 @@ def __init__(self,
if f"{dataset}/{label}" in m.name
],
)
with open(f"{data_path}/train-all.txt", "at") as f_p:
data_file = train_data_file
if rebalance_corpus==False and dataset=="test":
data_file = test_data_file

with open(data_file, "at") as f_p:
current_path = data_path / "aclImdb" / dataset / label
for file_name in current_path.iterdir():
if file_name.is_file() and file_name.name.endswith(
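In practice the rebalancing switch decides which files are written and later read. A minimal usage sketch (assuming this constructor belongs to flair's IMDB corpus class, as the aclImdb download suggests):

    from flair.datasets import IMDB

    # rebalanced variant: train and test portions end up in a single train.txt
    corpus = IMDB(rebalance_corpus=True)

    # original split kept: train.txt and test.txt are written separately
    corpus_split = IMDB(rebalance_corpus=False)
    print(corpus_split)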
48 changes: 38 additions & 10 deletions flair/datasets/sequence_labeling.py
@@ -428,6 +428,7 @@ def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
+       entity_linking: bool = False,
        in_memory: bool = True,
        **corpusargs,
    ):
@@ -436,6 +437,7 @@ def __init__(
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the eng.testa, .testb, .train
        files in a folder called 'conll_03'. Then set the base_path parameter in the constructor to the path to the
        parent directory where the conll_03 folder resides.
+       If using entity linking, the CoNLL-03 dataset is reduced by about 20 documents that are not part of the YAGO dataset.
        :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
        POS tags or chunks respectively
@@ -446,16 +448,31 @@ def __init__(
        base_path: Path = Path(base_path)

        # column format
-       columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
+       if not entity_linking:
+           columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
+       else:
+           columns = {0: "text", 1: "pos", 2: "np", 3: "ner", 4: "tmp", 5: "entity",
+                      6: "normalised entity", 7: "link", 8: "tmp_nr", 9: "tmpLink"}

        # this dataset name
-       dataset_name = self.__class__.__name__.lower()
+       if entity_linking:
+           dataset_name = self.__class__.__name__.lower() + "-yago-reduced"
+       else:
+           dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

+       # download the entity linking version of the data if necessary
+       if entity_linking:
+           conll_yago_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/conll_entity_linking/"
+           cached_path(f"{conll_yago_path}combinedENG.testa", Path("datasets") / dataset_name)
+           cached_path(f"{conll_yago_path}combinedENG.testb", Path("datasets") / dataset_name)
+           cached_path(f"{conll_yago_path}combinedENG.train", Path("datasets") / dataset_name)

        # check if data there
        if not data_folder.exists():
            log.warning("-" * 100)
@@ -465,14 +482,25 @@ def __init__(
            )
        log.warning("-" * 100)

-       super(CONLL_03, self).__init__(
-           data_folder,
-           columns,
-           tag_to_bioes=tag_to_bioes,
-           in_memory=in_memory,
-           document_separator_token="-DOCSTART-",
-           **corpusargs,
-       )
+       if entity_linking:
+           super(CONLL_03, self).__init__(
+               data_folder,
+               columns,
+               tag_to_bioes=tag_to_bioes,
+               column_delimiter='\t',
+               in_memory=in_memory,
+               document_separator_token="-DOCSTART-",
+               **corpusargs,
+           )
+       else:
+           super(CONLL_03, self).__init__(
+               data_folder,
+               columns,
+               tag_to_bioes=tag_to_bioes,
+               in_memory=in_memory,
+               document_separator_token="-DOCSTART-",
+               **corpusargs,
+           )


class CONLL_03_GERMAN(ColumnCorpus):
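A minimal usage sketch of the new flag (the base_path value is hypothetical; the standard corpus must still be obtained manually as described in the docstring, while the entity linking files are downloaded automatically):

    from flair.datasets import CONLL_03

    # standard 4-column NER corpus from manually downloaded data
    corpus = CONLL_03(base_path='resources/tasks')

    # reduced corpus with YAGO entity links in a tab-separated 10-column format
    corpus_el = CONLL_03(entity_linking=True)
    print(corpus_el)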
2 changes: 1 addition & 1 deletion flair/models/sequence_tagger_model.py
@@ -2,7 +2,7 @@
import sys

from pathlib import Path
-from typing import List, Union, Optional, Dict
+from typing import List, Union, Optional, Dict, Tuple
from warnings import warn

import numpy as np
111 changes: 102 additions & 9 deletions flair/trainers/trainer.py
@@ -67,6 +67,91 @@ def __init__(
self.tensorboard_log_dir = tensorboard_log_dir
self.metrics_for_tensorboard = metrics_for_tensorboard

+    def initialize_best_dev_score(self, log_dev):
+        """
+        Initialize the best score the model has seen so far.
+        The score is the loss if we don't have dev data and main_score_type otherwise.
+        :param log_dev: whether dev data is available
+        """
+        if log_dev:
+            # assume that the score used on the dev set should be maximized and is >= 0
+            self.score_mode_for_best_model_saving = "max"
+            self.best_dev_score_seen = 0
+        else:
+            self.score_mode_for_best_model_saving = "min"
+            self.best_dev_score_seen = float("inf")
+
+    def check_for_best_score(self, score_value_for_best_model_saving):
+        """
+        Check whether score_value_for_best_model_saving is better than the best score the trainer has seen so far.
+        The score is the loss if we don't have dev data and main_score_type otherwise.
+        :param score_value_for_best_model_saving: the current epoch score
+        :return: boolean indicating whether the passed score is better than the best score the trainer has seen so far
+        """
+        if self.score_mode_for_best_model_saving == "max":
+            if self.best_dev_score_seen < score_value_for_best_model_saving:
+                found_best_model = True
+                self.best_dev_score_seen = score_value_for_best_model_saving
+            else:
+                found_best_model = False
+        else:
+            if self.best_dev_score_seen > score_value_for_best_model_saving:
+                found_best_model = True
+                self.best_dev_score_seen = score_value_for_best_model_saving
+            else:
+                found_best_model = False
+        return found_best_model
+
+    def save_best_model(self, base_path, save_checkpoint):
+        # delete the previous best model
+        previous_best_path = self.get_best_model_path(base_path)
+        if os.path.exists(previous_best_path):
+            os.remove(previous_best_path)
+        if save_checkpoint:
+            best_checkpoint_path = previous_best_path.replace("model", "checkpoint")
+            if os.path.exists(best_checkpoint_path):
+                os.remove(best_checkpoint_path)
+        # save the current best model
+        self.model.save(base_path / f"best-model_epoch{self.epoch}.pt")
+        if save_checkpoint:
+            self.save_checkpoint(base_path / f"best-checkpoint_epoch{self.epoch}.pt")
+
+    @staticmethod
+    def check_for_and_delete_previous_best_models(base_path, save_checkpoint):
+        all_best_model_names = [filename for filename in os.listdir(base_path)
+                                if filename.startswith("best-model_epoch")]
+        if len(all_best_model_names) != 0:
+            warnings.warn(
+                "There should be no best model saved at epoch 1, unless the training folder contains models from a previous training run. All previous best models will be deleted.")
+            for single_model in all_best_model_names:
+                previous_best_path = os.path.join(base_path, single_model)
+                if os.path.exists(previous_best_path):
+                    os.remove(previous_best_path)
+                if save_checkpoint:
+                    best_checkpoint_path = previous_best_path.replace("model", "checkpoint")
+                    if os.path.exists(best_checkpoint_path):
+                        os.remove(best_checkpoint_path)
+
+    def get_best_model_path(self, base_path, check_model_existence=False):
+        all_best_model_names = [filename for filename in os.listdir(base_path)
+                                if filename.startswith("best-model_epoch")]
+        if check_model_existence:
+            if len(all_best_model_names) > 0:
+                assert len(all_best_model_names) == 1, "There should be at most one best model saved at any time."
+                return os.path.join(base_path, all_best_model_names[0])
+            else:
+                return ""
+        else:
+            if self.epoch > 1:
+                assert len(all_best_model_names) == 1, "There should be exactly one best model saved at any epoch > 1."
+                return os.path.join(base_path, all_best_model_names[0])
+            else:
+                assert len(all_best_model_names) == 0, "There should be no best model saved at epoch 1."
+                return ""

def train(
self,
base_path: Union[Path, str],
@@ -102,6 +187,7 @@ def train(
        save_model_each_k_epochs: int = 0,
        classification_main_metric=("micro avg", 'f1-score'),
        tensorboard_comment='',
+       save_best_checkpoints: bool = False,
        **kwargs,
) -> dict:
"""
@@ -140,6 +226,7 @@ def train(
        :param save_model_each_k_epochs: Each save_model_each_k_epochs'th epoch the thus far trained model will be saved
        :param classification_main_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model
        :param tensorboard_comment: Comment to use for tensorboard logging
+       :param save_best_checkpoints: If True, the checkpoint corresponding to the best model is saved in addition to the best model itself
        :param kwargs: Other arguments for the Optimizer
        :return:
"""
@@ -152,10 +239,12 @@
        if self.use_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter

+               if self.tensorboard_log_dir is not None and not os.path.exists(self.tensorboard_log_dir):
+                   os.mkdir(self.tensorboard_log_dir)
                writer = SummaryWriter(log_dir=self.tensorboard_log_dir, comment=tensorboard_comment)
+               log.info(f"tensorboard logging path is {self.tensorboard_log_dir}")

            except:
                log_line(log)
                log.warning(
@@ -211,6 +300,9 @@ def train(
log_line(log)
log.warning(f'WARNING: Specified class weights will not take effect when using CRF')

+       # check for previously saved best models in the current training folder and delete them
+       self.check_for_and_delete_previous_best_models(base_path, save_best_checkpoints)
+
# determine what splits (train, dev, test) to evaluate and log
log_train = True if monitor_train else False
log_test = (
@@ -219,6 +311,7 @@
else False
)
log_dev = False if train_with_dev or not self.corpus.dev else True
+       self.initialize_best_dev_score(log_dev)
log_train_part = (
True
if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0)
@@ -338,12 +431,12 @@
                if (
                    (anneal_with_restarts or anneal_with_prestarts)
                    and learning_rate != previous_learning_rate
-                   and (base_path / "best-model.pt").exists()
+                   and os.path.exists(self.get_best_model_path(base_path))
                ):
                    if anneal_with_restarts:
                        log.info("resetting to best model")
                        self.model.load_state_dict(
-                           self.model.load(base_path / "best-model.pt").state_dict()
+                           self.model.load(self.get_best_model_path(base_path)).state_dict()
                        )
                    if anneal_with_prestarts:
                        log.info("resetting to pre-best model")
@@ -629,12 +722,10 @@
                    if (
                        (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
                        and not param_selection_mode
                        and not isinstance(lr_scheduler, OneCycleLR)
-                       and current_score == lr_scheduler.best
-                       and bad_epochs == 0
+                       and self.check_for_best_score(current_score)
                    ):
-                       print("saving best model")
-                       self.model.save(base_path / "best-model.pt")
+                       self.save_best_model(base_path, save_checkpoint=save_best_checkpoints)

if anneal_with_prestarts:
current_state_dict = self.model.state_dict()
@@ -701,12 +792,14 @@ def final_test(
        base_path = Path(base_path)

        log_line(log)
-       log.info("Testing using best model ...")

        self.model.eval()

-       if (base_path / "best-model.pt").exists():
-           self.model = self.model.load(base_path / "best-model.pt")
+       best_model_path = self.get_best_model_path(base_path, check_model_existence=True)
+       if os.path.exists(best_model_path):
+           log.info("Testing using best model ...")
+           self.model = self.model.load(best_model_path)
+       else:
+           log.info("Testing using last state of model ...")

test_results, test_loss = self.model.evaluate(
self.corpus.test,
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
python-dateutil>=2.6.1
-torch>=1.5.0
+torch>=1.5.0,<1.8.0
gensim>=3.4.0,<=3.8.3
tqdm>=4.26.0
segtok>=1.5.7
2 changes: 1 addition & 1 deletion resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
@@ -65,7 +65,7 @@ The following embeddings are currently supported:
| 'pl' | Polish | Polish FastText embeddings |
| 'cz' | Czech | Czech FastText embeddings |
| 'sk' | Slovak | Slovak FastText embeddings |
-| 'si' | Slovenian | Slovenian FastText embeddings |
+| 'sl' | Slovenian | Slovenian FastText embeddings |
| 'sr' | Serbian | Serbian FastText embeddings |
| 'hr' | Croatian | Croatian FastText embeddings |
| 'bg' | Bulgarian | Bulgarian FastText embeddings |
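The corrected code can be verified with the usual classic-embeddings workflow, a minimal sketch (the example sentence is arbitrary):

    from flair.data import Sentence
    from flair.embeddings import WordEmbeddings

    # 'sl' now resolves to the Slovenian FastText embeddings
    embedding = WordEmbeddings('sl')
    sentence = Sentence('Ljubljana je glavno mesto Slovenije .')
    embedding.embed(sentence)
    print(sentence[0].embedding.shape)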
