
Merge branch 'master' into GH-2120-imporved-tensorboard-logging
alanakbik committed Mar 23, 2021
2 parents 38eb8e6 + 8b93b40 commit d53e013
Showing 6 changed files with 159 additions and 30 deletions.
24 changes: 16 additions & 8 deletions flair/datasets/document_classification.py
@@ -745,21 +745,25 @@ def __init__(self,
        base_path: Path = Path(base_path)

        # this dataset name
-       dataset_name = self.__class__.__name__.lower() + '_v2'
-
-       if rebalance_corpus:
-           dataset_name = dataset_name + '-rebalanced'
+       dataset_name = self.__class__.__name__.lower() + '_v4'

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        imdb_acl_path = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

+       if rebalance_corpus:
+           dataset_name = dataset_name + '-rebalanced'
+           data_folder = base_path / dataset_name
        data_path = Path(flair.cache_root) / "datasets" / dataset_name
-       data_file = data_path / "train.txt"
-       if not data_file.is_file():
+       train_data_file = data_path / "train.txt"
+       test_data_file = data_path / "test.txt"
+
+       if not train_data_file.is_file() or (not rebalance_corpus and not test_data_file.is_file()):
+           for file_path in [train_data_file, test_data_file]:
+               if file_path.is_file():
+                   os.remove(file_path)
+
            cached_path(imdb_acl_path, Path("datasets") / dataset_name)
            import tarfile

@@ -783,7 +787,11 @@ def __init__(self,
if f"{dataset}/{label}" in m.name
],
)
with open(f"{data_path}/train-all.txt", "at") as f_p:
data_file = train_data_file
if rebalance_corpus==False and dataset=="test":
data_file = test_data_file

with open(data_file, "at") as f_p:
current_path = data_path / "aclImdb" / dataset / label
for file_name in current_path.iterdir():
if file_name.is_file() and file_name.name.endswith(
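In practice the rebalancing switch decides which files are written and later read. A minimal usage sketch (assuming this constructor belongs to flair's IMDB corpus class, as the aclImdb download suggests):

    from flair.datasets import IMDB

    # rebalanced variant: train and test portions end up in a single train.txt
    corpus = IMDB(rebalance_corpus=True)

    # original split kept: train.txt and test.txt are written separately
    corpus_split = IMDB(rebalance_corpus=False)
    print(corpus_split)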
48 changes: 38 additions & 10 deletions flair/datasets/sequence_labeling.py
@@ -428,6 +428,7 @@ def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
+       entity_linking: bool = False,
        in_memory: bool = True,
        **corpusargs,
    ):
@@ -436,6 +437,7 @@ def __init__(
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the eng.testa, .testb, .train
        files in a folder called 'conll_03'. Then set the base_path parameter in the constructor to the path to the
        parent directory where the conll_03 folder resides.
+       If using entity linking, the CoNLL-03 dataset is reduced by about 20 documents that are not part of the YAGO dataset.
        :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
        POS tags or chunks respectively
@@ -446,16 +448,31 @@ def __init__(
        base_path: Path = Path(base_path)

        # column format
-       columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
+       if not entity_linking:
+           columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
+       else:
+           columns = {0: "text", 1: "pos", 2: "np", 3: "ner", 4: "tmp", 5: "entity",
+                      6: "normalised entity", 7: "link", 8: "tmp_nr", 9: "tmpLink"}

        # this dataset name
-       dataset_name = self.__class__.__name__.lower()
+       if entity_linking:
+           dataset_name = self.__class__.__name__.lower() + "-yago-reduced"
+       else:
+           dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

+       # download the entity linking version of the data if necessary
+       if entity_linking:
+           conll_yago_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/conll_entity_linking/"
+           cached_path(f"{conll_yago_path}combinedENG.testa", Path("datasets") / dataset_name)
+           cached_path(f"{conll_yago_path}combinedENG.testb", Path("datasets") / dataset_name)
+           cached_path(f"{conll_yago_path}combinedENG.train", Path("datasets") / dataset_name)

        # check if data there
        if not data_folder.exists():
            log.warning("-" * 100)
@@ -465,14 +482,25 @@ def __init__(
            )
        log.warning("-" * 100)

-       super(CONLL_03, self).__init__(
-           data_folder,
-           columns,
-           tag_to_bioes=tag_to_bioes,
-           in_memory=in_memory,
-           document_separator_token="-DOCSTART-",
-           **corpusargs,
-       )
+       if entity_linking:
+           super(CONLL_03, self).__init__(
+               data_folder,
+               columns,
+               tag_to_bioes=tag_to_bioes,
+               column_delimiter='\t',
+               in_memory=in_memory,
+               document_separator_token="-DOCSTART-",
+               **corpusargs,
+           )
+       else:
+           super(CONLL_03, self).__init__(
+               data_folder,
+               columns,
+               tag_to_bioes=tag_to_bioes,
+               in_memory=in_memory,
+               document_separator_token="-DOCSTART-",
+               **corpusargs,
+           )


class CONLL_03_GERMAN(ColumnCorpus):
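A minimal usage sketch of the new flag (the base_path value is hypothetical; the standard corpus must still be obtained manually as described in the docstring, while the entity linking files are downloaded automatically):

    from flair.datasets import CONLL_03

    # standard 4-column NER corpus from manually downloaded data
    corpus = CONLL_03(base_path='resources/tasks')

    # reduced corpus with YAGO entity links in a tab-separated 10-column format
    corpus_el = CONLL_03(entity_linking=True)
    print(corpus_el)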
2 changes: 1 addition & 1 deletion flair/models/sequence_tagger_model.py
@@ -2,7 +2,7 @@
import sys

from pathlib import Path
-from typing import List, Union, Optional, Dict
+from typing import List, Union, Optional, Dict, Tuple
from warnings import warn

import numpy as np
111 changes: 102 additions & 9 deletions flair/trainers/trainer.py
@@ -67,6 +67,91 @@ def __init__(
self.tensorboard_log_dir = tensorboard_log_dir
self.metrics_for_tensorboard = metrics_for_tensorboard

+    def initialize_best_dev_score(self, log_dev):
+        """
+        Initialize the best score the model has seen so far.
+        The score is the loss if we don't have dev data and main_score_type otherwise.
+        :param log_dev: whether dev data is available
+        """
+        if log_dev:
+            # assume that the score used on the dev set should be maximized and is >= 0
+            self.score_mode_for_best_model_saving = "max"
+            self.best_dev_score_seen = 0
+        else:
+            self.score_mode_for_best_model_saving = "min"
+            self.best_dev_score_seen = float("inf")
+
+    def check_for_best_score(self, score_value_for_best_model_saving):
+        """
+        Check whether score_value_for_best_model_saving is better than the best score the trainer has seen so far.
+        The score is the loss if we don't have dev data and main_score_type otherwise.
+        :param score_value_for_best_model_saving: the current epoch score
+        :return: boolean indicating whether the passed score is better than the best score the trainer has seen so far
+        """
+        if self.score_mode_for_best_model_saving == "max":
+            if self.best_dev_score_seen < score_value_for_best_model_saving:
+                found_best_model = True
+                self.best_dev_score_seen = score_value_for_best_model_saving
+            else:
+                found_best_model = False
+        else:
+            if self.best_dev_score_seen > score_value_for_best_model_saving:
+                found_best_model = True
+                self.best_dev_score_seen = score_value_for_best_model_saving
+            else:
+                found_best_model = False
+        return found_best_model
+
+    def save_best_model(self, base_path, save_checkpoint):
+        # delete the previous best model
+        previous_best_path = self.get_best_model_path(base_path)
+        if os.path.exists(previous_best_path):
+            os.remove(previous_best_path)
+        if save_checkpoint:
+            best_checkpoint_path = previous_best_path.replace("model", "checkpoint")
+            if os.path.exists(best_checkpoint_path):
+                os.remove(best_checkpoint_path)
+        # save the current best model
+        self.model.save(base_path / f"best-model_epoch{self.epoch}.pt")
+        if save_checkpoint:
+            self.save_checkpoint(base_path / f"best-checkpoint_epoch{self.epoch}.pt")
+
+    @staticmethod
+    def check_for_and_delete_previous_best_models(base_path, save_checkpoint):
+        all_best_model_names = [filename for filename in os.listdir(base_path)
+                                if filename.startswith("best-model_epoch")]
+        if len(all_best_model_names) != 0:
+            warnings.warn(
+                "There should be no best model saved at epoch 1, unless the training folder contains models from a previous training run. All previous best models will be deleted.")
+            for single_model in all_best_model_names:
+                previous_best_path = os.path.join(base_path, single_model)
+                if os.path.exists(previous_best_path):
+                    os.remove(previous_best_path)
+                if save_checkpoint:
+                    best_checkpoint_path = previous_best_path.replace("model", "checkpoint")
+                    if os.path.exists(best_checkpoint_path):
+                        os.remove(best_checkpoint_path)
+
+    def get_best_model_path(self, base_path, check_model_existence=False):
+        all_best_model_names = [filename for filename in os.listdir(base_path)
+                                if filename.startswith("best-model_epoch")]
+        if check_model_existence:
+            if len(all_best_model_names) > 0:
+                assert len(all_best_model_names) == 1, "There should be at most one best model saved at any time."
+                return os.path.join(base_path, all_best_model_names[0])
+            else:
+                return ""
+        else:
+            if self.epoch > 1:
+                assert len(all_best_model_names) == 1, "There should be exactly one best model saved at any epoch > 1."
+                return os.path.join(base_path, all_best_model_names[0])
+            else:
+                assert len(all_best_model_names) == 0, "There should be no best model saved at epoch 1."
+                return ""

def train(
self,
base_path: Union[Path, str],
@@ -102,6 +187,7 @@ def train(
        save_model_each_k_epochs: int = 0,
        classification_main_metric=("micro avg", 'f1-score'),
        tensorboard_comment='',
+       save_best_checkpoints: bool = False,
        **kwargs,
) -> dict:
"""
@@ -140,6 +226,7 @@ def train(
        :param save_model_each_k_epochs: Each save_model_each_k_epochs'th epoch the thus far trained model will be saved
        :param classification_main_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model
        :param tensorboard_comment: Comment to use for tensorboard logging
+       :param save_best_checkpoints: If True, the checkpoint corresponding to the best model is saved in addition to the best model itself
        :param kwargs: Other arguments for the Optimizer
        :return:
"""
@@ -152,10 +239,12 @@
        if self.use_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter

+               if self.tensorboard_log_dir is not None and not os.path.exists(self.tensorboard_log_dir):
+                   os.mkdir(self.tensorboard_log_dir)
                writer = SummaryWriter(log_dir=self.tensorboard_log_dir, comment=tensorboard_comment)
+               log.info(f"tensorboard logging path is {self.tensorboard_log_dir}")

            except:
                log_line(log)
                log.warning(
@@ -211,6 +300,9 @@ def train(
log_line(log)
log.warning(f'WARNING: Specified class weights will not take effect when using CRF')

+       # check for previously saved best models in the current training folder and delete them
+       self.check_for_and_delete_previous_best_models(base_path, save_best_checkpoints)
+
# determine what splits (train, dev, test) to evaluate and log
log_train = True if monitor_train else False
log_test = (
@@ -219,6 +311,7 @@
else False
)
log_dev = False if train_with_dev or not self.corpus.dev else True
+       self.initialize_best_dev_score(log_dev)
log_train_part = (
True
if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0)
@@ -338,12 +431,12 @@
                if (
                    (anneal_with_restarts or anneal_with_prestarts)
                    and learning_rate != previous_learning_rate
-                   and (base_path / "best-model.pt").exists()
+                   and os.path.exists(self.get_best_model_path(base_path))
                ):
                    if anneal_with_restarts:
                        log.info("resetting to best model")
                        self.model.load_state_dict(
-                           self.model.load(base_path / "best-model.pt").state_dict()
+                           self.model.load(self.get_best_model_path(base_path)).state_dict()
                        )
                    if anneal_with_prestarts:
                        log.info("resetting to pre-best model")
@@ -629,12 +722,10 @@
                    if (
                        (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
                        and not param_selection_mode
                        and not isinstance(lr_scheduler, OneCycleLR)
-                       and current_score == lr_scheduler.best
-                       and bad_epochs == 0
+                       and self.check_for_best_score(current_score)
                    ):
-                       print("saving best model")
-                       self.model.save(base_path / "best-model.pt")
+                       self.save_best_model(base_path, save_checkpoint=save_best_checkpoints)

if anneal_with_prestarts:
current_state_dict = self.model.state_dict()
@@ -701,12 +792,14 @@ def final_test(
        base_path = Path(base_path)

        log_line(log)
-       log.info("Testing using best model ...")

        self.model.eval()

-       if (base_path / "best-model.pt").exists():
-           self.model = self.model.load(base_path / "best-model.pt")
+       best_model_path = self.get_best_model_path(base_path, check_model_existence=True)
+       if os.path.exists(best_model_path):
+           log.info("Testing using best model ...")
+           self.model = self.model.load(best_model_path)
+       else:
+           log.info("Testing using last state of model ...")

test_results, test_loss = self.model.evaluate(
self.corpus.test,
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
python-dateutil>=2.6.1
-torch>=1.5.0
+torch>=1.5.0,<1.8.0
gensim>=3.4.0,<=3.8.3
tqdm>=4.26.0
segtok>=1.5.7
2 changes: 1 addition & 1 deletion resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
@@ -65,7 +65,7 @@ The following embeddings are currently supported:
| 'pl' | Polish | Polish FastText embeddings |
| 'cz' | Czech | Czech FastText embeddings |
| 'sk' | Slovak | Slovak FastText embeddings |
-| 'si' | Slovenian | Slovenian FastText embeddings |
+| 'sl' | Slovenian | Slovenian FastText embeddings |
| 'sr' | Serbian | Serbian FastText embeddings |
| 'hr' | Croatian | Croatian FastText embeddings |
| 'bg' | Bulgarian | Bulgarian FastText embeddings |
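The corrected code can be verified with the usual classic-embeddings workflow, a minimal sketch (the example sentence is arbitrary):

    from flair.data import Sentence
    from flair.embeddings import WordEmbeddings

    # 'sl' now resolves to the Slovenian FastText embeddings
    embedding = WordEmbeddings('sl')
    sentence = Sentence('Ljubljana je glavno mesto Slovenije .')
    embedding.embed(sentence)
    print(sentence[0].embedding.shape)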
