Small refactorings in ModelTrainer and SequenceTagger evaluation logic #2184

Merged · 3 commits · Mar 24, 2021
63 changes: 33 additions & 30 deletions flair/models/sequence_tagger_model.py
@@ -408,6 +408,10 @@ def _requires_span_F1_evaluation(self) -> bool:
         for item in self.tag_dictionary.get_items():
             if item.startswith('B-'):
                 span_F1 = True
+            if item == 'O':
+                span_F1 = True
+            if item == '':
+                span_F1 = True
         return span_F1
 
     def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode, mini_batch_size, out_path):
@@ -511,25 +515,7 @@ def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode, mini_batch
 
         return result, eval_loss
 
-    def evaluate(
-            self,
-            sentences: Union[List[Sentence], Dataset],
-            out_path: Union[str, Path] = None,
-            embedding_storage_mode: str = "none",
-            mini_batch_size: int = 32,
-            num_workers: int = 8,
-            wsd_evaluation: bool = False,
-            **kwargs
-    ) -> (Result, float):
-
-        # read Dataset into data loader (if list of sentences passed, make Dataset first)
-        if not isinstance(sentences, Dataset):
-            sentences = SentenceDataset(sentences)
-        data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)
-
-        # if span F1 needs to be used, use separate eval method
-        if self._requires_span_F1_evaluation() and not wsd_evaluation:
-            return self._evaluate_with_span_F1(data_loader, embedding_storage_mode, mini_batch_size, out_path)
+    def _evaluate_with_regular_F1(self, data_loader, embedding_storage_mode, mini_batch_size, out_path):
 
         # else, use scikit-learn to evaluate
         y_true = []
@@ -560,13 +546,7 @@ def evaluate(
                         y_true.append(labels.add_item(gold_tag))
 
                         # add predicted tag
-                        if wsd_evaluation:
-                            if gold_tag == 'O':
-                                predicted_tag = 'O'
-                            else:
-                                predicted_tag = token.get_tag('predicted').value
-                        else:
-                            predicted_tag = token.get_tag('predicted').value
+                        predicted_tag = token.get_tag('predicted').value
 
                         y_pred.append(labels.add_item(predicted_tag))
 
@@ -605,7 +585,6 @@ def evaluate(
         classification_report = metrics.classification_report(y_true, y_pred, digits=4, target_names=target_names,
                                                                zero_division=1, labels=labels_to_report)
 
-
         # get scores
         micro_f_score = round(
             metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', labels=labels_to_report), 4)
@@ -633,6 +612,28 @@ def evaluate(
         )
         return result, eval_loss
 
+    def evaluate(
+            self,
+            sentences: Union[List[Sentence], Dataset],
+            out_path: Union[str, Path] = None,
+            embedding_storage_mode: str = "none",
+            mini_batch_size: int = 32,
+            num_workers: int = 8,
+            wsd_evaluation: bool = False,
+            **kwargs
+    ) -> (Result, float):
+
+        # read Dataset into data loader (if list of sentences passed, make Dataset first)
+        if not isinstance(sentences, Dataset):
+            sentences = SentenceDataset(sentences)
+        data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)
+
+        # depending on whether span F1 needs to be used, use separate eval method
+        if self._requires_span_F1_evaluation():
+            return self._evaluate_with_span_F1(data_loader, embedding_storage_mode, mini_batch_size, out_path)
+        else:
+            return self._evaluate_with_regular_F1(data_loader, embedding_storage_mode, mini_batch_size, out_path)
+
     def forward_loss(
         self, data_points: Union[List[Sentence], Sentence], sort=True
     ) -> torch.tensor:
@@ -1148,10 +1149,12 @@ def _fetch_model(model_name) -> str:
 
             # output information
             log.info("-" * 80)
-            log.info(f"The model key '{model_name}' now maps to 'https://huggingface.co/{hf_model_name}' on the HuggingFace ModelHub")
+            log.info(
+                f"The model key '{model_name}' now maps to 'https://huggingface.co/{hf_model_name}' on the HuggingFace ModelHub")
             log.info(f" - The most current version of the model is automatically downloaded from there.")
             if model_name in hu_model_map:
-                log.info(f" - (you can alternatively manually download the original model at {hu_model_map[model_name]})")
+                log.info(
+                    f" - (you can alternatively manually download the original model at {hu_model_map[model_name]})")
             log.info("-" * 80)
 
             # use mapped name instead
@@ -1229,7 +1232,7 @@ def _fetch_model(model_name) -> str:
                 log.error(f" -> Please check https://huggingface.co/models?filter=flair for all available models.")
                 log.error(f" -> Alternatively, point to a model file on your local drive.")
                 log.error("-" * 80)
-                Path(flair.cache_root / 'models' / model_folder).rmdir() # remove folder again if not valid
+                Path(flair.cache_root / 'models' / model_folder).rmdir()  # remove folder again if not valid
 
         return model_path
 
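For reviewers skimming the first file: the net effect of the sequence_tagger_model.py changes is that evaluate() becomes a thin dispatcher over two private helpers, _evaluate_with_span_F1 and _evaluate_with_regular_F1, with the wsd_evaluation special case dropped from the dispatch. Below is a minimal, runnable sketch of that pattern; the TaggerSketch class and its stand-in bodies are hypothetical, while the method names and the tag-set check mirror the diff above.

from typing import List, Tuple


class TaggerSketch:
    """Hypothetical stand-in for SequenceTagger, reduced to the dispatch logic."""

    def __init__(self, tag_items: List[str]):
        self.tag_items = tag_items

    def _requires_span_F1_evaluation(self) -> bool:
        # Mirrors the extended check above: span F1 applies when the tag set
        # contains a 'B-' prefixed tag, an 'O' tag, or an empty tag.
        span_F1 = False
        for item in self.tag_items:
            if item.startswith('B-'):
                span_F1 = True
            if item == 'O':
                span_F1 = True
            if item == '':
                span_F1 = True
        return span_F1

    def _evaluate_with_span_F1(self) -> Tuple[str, float]:
        return "span F1 result", 0.0  # stand-in body

    def _evaluate_with_regular_F1(self) -> Tuple[str, float]:
        return "token-level F1 result", 0.0  # stand-in body

    def evaluate(self) -> Tuple[str, float]:
        # One dispatch point instead of a branch buried in a long method.
        if self._requires_span_F1_evaluation():
            return self._evaluate_with_span_F1()
        else:
            return self._evaluate_with_regular_F1()


print(TaggerSketch(['B-PER', 'I-PER', 'O']).evaluate())  # -> span F1 path
print(TaggerSketch(['NOUN', 'VERB', 'DET']).evaluate())  # -> regular F1 path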
45 changes: 23 additions & 22 deletions flair/trainers/trainer.py
@@ -61,7 +61,7 @@ def __init__(
         self.epoch: int = epoch
         self.use_tensorboard: bool = use_tensorboard
 
-    def initialize_best_dev_score(self,log_dev):
+    def initialize_best_dev_score(self, log_dev):
         """
         Initialize the best score the model has seen so far.
         The score is the loss if we don't have dev data and main_score_type otherwise.
@@ -75,24 +75,24 @@ def initialize_best_dev_score(self,log_dev):
             self.score_mode_for_best_model_saving = "min"
             self.best_dev_score_seen = 100000000000
 
-    def check_for_best_score(self,score_value_for_best_model_saving):
+    def check_for_best_score(self, score_value_for_best_model_saving):
         """
         Check whether score_value_for_best_model_saving is better than the best score the trainer has seen so far.
         The score is the loss if we don't have dev data and main_score_type otherwise.
         :param score_value_for_best_model_saving: The current epoch score
         :return: boolean indicating whether score_value_for_best_model_saving is better than the best score the trainer has seen so far
         """
 
-        if self.score_mode_for_best_model_saving=="max":
-            if self.best_dev_score_seen<score_value_for_best_model_saving:
+        if self.score_mode_for_best_model_saving == "max":
+            if self.best_dev_score_seen < score_value_for_best_model_saving:
                 found_best_model = True
-                self.best_dev_score_seen=score_value_for_best_model_saving
+                self.best_dev_score_seen = score_value_for_best_model_saving
             else:
                 found_best_model = False
         else:
-            if self.best_dev_score_seen>score_value_for_best_model_saving:
+            if self.best_dev_score_seen > score_value_for_best_model_saving:
                 found_best_model = True
-                self.best_dev_score_seen=score_value_for_best_model_saving
+                self.best_dev_score_seen = score_value_for_best_model_saving
             else:
                 found_best_model = False
         return found_best_model
@@ -131,7 +131,7 @@ def check_for_and_delete_previous_best_models(base_path, save_checkpoint):
 
     def get_best_model_path(self, base_path, check_model_existance=False):
         all_best_model_names = [filename for filename in os.listdir(base_path) if
-                            filename.startswith("best-model_epoch")]
+                                filename.startswith("best-model_epoch")]
         if check_model_existance:
             if len(all_best_model_names) > 0:
                 assert len(all_best_model_names) == 1, "There should be at most one best model saved at any time."
@@ -222,12 +222,9 @@ def train(
         :param kwargs: Other arguments for the Optimizer
         :return:
         """
-        if isinstance(self.model, TextClassifier):
-            self.main_score_type=classification_main_metric
-        else:
-            if classification_main_metric is not None:
-                warnings.warn("Specification of main score type only implemented for text classifier. Defaulting to main score type of selected model.")
-            self.main_score_type = None
+
+        main_score_type = classification_main_metric if isinstance(self.model, TextClassifier) else None
+
         if self.use_tensorboard:
             try:
                 from torch.utils.tensorboard import SummaryWriter
@@ -546,7 +543,7 @@ def train(
                         mini_batch_size=mini_batch_chunk_size,
                         num_workers=num_workers,
                         embedding_storage_mode=embeddings_storage_mode,
-                        main_score_type=self.main_score_type
+                        main_score_type=main_score_type
                     )
                     result_line += f"\t{train_eval_result.log_line}"
 
@@ -559,7 +556,7 @@ def train(
                         mini_batch_size=mini_batch_chunk_size,
                         num_workers=num_workers,
                         embedding_storage_mode=embeddings_storage_mode,
-                        main_score_type=self.main_score_type
+                        main_score_type=main_score_type
                     )
                     result_line += (
                         f"\t{train_part_loss}\t{train_part_eval_result.log_line}"
@@ -575,7 +572,7 @@ def train(
                         num_workers=num_workers,
                         out_path=base_path / "dev.tsv",
                         embedding_storage_mode=embeddings_storage_mode,
-                        main_score_type=self.main_score_type
+                        main_score_type=main_score_type
                     )
                     result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}"
                     log.info(
@@ -604,7 +601,7 @@ def train(
                         num_workers=num_workers,
                         out_path=base_path / "test.tsv",
                         embedding_storage_mode=embeddings_storage_mode,
-                        main_score_type=self.main_score_type
+                        main_score_type=main_score_type
                     )
                     result_line += f"\t{test_loss}\t{test_eval_result.log_line}"
                     log.info(
@@ -725,7 +722,7 @@ def train(
 
             # test best model if test data is present
             if self.corpus.test and not train_with_test:
-                final_score = self.final_test(base_path, mini_batch_chunk_size, num_workers)
+                final_score = self.final_test(base_path, mini_batch_chunk_size, num_workers, main_score_type)
             else:
                 final_score = 0
                 log.info("Test data not provided setting final score to 0")
@@ -755,7 +752,11 @@ def load_checkpoint(cls, checkpoint: Union[Path, str], corpus: Corpus):
         return model
 
     def final_test(
-        self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8
+        self,
+        base_path: Union[Path, str],
+        eval_mini_batch_size: int,
+        num_workers: int = 8,
+        main_score_type: str = None,
     ):
         if type(base_path) is str:
             base_path = Path(base_path)
@@ -776,7 +777,7 @@ def final_test(
             num_workers=num_workers,
             out_path=base_path / "test.tsv",
             embedding_storage_mode="none",
-            main_score_type=self.main_score_type
+            main_score_type=main_score_type
         )
 
         test_results: Result = test_results
@@ -795,7 +796,7 @@ def final_test(
                     num_workers=num_workers,
                     out_path=base_path / f"{subcorpus.name}-test.tsv",
                     embedding_storage_mode="none",
-                    main_score_type=self.main_score_type
+                    main_score_type=main_score_type
                 )
                 log.info(subcorpus.name)
                 log.info(subcorpus_results.log_line)
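And for trainer.py: the trainer no longer keeps self.main_score_type as instance state; train() computes a local value once and threads it explicitly through every evaluate() and final_test() call. A runnable sketch of that state-to-parameter change follows; TrainerSketch and the dummy TextClassifier are hypothetical stand-ins, while the isinstance guard and the parameter threading follow the diff.

class TextClassifier:
    """Hypothetical stand-in for flair.models.TextClassifier."""


class TrainerSketch:
    """Hypothetical stand-in for ModelTrainer, reduced to the score-type plumbing."""

    def __init__(self, model):
        self.model = model

    def train(self, classification_main_metric=None):
        # Computed once as a local: only text classifiers support a
        # configurable main metric; every other model falls back to None.
        main_score_type = classification_main_metric if isinstance(self.model, TextClassifier) else None
        return self.final_test(main_score_type=main_score_type)

    def final_test(self, main_score_type=None):
        # Receives the value explicitly instead of reading trainer state,
        # so it can also be called on its own with a chosen score type.
        return f"evaluating with main_score_type={main_score_type}"


print(TrainerSketch(TextClassifier()).train(("micro avg", "f1-score")))
print(TrainerSketch(object()).train(("micro avg", "f1-score")))  # -> None for non-classifiers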