Skip to content

Commit

Permalink
Fix/missing truncation bug (#679)
Browse files Browse the repository at this point in the history
* Added whitespace stripping to input validation
* Added verbose=False flag to the tokenizer's encoder call
  • Loading branch information
Julian Risch committed Jan 7, 2021
1 parent 1f046ca commit 42af265
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion farm/data_handler/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2101,7 +2101,7 @@ def _convert_answers(self, baskets):
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
label_idxs[i][1] = -100
break # Break loop around answers, so the error message is not shown multiple times
elif answer_indices != answer_text.strip():
elif answer_indices.strip() != answer_text.strip():
logger.warning(f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n
Example will not be converted for training/evaluation.""")
error_in_answer = True
Expand Down
2 changes: 1 addition & 1 deletion farm/modeling/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ def tokenize_batch_question_answering(pre_baskets, tokenizer, indices):
baskets = []
# # Tokenize texts in batch mode
texts = [d["context"] for d in pre_baskets]
tokenized_docs_batch = tokenizer.batch_encode_plus(texts, return_offsets_mapping=True, return_special_tokens_mask=True, add_special_tokens=False)
tokenized_docs_batch = tokenizer.batch_encode_plus(texts, return_offsets_mapping=True, return_special_tokens_mask=True, add_special_tokens=False, verbose=False)

# Extract relevant data
tokenids_batch = tokenized_docs_batch["input_ids"]
Expand Down

0 comments on commit 42af265

Please sign in to comment.