Skip to content

Commit

Permalink
Fix/missing truncation bug (#679)
Browse files Browse the repository at this point in the history
* Added whitespace stripping to input validation
* Added verbose=False flag to the tokenizer's encoder call
  • Loading branch information
Julian Risch committed Jan 7, 2021
1 parent 1f046ca commit 42af265
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion farm/data_handler/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2101,7 +2101,7 @@ def _convert_answers(self, baskets):
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
label_idxs[i][1] = -100
break # Break loop around answers, so the error message is not shown multiple times
elif answer_indices != answer_text.strip():
elif answer_indices.strip() != answer_text.strip():
logger.warning(f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n
Example will not be converted for training/evaluation.""")
error_in_answer = True
Expand Down
2 changes: 1 addition & 1 deletion farm/modeling/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ def tokenize_batch_question_answering(pre_baskets, tokenizer, indices):
baskets = []
# # Tokenize texts in batch mode
texts = [d["context"] for d in pre_baskets]
tokenized_docs_batch = tokenizer.batch_encode_plus(texts, return_offsets_mapping=True, return_special_tokens_mask=True, add_special_tokens=False)
tokenized_docs_batch = tokenizer.batch_encode_plus(texts, return_offsets_mapping=True, return_special_tokens_mask=True, add_special_tokens=False, verbose=False)

# Extract relevant data
tokenids_batch = tokenized_docs_batch["input_ids"]
Expand Down

0 comments on commit 42af265

Please sign in to comment.