Skip to content

Commit

Permalink
Fix logged error msg for FastTokenizer + QA (#541)
Browse files: browse the repository at this point in the history.
  • Loading branch information
tholor committed Sep 14, 2020
1 parent ce34cc2 commit f340a9e
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions farm/data_handler/input_features.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -438,12 +438,16 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks
return_special_tokens_mask=True,
return_token_type_ids=True)

if (len(encoded["input_ids"]) - encoded["special_tokens_mask"].count(1)) != \
(len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])):
logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
f"{len(encoded['input_ids']) - encoded['special_tokens_mask'].count(1)} tokens, which differs "
f"from number of tokens produced in tokenize_with_metadata(). \n"
f"Further processing is likely to be wrong.")
n_tokens_encoded = len(encoded["input_ids"]) - encoded["special_tokens_mask"].count(1)
n_tokens_with_metadata = len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])

if n_tokens_encoded != n_tokens_with_metadata:
tokens_encoded = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
logger.error(f"FastTokenizer encoded sample to {n_tokens_encoded} tokens,"
f" while the previous tokenize_with_metadata produced {n_tokens_with_metadata} tokens. \n"
f"Further processing is likely to be wrong.\n"
f"FastTokenizer: {tokens_encoded} \n"
f"tokenize_with_metadata: {sample.tokenized['question_tokens'] + sample.tokenized['passage_tokens']}")
else:
encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"],
text_pair=sample.tokenized["passage_tokens"],
Expand Down

0 comments on commit f340a9e

Please sign in to comment.