In [1]:
from transformers import T5Tokenizer
import json

In [2]:
# Load the tokenizer that belongs to the 'google/mt5-xl' checkpoint.
# legacy=False opts out of the tokenizer's legacy mode; see the transformers
# T5Tokenizer documentation for what that changes.
tokenizer = T5Tokenizer.from_pretrained(
    'google/mt5-xl',
    legacy=False,
)

In [6]:
# Example input: a question followed by <pad>-delimited extra fields (what look
# like POS tags and a wd_-prefixed entity id). Tokenizing it shows how the
# tokenizer splits each part of the input.
sample_query = "After whom is the Riemannian geometry named? <pad> <pad> <pad>SCONJ PRON AUX DET ADJ NOUN VERB PUNCT <pad> wd_Q761383 <pad> <pad> "
sample_tokens = tokenizer.tokenize(sample_query)
# A bare last expression makes the notebook display the token list.
sample_tokens

['▁After',
 '▁who',
 'm',
 '▁is',
 '▁the',
 '▁Rie',
 'manni',
 'an',
 '▁',
 'geometry',
 '▁',
 'named',
 '?',
 '<pad>',
 '<pad>',
 '<pad>',
 'SCO',
 'NJ',
 '▁PRO',
 'N',
 '▁',
 'AUX',
 '▁DET',
 '▁A',
 'DJ',
 '▁NO',
 'UN',
 '▁VER',
 'B',
 '▁PUN',
 'CT',
 '<pad>',
 'w',
 'd',
 '_',
 'Q',
 '76',
 '1383',
 '<pad>',
 '<pad>']

**Token analysis for datasets**

In [3]:
# Dataset locations, relative to the notebook's working directory.
# Insertion order matters: the scans below iterate over .values() in this order.

# QALD files (QALD-9-plus train/test and QALD-10).
qald_data_paths = {
    "qald9plus_train_path": "../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata.json",
    "qald9plus_test_path": "../datasets/qald9plus/wikidata/qald_9_plus_test_wikidata.json",
    "qald10_train_path": "../datasets/qald10/qald_10.json",
}

# LC-QuAD 2 train/test files.
lcquad_data_paths = {
    "lcquad2_train_path": "../datasets/lcquad2/train.json",
    "lcquad2_test_path": "../datasets/lcquad2/test.json",
}

In [4]:
# Find the longest question, measured in tokenizer tokens, across the QALD files.
# The count comes from tokenizer.tokenize(); anything the tokenizer adds only
# when encoding model inputs is not included -- keep that in mind when turning
# this number into a max_length setting.
qald_max_length = 0
max_question_string = ''
max_file = ''
for file in qald_data_paths.values():
    # Read as UTF-8 explicitly: the questions contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file, 'r', encoding='utf-8') as fp:
        qald_obj = json.load(fp)
    for question_obj in qald_obj['questions']:
        # Each entry's 'question' field is a list of objects carrying a
        # 'string' value; every one of them is measured.
        for q_pair in question_obj['question']:
            q_str = q_pair['string']
            cur_len = len(tokenizer.tokenize(q_str))
            # Strict '>' keeps the first question seen when lengths tie.
            if cur_len > qald_max_length:
                qald_max_length = cur_len
                max_question_string = q_str
                max_file = file

print('Maximum token length:', qald_max_length)
print('question:', max_question_string)
print('File:', max_file)

Maximum token length: 47
question: Манхэттенский ҡатнашыусыларҙың проекттары һәм донъяла билдәле ғалим булараҡ нобель премияһына лайыҡ булыусылар ниндәй?
File: ../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata.json


In [5]:
# Find the longest question, measured in tokenizer tokens, across the LC-QuAD files.
# The count comes from tokenizer.tokenize(); anything the tokenizer adds only
# when encoding model inputs is not included.
lcquad_max_length = 0
max_question_string = ''
max_file = ''
for file in lcquad_data_paths.values():
    # Read as UTF-8 explicitly: the questions contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file, 'r', encoding='utf-8') as fp:
        lcquad_obj = json.load(fp)
    # Each file is iterated directly as a sequence of question records.
    for question_obj in lcquad_obj:
        # The 'NNQT_question' field of each record is the text measured here.
        q_str = question_obj['NNQT_question']
        cur_len = len(tokenizer.tokenize(q_str))
        # Strict '>' keeps the first question seen when lengths tie.
        if cur_len > lcquad_max_length:
            lcquad_max_length = cur_len
            max_question_string = q_str
            max_file = file

print('Maximum token length:', lcquad_max_length)
print('question:', max_question_string)
print('File:', max_file)

Maximum token length: 101
question: What is {safety classification and labelling} of {water}, that has {original title} is {Europa-Parlamentets og Rådets forordning (EF) nr. 1272/2008 af 16. december 2008 om klassificering, mærkning og emballering af stoffer og blandinger og om ændring og ophævelse af direktiv 67/548/EØF og 1999/45/EF og om ændring af forordning (EF) nr. 1907/2006} ?
File: ../datasets/lcquad2/train.json
