In [None]:
from transformers import T5Tokenizer
import json

In [None]:
# Load the tokenizer for the 'google/mt5-xl' checkpoint; it is used by the
# cells below to measure token lengths.
# NOTE(review): legacy=False presumably opts into the non-legacy handling of
# text that follows special tokens — confirm against the transformers docs.
tokenizer = T5Tokenizer.from_pretrained('google/mt5-xl', legacy=False)

In [None]:
# Sanity check: tokenize one hand-written input to inspect how the tokenizer
# splits the question text, the literal "<pad>" markers, the upper-case tag
# sequence and the "wd_Q..." identifier.
# NOTE(review): the string looks like a question followed by linguistic-context
# and entity fields — confirm against the code that builds these inputs.
sample_query = "After whom is the Riemannian geometry named? <pad> <pad> <pad>SCONJ PRON AUX DET ADJ NOUN VERB PUNCT <pad> wd_Q761383 <pad> <pad> "
# Last expression of the cell, so the resulting token list is displayed.
tokenizer.tokenize(sample_query)

**Token analysis for datasets**

In [None]:
# Paths of the dataset JSON files scanned by the token-length cells below.
# One knowledge-graph variant is active at a time: the DBpedia dictionaries are
# live, the Wikidata ones are kept commented out as a manual toggle.
### DBpedia
qald_data_paths = {
    'qald9plus_train_path': '../datasets/qald9plus/dbpedia/qald_9_plus_train_dbpedia.json',
    'qald9plus_test_path': '../datasets/qald9plus/dbpedia/qald_9_plus_test_dbpedia.json'
}

# NOTE(review): the keys below say "lcquad2" although the files live under
# lcquad1/. Only the dict values are read in the visible cells, so the mismatch
# looks harmless here — confirm before relying on the key names.
lcquad_data_paths = {
    'lcquad2_train_path': '../datasets/lcquad1/train-data.json',
    'lcquad2_test_path': '../datasets/lcquad1/test-data.json'
}

### Wikidata
# qald_data_paths = {
#     'qald9plus_train_path': '../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata.json',
#     'qald9plus_test_path': '../datasets/qald9plus/wikidata/qald_9_plus_test_wikidata.json',
#     'qald10_train_path': '../datasets/qald10/qald_10.json'
# }

# lcquad_data_paths = {
#     'lcquad2_train_path': '../datasets/lcquad2/train.json',
#     'lcquad2_test_path': '../datasets/lcquad2/test.json'
# }

In [None]:
# Find the maximum question length (in tokens) in the QALD files.
# Every entry of 'questions' holds a list of question variants under
# 'question'; the 'string' of each variant is tokenized and measured.
qald_max_length = 0
max_question_string = ''
max_file = ''
for file in qald_data_paths.values():
    # Decode explicitly as UTF-8 so non-ASCII question text does not depend on
    # the platform's default (locale) encoding.
    with open(file, 'r', encoding='utf-8') as fp:
        qald_obj = json.load(fp)
    for question_obj in qald_obj['questions']:
        for q_pair in question_obj['question']:
            q_str = q_pair['string']
            cur_len = len(tokenizer.tokenize(q_str))
            # Strict '>' keeps the first string seen among equally long ones.
            if cur_len > qald_max_length:
                qald_max_length = cur_len
                max_question_string = q_str
                max_file = file

print('Maximum NL token length:', qald_max_length)
print('question:', max_question_string)
print('File:', max_file)

In [None]:
# Find the maximum SPARQL query length (in tokens) in the QALD files.
# The query text is read from question_obj['query']['sparql'].
# The result variables reuse the names of the question-length cell, so running
# this cell overwrites that cell's results.
qald_max_length = 0
max_question_string = ''
max_file = ''
for file in qald_data_paths.values():
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default (locale) encoding.
    with open(file, 'r', encoding='utf-8') as fp:
        qald_obj = json.load(fp)
    for question_obj in qald_obj['questions']:
        q_str = question_obj['query']['sparql']
        cur_len = len(tokenizer.tokenize(q_str))
        # Strict '>' keeps the first query seen among equally long ones.
        if cur_len > qald_max_length:
            qald_max_length = cur_len
            max_question_string = q_str
            max_file = file

print('Maximum SPARQL token length:', qald_max_length)
# The longest string here is a SPARQL query, so label it as such.
print('query:', max_question_string)
print('File:', max_file)

In [None]:
# Find the maximum question length (in tokens) in the LCQUAD files.
# Each file is a JSON list of question objects. 'NNQT_question' is preferred,
# and 'corrected_question' is used when the former is missing or empty.
lcquad_max_length = 0
max_question_string = ''
max_file = ''
for file in lcquad_data_paths.values():
    # Decode explicitly as UTF-8 so non-ASCII question text does not depend on
    # the platform's default (locale) encoding.
    with open(file, 'r', encoding='utf-8') as fp:
        lcquad_obj = json.load(fp)
    for question_obj in lcquad_obj:
        q_str = question_obj.get('NNQT_question') or question_obj.get('corrected_question')
        if not q_str:
            # Neither field holds text; passing None to the tokenizer would
            # raise, and an empty string can never be the longest question.
            continue
        cur_len = len(tokenizer.tokenize(q_str))
        # Strict '>' keeps the first question seen among equally long ones.
        if cur_len > lcquad_max_length:
            lcquad_max_length = cur_len
            max_question_string = q_str
            max_file = file

print('Maximum NL token length:', lcquad_max_length)
print('question:', max_question_string)
print('File:', max_file)

In [None]:
import csv
from transformers import T5Tokenizer

In [None]:
# Re-create the tokenizer and register the four section-marker strings as
# added tokens before measuring the prepared CSV inputs.
tokenizer = T5Tokenizer.from_pretrained('google/mt5-xl', legacy=False)
tokenizer.add_tokens(["<start-of-pos-tags>", "<start-of-dependency-relation>", "<start-of-dependency-tree-depth>", "<start-of-entity-info>"])
# Select exactly one input file; the alternatives are kept commented out.
# csvfile = open('../datasets/lcquad2/train-lc-ent.csv','r')
# csvfile = open('../datasets/lcquad2/train-simple.csv','r')
# csvfile = open('../datasets/lcquad2/train-lc.csv','r')
# csvfile = open('../datasets/lcquad2/train-ent.csv','r')

# csvfile = open('../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata-lc-ent.csv','r')
# csvfile = open('../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata-simple.csv','r')
# csvfile = open('../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata-lc.csv','r')
# csvfile = open('../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata-ent.csv','r')
# newline='' lets the csv module handle line endings itself (needed for quoted
# fields that contain newlines); the encoding is pinned so decoding does not
# depend on the platform default.
# The handle is deliberately left open: the reader built on it is consumed by
# the validation loop that follows.
csvfile = open('../datasets/lcquad1/train-data.csv', 'r', newline='', encoding='utf-8')

spamreader = csv.reader(csvfile)
# Consume the first row so that later iteration starts at the second row.
# NOTE(review): assumes the file starts with a header row — confirm.
header = next(spamreader)

In [None]:
# Check that the first column of every remaining CSV row tokenizes to exactly
# expected_len tokens. Scanning stops at the first mismatch, which is printed
# together with its token count.
expected_len = 576 # lc-ent
# expected_len = 128 # simple
# expected_len = 512 # lc
# expected_len = 192 # ent
all_valid = True
rows_checked = 0
for row in spamreader:
    if not row:
        # csv.reader yields an empty list for a blank line; nothing to check.
        continue
    rows_checked += 1
    row_len = len(tokenizer.tokenize(row[0].strip()))
    if row_len != expected_len:
        print(row[0])
        print(row_len)
        all_valid = False
        break
if rows_checked == 0:
    # The reader is a one-shot iterator: once exhausted (e.g. this cell was
    # already run) there is nothing left to validate, so do not claim success.
    print('No rows were checked; re-open the CSV before validating.')
elif all_valid:
    print('All strings format as expected.')

# Note: The only question that fails for samples with linguistic context is from the LCQUAD2 train set:
# "NNQT_question": "What is {safety classification and labelling} of {water}, that has {original title} is {Europa-Parlamentets og R\u00e5dets forordning (EF) nr. 1272/2008 af 16. december 2008 om klassificering, m\u00e6rkning og emballering af stoffer og blandinger og om \u00e6ndring og oph\u00e6velse af direktiv 67/548/E\u00d8F og 1999/45/EF og om \u00e6ndring af forordning (EF) nr. 1907/2006} ?",
# "uid": 12586
# We choose to ignore this particular error because it affects only a single sample.