In [25]:
from transformers import AutoTokenizer
import pandas as pd
import re
import random

In [26]:
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
len(mt5_tokenizer)



250100

In [27]:
lcquad = pd.read_csv("datasets/lcqald_wikidata.csv")
lcquad["query"].head()

0     select distinct var_obj where bra_open wd_Q18...
1    SELECT var_answer WHERE bra_open wd_Q169794 wd...
2    ASK WHERE bra_open wd_Q174843 wdt_P106 wd_Q180...
3    SELECT var_answer WHERE bra_open wd_Q675176 wd...
4    select distinct var_answer where bra_open wd_Q...
Name: query, dtype: object

In [28]:
pattern = r"\b(?:wdt|ps|wd|rdfs|xsd|pq|var)\w*\b"
for query in lcquad["query"].head():
    print(re.findall(pattern, query))

['var_obj', 'wd_Q188920', 'wdt_P2813', 'var_obj', 'var_obj', 'wdt_P31', 'wd_Q1002697']
['var_answer', 'wd_Q169794', 'wdt_P26', 'var_X', 'var_X', 'wdt_P22', 'var_answer']
['wd_Q174843', 'wdt_P106', 'wd_Q1804811', 'wd_Q174843', 'wdt_P106', 'wd_Q33231']
['var_answer', 'wd_Q675176', 'wdt_P515', 'var_X', 'var_X', 'wdt_P156', 'var_answer']
['var_answer', 'wd_Q32491', 'wdt_P3362', 'var_answer']


In [29]:
new_tokens = []
for query in lcquad["query"]:
    new_tokens.extend(re.findall(pattern, query))
new_tokens = list(set(new_tokens))
new_tokens[:20]

['wd_Q318734',
 'wd_Q132971',
 'wd_Q41582627',
 'wd_Q1509',
 'wd_Q2696109',
 'wd_Q17199338',
 'wd_Q7192',
 'wd_Q4024510',
 'wd_Q164651',
 'wd_Q34823585',
 'wd_Q1248784',
 'wd_Q1190554',
 'wd_Q500551',
 'wd_Q15074515',
 'wd_Q152805',
 'wd_Q55221138',
 'wd_Q949423',
 'wd_Q1778821',
 'wd_Q511405',
 'wd_Q1028']

In [30]:
mt5_tokenizer.add_tokens(new_tokens)
len(mt5_tokenizer)

273397

In [31]:
symbol_tokens = ["bra_open", "bra_close", "sep_dot", "sep_or"]
mt5_tokenizer.add_tokens(symbol_tokens)
len(mt5_tokenizer)

273401

In [33]:
mt5_tokenizer.train_new_from_iterator(lcquad["query"], vocab_size=300000)





PreTrainedTokenizerFast(name_or_path='google/mt5-base', vocab_size=38699, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [34]:
mt5_tokenizer.save_pretrained("lcquad_tokenizer")

('lcquad_tokenizer/tokenizer_config.json',
 'lcquad_tokenizer/special_tokens_map.json',
 'lcquad_tokenizer/spiece.model',
 'lcquad_tokenizer/added_tokens.json',
 'lcquad_tokenizer/tokenizer.json')

# Test performance of lcquad tokenizer 

In [35]:
lcquad_tokenizer = AutoTokenizer.from_pretrained("lcquad_tokenizer")

In [36]:
example_queries = random.sample(list(lcquad["query"]), 5)
example_queries

['SELECT var_value1 var_obj WHERE bra_open wd_Q81506 p_P39 var_s sep_dot var_s ps_P39 var_obj sep_dot var_s pq_P582 var_value1 sep_dot bra_close ',
 'SELECT (COUNT( var_obj) AS var_value ) bra_open wd_Q153048 wdt_P802 var_obj bra_close ',
 'SELECT var_value1 var_value2 WHERE bra_open wd_Q41532 p_P26 var_s sep_dot var_s ps_P26 wd_Q124210 sep_dot var_s pq_P582 var_value1 sep_dot var_s pq_P580 var_value2 bra_close ',
 'ASK WHERE bra_open wd_Q6276882 wdt_P108 wd_Q49108 sep_dot wd_Q6276882 wdt_P108 wd_Q180865 bra_close ',
 'SELECT var_answer WHERE bra_open wd_Q11392 wdt_P397 var_X sep_dot var_X wdt_P398 var_answer bra_close ']

In [37]:
for query in example_queries:
    tokens = lcquad_tokenizer.tokenize(query)
    print(tokens)

['▁', 'SELECT', '▁', 'var_value1', '▁', 'var_obj', '▁W', 'HERE', '▁', 'bra_open', '▁', 'wd_Q81506', '▁p', '_', 'P', '39', '▁', 'var_s', '▁', 'sep_dot', '▁', 'var_s', '▁', 'ps_P39', '▁', 'var_obj', '▁', 'sep_dot', '▁', 'var_s', '▁', 'pq_P582', '▁', 'var_value1', '▁', 'sep_dot', '▁', 'bra_close', '▁']
['▁', 'SELECT', '▁(', 'COUNT', '(', '▁', 'var_obj', '▁', ')', '▁AS', '▁', 'var_value', '▁', ')', '▁', 'bra_open', '▁', 'wd_Q153048', '▁', 'wdt_P802', '▁', 'var_obj', '▁', 'bra_close', '▁']
['▁', 'SELECT', '▁', 'var_value1', '▁', 'var_value2', '▁W', 'HERE', '▁', 'bra_open', '▁', 'wd_Q41532', '▁p', '_', 'P', '26', '▁', 'var_s', '▁', 'sep_dot', '▁', 'var_s', '▁', 'ps_P26', '▁', 'wd_Q124210', '▁', 'sep_dot', '▁', 'var_s', '▁', 'pq_P582', '▁', 'var_value1', '▁', 'sep_dot', '▁', 'var_s', '▁', 'pq_P580', '▁', 'var_value2', '▁', 'bra_close', '▁']
['▁', 'ASK', '▁W', 'HERE', '▁', 'bra_open', '▁', 'wd_Q6276882', '▁', 'wdt_P108', '▁', 'wd_Q49108', '▁', 'sep_dot', '▁', 'wd_Q6276882', '▁', 'wdt_P108', '▁