# Tokenizer

In [5]:
from transformers import AutoTokenizer
import csv
import pandas as pd

In [2]:
# Three sample queries in the notebook's normalized SPARQL format; they are
# reused below to compare how each tokenizer splits a query.
# NOTE(review): the doubled quotes in example1 (""MVP"", ""Most Valuable Player"",
# ""EN"") are adjacent Python string literals that get concatenated, so the
# resulting string contains MVP / Most Valuable Player / EN WITHOUT quote
# characters -- confirm this matches the real training data.
example1 = "SELECT var_uri WHERE bra_open var_uri wdt_P106 wd_Q3665646 ; wdt_P569 var_birthDate ; wdt_P166 var_award sep_dot var_award rdfs_label var_awardLabel FILTER (CONTAINS(STR( var_awardLabel), ""MVP"") sep_or sep_or CONTAINS(STR( var_awardLabel), ""Most Valuable Player"")) sep_dot FILTER (LANGMATCHES(LANG( var_awardLabel), ""EN"" ) ) bra_close ORDER BY DESC( var_birthDate) LIMIT 4"

# Minimal ASK query with a single triple pattern.
example2 = "ASK WHERE bra_open wd_Q9215 wdt_P26 var_o1 sep_dot bra_close"

# SELECT DISTINCT query with ORDER BY / LIMIT.
example3 = "SELECT DISTINCT var_uri WHERE bra_open var_uri wdt_P106 wd_Q18574233 ; wdt_P569 var_dateOfBirth sep_dot bra_close ORDER BY DESC( var_dateOfBirth) LIMIT 1"

## Test existing tokenizers

### mt5 tokenizer

The original mT5 tokenizer handles the words of a SPARQL query very poorly,
e.g. "DISTINCT" is split into "_D", "ISTI" and "NCT".

In [3]:
# Load the stock mT5 tokenizer; it serves as the baseline for the comparisons below.
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")



In [4]:
# Tokenize example1 with the baseline tokenizer; the bare `tokens` expression
# makes the notebook display the resulting token list.
tokens = mt5_tokenizer.tokenize(example1)
tokens

['▁',
 'SELECT',
 '▁var',
 '_',
 'uri',
 '▁W',
 'HERE',
 '▁bra',
 '_',
 'open',
 '▁var',
 '_',
 'uri',
 '▁w',
 'd',
 't',
 '_',
 'P',
 '106',
 '▁w',
 'd',
 '_',
 'Q',
 '3665',
 '646',
 '▁',
 ';',
 '▁w',
 'd',
 't',
 '_',
 'P',
 '569',
 '▁var',
 '_',
 'birth',
 'Date',
 '▁',
 ';',
 '▁w',
 'd',
 't',
 '_',
 'P',
 '166',
 '▁var',
 '_',
 'a',
 'ward',
 '▁sep',
 '_',
 'dot',
 '▁var',
 '_',
 'a',
 'ward',
 '▁',
 'rdf',
 's',
 '_',
 'label',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '▁FIL',
 'TER',
 '▁(',
 'CONTA',
 'INS',
 '(',
 'STR',
 '(',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '),',
 '▁',
 'MVP',
 ')',
 '▁sep',
 '_',
 'or',
 '▁sep',
 '_',
 'or',
 '▁CONT',
 'AINS',
 '(',
 'STR',
 '(',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '),',
 '▁Most',
 '▁Valu',
 'able',
 '▁Player',
 '))',
 '▁sep',
 '_',
 'dot',
 '▁FIL',
 'TER',
 '▁(',
 'LANG',
 'MATCH',
 'ES',
 '(',
 'LANG',
 '(',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '),',
 '▁EN',
 '▁',
 ')',
 '▁',
 ')',
 '▁bra',
 '_',
 'close',
 '▁',
 'ORDER',


### edrf model tokenizer

The tokenizer from the edrf model performs slightly better because some tokens were added to it manually. However, it still tokenizes other words badly, especially entity and relation tokens.  

In [5]:
# Load the tokenizer stored under "en_de_ru_fr" (presumably the edrf model's
# checkpoint directory -- confirm the path) and tokenize the same example.
# Note that this rebinds `tokens`, replacing the mT5 result from the previous cell.
edrf_tokenizer = AutoTokenizer.from_pretrained("en_de_ru_fr")
tokens = edrf_tokenizer.tokenize(example1)
tokens

['▁',
 'SELECT',
 '▁',
 'var_uri',
 '▁W',
 'HERE',
 '▁',
 'bra_open',
 '▁',
 'var_uri',
 '▁w',
 'd',
 't',
 '_',
 'P',
 '106',
 '▁w',
 'd',
 '_',
 'Q',
 '3665',
 '646',
 '▁',
 ';',
 '▁w',
 'd',
 't',
 '_',
 'P',
 '569',
 '▁var',
 '_',
 'birth',
 'Date',
 '▁',
 ';',
 '▁w',
 'd',
 't',
 '_',
 'P',
 '166',
 '▁var',
 '_',
 'a',
 'ward',
 '▁',
 'sep_dot',
 '▁var',
 '_',
 'a',
 'ward',
 '▁',
 'rdf',
 's',
 '_',
 'label',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '▁FIL',
 'TER',
 '▁(',
 'CONTA',
 'INS',
 '(',
 'STR',
 '(',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '),',
 '▁',
 'MVP',
 ')',
 '▁',
 'sep_or',
 '▁',
 'sep_or',
 '▁CONT',
 'AINS',
 '(',
 'STR',
 '(',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '),',
 '▁Most',
 '▁Valu',
 'able',
 '▁Player',
 '))',
 '▁',
 'sep_dot',
 '▁FIL',
 'TER',
 '▁(',
 'LANG',
 'MATCH',
 'ES',
 '(',
 'LANG',
 '(',
 '▁var',
 '_',
 'a',
 'ward',
 'Label',
 '),',
 '▁EN',
 '▁',
 ')',
 '▁',
 ')',
 '▁',
 'bra_close',
 '▁',
 'ORDER',
 '▁BY',
 '▁',
 'DESC',
 '(',
 '▁var',
 '

## Train a tokenizer with SPARQL queries

In [6]:
# Load the training set and display its `query` column, which holds the
# normalized SPARQL queries used to train the tokenizer.
dataset = pd.read_csv("wikidata_en.csv")
dataset['query']


0      SELECT var_uri WHERE bra_open var_uri wdt_P31 ...
1      SELECT var_uri WHERE bra_open wd_Q40984 wdt_P1...
2      SELECT var_uri WHERE bra_open var_uri wdt_P19 ...
3      SELECT DISTINCT var_uri WHERE bra_open wd_Q177...
4      SELECT DISTINCT var_uri WHERE bra_open wd_Q60 ...
                             ...                        
366     SELECT var_s1 WHERE bra_open var_s1 wdt_P159 ...
367    SELECT DISTINCT var_uri WHERE bra_open var_uri...
368    SELECT DISTINCT var_s1 WHERE bra_open var_s1 w...
369    SELECT var_o1 WHERE bra_open wd_Q858840 wdt_P1...
370    SELECT var_o1 WHERE bra_open wd_Q339 wdt_P61 v...
Name: query, Length: 371, dtype: object

In [7]:
# Train a new tokenizer, derived from the mT5 tokenizer, on the dataset queries.
# NOTE(review): 260000 is passed as the second positional argument (the target
# vocabulary size per the Transformers docs -- confirm); the dataset displayed
# above has only 371 queries, so this value looks arbitrary.
tokenizer = mt5_tokenizer.train_new_from_iterator(dataset['query'], 260000)





In [8]:
# Inspect how the tokenizer retrained on the dataset queries splits example1.
tokenizer.tokenize(example1)

['▁',
 'SE',
 'L',
 'E',
 'CT',
 '▁var_uri',
 '▁',
 'W',
 'H',
 'ER',
 'E',
 '▁bra_',
 'op',
 'en',
 '▁var_uri',
 '▁wdt_P106',
 '▁wd_Q36',
 '6',
 '5646',
 '▁',
 ';',
 '▁wdt_P5',
 '69',
 '▁var_birthDate',
 '▁',
 ';',
 '▁wdt_P16',
 '6',
 '▁var_award',
 '▁sep_',
 'd',
 'ot',
 '▁var_award',
 '▁rdfs_label',
 '▁var_awardLabel',
 '▁FILTER',
 '▁',
 '(CONTAINS(',
 'ST',
 'R(',
 '▁var_awardLabel',
 ')',
 ',',
 '▁',
 'M',
 'V',
 'P',
 ')',
 '▁sep_',
 'or',
 '▁sep_',
 'or',
 '▁',
 'CONTAINS(',
 'ST',
 'R(',
 '▁var_awardLabel',
 ')',
 ',',
 '▁',
 'M',
 'o',
 'st',
 '▁V',
 'alu',
 'ab',
 'le',
 '▁',
 'Pla',
 'y',
 'er',
 '))',
 '▁sep_',
 'd',
 'ot',
 '▁FILTER',
 '▁',
 '(LANG',
 'M',
 'AT',
 'C',
 'HE',
 'S',
 '(LANG',
 '(',
 '▁var_awardLabel',
 ')',
 ',',
 '▁',
 'E',
 'N',
 '▁',
 ')',
 '▁',
 ')',
 '▁bra_',
 'close',
 '▁O',
 'R',
 'D',
 'ER',
 '▁B',
 'Y',
 '▁DESC(',
 '▁var_birthDate',
 ')',
 '▁LIMI',
 'T',
 '▁',
 '4']

In [9]:
# tokenizer.save_pretrained("sparql-tokenizer")

In [10]:
# Same training run as for the mT5 tokenizer, but starting from the edrf
# tokenizer (same corpus, same 260000 second argument).
new_edrf_tokenizer = edrf_tokenizer.train_new_from_iterator(dataset['query'], 260000)





In [11]:
# Inspect how the retrained edrf-based tokenizer splits example1.
new_edrf_tokenizer.tokenize(example1)

['▁',
 'SE',
 'L',
 'E',
 'CT',
 '▁',
 'var_uri',
 '▁',
 'W',
 'H',
 'ER',
 'E',
 '▁',
 'bra_open',
 '▁',
 'var_uri',
 '▁wdt_P106',
 '▁wd_Q36',
 '6',
 '5646',
 '▁',
 ';',
 '▁wdt_P5',
 '69',
 '▁var_birthDate',
 '▁',
 ';',
 '▁wdt_P16',
 '6',
 '▁var_award',
 '▁',
 'sep_dot',
 '▁var_award',
 '▁rdfs_label',
 '▁var_awardLabel',
 '▁FILTER',
 '▁',
 '(CONTAINS(',
 'ST',
 'R(',
 '▁var_awardLabel',
 ')',
 ',',
 '▁',
 'M',
 'V',
 'P',
 ')',
 '▁',
 'sep_or',
 '▁',
 'sep_or',
 '▁',
 'CONTAINS(',
 'ST',
 'R(',
 '▁var_awardLabel',
 ')',
 ',',
 '▁',
 'M',
 'o',
 'st',
 '▁V',
 'alu',
 'ab',
 'le',
 '▁',
 'Pla',
 'y',
 'er',
 '))',
 '▁',
 'sep_dot',
 '▁FILTER',
 '▁',
 '(LANG',
 'M',
 'AT',
 'C',
 'HE',
 'S',
 '(LANG',
 '(',
 '▁var_awardLabel',
 ')',
 ',',
 '▁',
 'E',
 'N',
 '▁',
 ')',
 '▁',
 ')',
 '▁',
 'bra_close',
 '▁O',
 'R',
 'D',
 'ER',
 '▁B',
 'Y',
 '▁DESC(',
 '▁var_birthDate',
 ')',
 '▁LIMI',
 'T',
 '▁',
 '4']

- Special tokens such as "SELECT" and "WHERE" are still split apart. They need to be added to the tokenizer.

- Entities and relations are also split apart, e.g. "_wd_Q36", "6", "5646". This issue still has to be solved. 

## Train Tokenizer with SPARQL queries from Wikidata SPARQL Logs

### Preprocess SPARQL queries from Wikidata SPARQL Logs

The Wikidata SPARQL Logs collect SPARQL queries over different time intervals and can be used to train a tokenizer. The queries are URL-encoded, so they must be decoded and transformed into the same format as our training data. 

In [12]:
from urllib.parse import unquote
import pandas as pd
import re

In [17]:
# Peek at the first item yielded by query_generator().
# NOTE(review): query_generator is only defined in a later cell of this
# notebook, so on a fresh "Restart & Run All" this cell raises NameError. The
# raw, still URL-encoded row in the saved output presumably came from an
# earlier version of the function -- move or remove this cell.
next(query_generator())

['SELECT+*%0AWHERE+%7B%0A++%3Fvar1++%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2FP698%3E++%2229066813%22.%0A%7D%0A\t2018-02-26 00:00:00\trobotic\tPBB_core fastrun']

In [22]:
# Load the "organic" query-log file as tab-separated values, taking the first
# line as the header, and preview the first rows.
data = pd.read_csv('2018-02-26_2018-03-25_organic.tsv', sep='\t', header=0)
data.head()

Unnamed: 0,anonymizedQuery,timestamp,sourceCategory,user_agent
0,SELECT+%3Fvar1++%3Fvar1Label+%28+COUNT+%28+DIS...,2018-02-26 00:00:08,organic,browser
1,SELECT+DISTINCT+%3Fvar1+%0AWHERE+%7B%0A++%3Fva...,2018-02-26 00:00:27,organic,browser
2,SELECT+%3Fvar1++%3Fvar1Label+%28+COUNT+%28+DIS...,2018-02-26 00:00:31,organic,browser
3,SELECT+DISTINCT+%3Fvar1++%3Fvar1Label++%3Fvar2...,2018-02-26 00:00:43,organic,browser
4,SELECT+%3Fvar1++%3Fvar2Label++%3Fvar3+%0AWHERE...,2018-02-26 00:00:57,organic,browser


In [23]:
# Keep only the URL-encoded query column; the other columns are not used for
# tokenizer training in this notebook.
query_data = data["anonymizedQuery"]
query_data.head()

0    SELECT+%3Fvar1++%3Fvar1Label+%28+COUNT+%28+DIS...
1    SELECT+DISTINCT+%3Fvar1+%0AWHERE+%7B%0A++%3Fva...
2    SELECT+%3Fvar1++%3Fvar1Label+%28+COUNT+%28+DIS...
3    SELECT+DISTINCT+%3Fvar1++%3Fvar1Label++%3Fvar2...
4    SELECT+%3Fvar1++%3Fvar2Label++%3Fvar3+%0AWHERE...
Name: anonymizedQuery, dtype: object

In [24]:
# Each entry is [regex, prefix]. A full IRI matched by the regex is rewritten to
# `prefix + <captured local name>`. Order matters: the specific `prop/direct/`
# pattern has to run before the generic `prop/` pattern.
# Dots in host names are escaped so they only match a literal '.'.
prefix_pattern = [
    [r'<http://dbpedia\.org/resource/(.*?)>\.?', 'dbr:'],
    [r'<http://dbpedia\.org/property/(.*?)>\.?', 'dbp:'],
    [r'<http://dbpedia\.org/ontology/(.*?)>\.?', 'dbo:'],
    [r'<http://dbpedia\.org/class/yago/(.*?)>\.?', 'yago:'],
    [r'onto:(.*)', 'dbo:'],
    [r'<http://www\.wikidata\.org/prop/direct/(.*?)>', 'wdt:'],
    [r'<http://www\.wikidata\.org/entity/(.*?)>', 'wd:'],
    # Anchored with '<' ... '>' so the whole IRI (brackets included) is replaced;
    # without the closing '>' the lazy group matches the empty string.
    [r'<http://www\.wikidata\.org/prop/(.*?)>', 'p:'],
    [r'<http://www\.w3\.org/2000/01/rdf-schema#(.*?)>', 'rdfs:'],
    [r'<http://wikiba\.se/ontology#(.*?)>', 'wbo:'],
    [r'<http://www\.bigdata\.com/queryHints#(.*?)>', 'bdqh:'],
    [r'<http://schema\.org/(.*?)>', 'schema:'],
    [r'<http://www\.opengis\.net/ont/geosparql#(.*?)>', 'og:'],
    [r'<https://(.*?)\.wikipedia\.org/>', 'wiki:']
]

# Plain-text substitutions applied in order AFTER the prefix rewriting.
# ':' -> '_' must come after the two 'rdf:' entries so they end up as
# 'rdf_type' / 'rdf_service'.
# The form-encoded '+' (space) is no longer listed here: it is handled before
# percent-decoding in process_query so that a literal plus sign survives.
replacement = [
    ['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>', 'rdf:type'],
    ['<http://www.bigdata.com/rdf#serviceParam>', 'rdf:service'],
    ['{', ' bra_open '],
    ['}', ' bra_close '],
    ['?', ' var_'],
    [':', '_'],
    ['.', ' sep_dot '],
    ['|', ' sep_or '],
    ["\n", " "]
]

def process_query(query):
    """Decode one URL-encoded SPARQL query and normalize it to the training format.

    Steps: turn form-encoded '+' into spaces, percent-decode, shorten known
    IRIs to `prefix:local` via `prefix_pattern`, apply the textual
    `replacement` table (braces, variables, ':', '.', '|', newlines), and
    finally collapse runs of spaces into one.

    Args:
        query: URL-encoded query string as found in the log files.

    Returns:
        The normalized query string. Leading/trailing single spaces are kept
        (only runs of spaces are collapsed).
    """
    # '+' means "space" in the encoded logs. It has to be converted BEFORE
    # percent-decoding; otherwise a real plus sign (encoded as %2B) would be
    # decoded to '+' and then wrongly erased.
    query = unquote(query.replace('+', ' '))
    for pattern, prefix in prefix_pattern:
        query = re.sub(pattern, prefix + r'\1', query)
    for old, new in replacement:
        query = query.replace(old, new)
    return re.sub(' +', ' ', query)

# Lazy, single-use iterator that normalizes each logged query on demand.
process_query_generator = map(process_query, query_data)


In [28]:
def query_generator(path="2018-02-26_2018-03-25_all.tsv"):
    """Lazily yield normalized queries from a tab-separated query-log file.

    The first line is treated as the header and skipped. For every following
    non-empty row, the first column is passed through `process_query` and the
    result is yielded, so the file is never loaded into memory as a whole.

    Args:
        path: Location of the TSV log file. Defaults to the complete log
            originally hard-coded in this notebook.

    Yields:
        One normalized query string per data row.
    """
    # newline="" is what the csv module asks for when reading files; the
    # explicit encoding avoids depending on the platform default.
    with open(path, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="\t")
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv yields [] for blank lines; row[0] would raise IndexError
                continue
            yield process_query(row[0])

In [29]:
# Sanity check: build a fresh generator and show the first normalized query.
next(query_generator())

'SELECT * WHERE bra_open var_var1 wdt_P698 "29066813" sep_dot bra_close '

### training

In [21]:
# Train a tokenizer on the normalized "organic" log queries, starting from the
# same "google/mt5-base" checkpoint that mt5_tokenizer was loaded from.
# NOTE(review): process_query_generator is a single-use generator expression;
# re-running this cell without re-creating it would feed an already exhausted
# iterator to the trainer. This also rebinds `tokenizer` from the earlier
# dataset-trained tokenizer.
old_tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
tokenizer = old_tokenizer.train_new_from_iterator(process_query_generator, 300000)







In [30]:
# Train on the complete log file, streamed row by row through query_generator().
# NOTE(review): 400000 differs from the 260000 / 300000 passed in the other
# training cells -- confirm the intended value.
all_trained_tokenizer = mt5_tokenizer.train_new_from_iterator(query_generator(), 400000)





In [31]:
# Write the tokenizer files into the ./sparql-tokenizer-trainedon_all directory.
all_trained_tokenizer.save_pretrained("sparql-tokenizer-trainedon_all")

('sparql-tokenizer-trainedon_all/tokenizer_config.json',
 'sparql-tokenizer-trainedon_all/special_tokens_map.json',
 'sparql-tokenizer-trainedon_all/tokenizer.json')

In [32]:
# Tokenize example1 with the tokenizer trained on the complete log file.
all_trained_tokenizer.tokenize(example1)

['▁',
 'S',
 'ELE',
 'CT',
 '▁var_',
 'uri',
 '▁',
 'WH',
 'ER',
 'E',
 '▁',
 'bra',
 '_',
 'open',
 '▁var_',
 'uri',
 '▁wdt_P10',
 '6',
 '▁wd_Q3665',
 '646',
 '▁',
 ';',
 '▁wdt_P',
 '569',
 '▁var_',
 'birth',
 'D',
 'ate',
 '▁',
 ';',
 '▁wdt_P16',
 '6',
 '▁var_',
 'a',
 'ward',
 '▁',
 'sep',
 '_',
 'dot',
 '▁var_',
 'a',
 'ward',
 '▁rdf',
 's_la',
 'bel',
 '▁var_',
 'a',
 'ward',
 'Label',
 '▁',
 'FI',
 'L',
 'T',
 'ER',
 '▁',
 '(',
 'C',
 'ON',
 'T',
 'A',
 'IN',
 'S',
 '(',
 'S',
 'T',
 'R',
 '(',
 '▁var_',
 'a',
 'ward',
 'Label',
 ')',
 ',',
 '▁',
 'M',
 'V',
 'P',
 ')',
 '▁',
 'sep',
 '_or',
 '▁',
 'sep',
 '_or',
 '▁',
 'C',
 'ON',
 'T',
 'A',
 'IN',
 'S',
 '(',
 'S',
 'T',
 'R',
 '(',
 '▁var_',
 'a',
 'ward',
 'Label',
 ')',
 ',',
 '▁',
 'Most',
 '▁',
 'V',
 'alu',
 'able',
 '▁',
 'Player',
 ')',
 ')',
 '▁',
 'sep',
 '_',
 'dot',
 '▁',
 'FI',
 'L',
 'T',
 'ER',
 '▁',
 '(',
 'LANG',
 'MA',
 'T',
 'C',
 'HE',
 'S',
 '(',
 'LANG',
 '(',
 '▁var_',
 'a',
 'ward',
 'Label',
 ')',
 ','

In [33]:
# Tokenize example2 with the tokenizer trained on the complete log file.
all_trained_tokenizer.tokenize(example2)

['▁',
 'A',
 'SK',
 '▁',
 'WH',
 'ER',
 'E',
 '▁',
 'bra',
 '_',
 'open',
 '▁wd_Q9215',
 '▁wdt_P26',
 '▁var_o',
 '1',
 '▁',
 'sep',
 '_',
 'dot',
 '▁',
 'bra',
 '_c',
 'lose']

In [34]:
# Tokenize example3 with the tokenizer trained on the complete log file.
all_trained_tokenizer.tokenize(example3)

['▁',
 'S',
 'ELE',
 'CT',
 '▁',
 'DI',
 'S',
 'T',
 'IN',
 'CT',
 '▁var_',
 'uri',
 '▁',
 'WH',
 'ER',
 'E',
 '▁',
 'bra',
 '_',
 'open',
 '▁var_',
 'uri',
 '▁wdt_P10',
 '6',
 '▁wd_Q18574',
 '233',
 '▁',
 ';',
 '▁wdt_P',
 '569',
 '▁var_',
 'date',
 'Of',
 'Birth',
 '▁',
 'sep',
 '_',
 'dot',
 '▁',
 'bra',
 '_c',
 'lose',
 '▁',
 'O',
 'R',
 'D',
 'ER',
 '▁',
 'BY',
 '▁',
 'DE',
 'S',
 'C',
 '(',
 '▁var_',
 'date',
 'Of',
 'Birth',
 ')',
 '▁',
 'L',
 'I',
 'MIT',
 '▁',
 '1']

### Test and save

In [22]:
# Tokenize example1 with `tokenizer`.
# NOTE(review): `tokenizer` is bound twice in this notebook (trained on
# dataset['query'] and on process_query_generator). The execution counts
# suggest this output comes from the latter, but the cell order does not make
# that explicit -- confirm.
tokenizer.tokenize(example1)

['▁SELECT',
 '▁var_',
 'uri',
 '▁WH',
 'ERE',
 '▁bra_',
 'open',
 '▁var_',
 'uri',
 '▁wdt_P106',
 '▁wd_Q36',
 '65646',
 '▁',
 ';',
 '▁wdt_P569',
 '▁var_',
 'birthDate',
 '▁',
 ';',
 '▁wdt_P166',
 '▁var_',
 'a',
 'w',
 'a',
 'r',
 'd',
 '▁sep_',
 'dot',
 '▁var_',
 'a',
 'w',
 'a',
 'r',
 'd',
 '▁rdfs_',
 'label',
 '▁var_',
 'a',
 'w',
 'a',
 'r',
 'd',
 'Label',
 '▁FIL',
 'TER',
 '▁',
 '(',
 'CONT',
 'AIN',
 'S',
 '(',
 'STR',
 '(',
 '▁var_',
 'a',
 'w',
 'a',
 'r',
 'd',
 'Label',
 ')',
 ',',
 '▁',
 'M',
 'V',
 'P',
 ')',
 '▁sep_',
 'o',
 'r',
 '▁sep_',
 'o',
 'r',
 '▁CONT',
 'AIN',
 'S',
 '(',
 'STR',
 '(',
 '▁var_',
 'a',
 'w',
 'a',
 'r',
 'd',
 'Label',
 ')',
 ',',
 '▁',
 'M',
 'o',
 's',
 't',
 '▁',
 'V',
 'a',
 'l',
 'u',
 'a',
 'b',
 'l',
 'e',
 '▁',
 'P',
 'l',
 'ayer',
 ')',
 ')',
 '▁sep_',
 'dot',
 '▁FIL',
 'TER',
 '▁',
 '(',
 'LANG',
 'MAT',
 'CHE',
 'S',
 '(',
 'LANG',
 '(',
 '▁var_',
 'a',
 'w',
 'a',
 'r',
 'd',
 'Label',
 ')',
 ',',
 '▁',
 'E',
 'N',
 '▁',
 ')',
 '▁',
 '

In [23]:
# Tokenize example2 with the same `tokenizer` as in the previous cell.
tokenizer.tokenize(example2)

['▁',
 'ASK',
 '▁WH',
 'ERE',
 '▁bra_',
 'open',
 '▁wd_Q9215',
 '▁wdt_P26',
 '▁var_',
 'o',
 '1',
 '▁sep_',
 'dot',
 '▁bra_',
 'close']

In [24]:
# Tokenize example3 with the same `tokenizer` as in the previous cells.
tokenizer.tokenize(example3)

['▁SELECT',
 '▁DISTINCT',
 '▁var_',
 'uri',
 '▁WH',
 'ERE',
 '▁bra_',
 'open',
 '▁var_',
 'uri',
 '▁wdt_P106',
 '▁wd_Q18',
 '57423',
 '3',
 '▁',
 ';',
 '▁wdt_P569',
 '▁var_',
 'd',
 'ate',
 'O',
 'f',
 'B',
 'i',
 'rth',
 '▁sep_',
 'dot',
 '▁bra_',
 'close',
 '▁',
 'ORD',
 'ER',
 '▁',
 'BY',
 '▁DESC(',
 '▁var_',
 'd',
 'ate',
 'O',
 'f',
 'B',
 'i',
 'rth',
 ')',
 '▁LI',
 'MIT',
 '▁',
 '1']

In [None]:
# Write the tokenizer files into the ./sparql-tokenizer directory.
# This cell has no execution count in the saved notebook, i.e. it was not run.
tokenizer.save_pretrained("sparql-tokenizer")

03.15.2023

add tokens to tokenizer

In [6]:
# Reload the stock mT5 tokenizer (same checkpoint as at the top of the
# notebook), presumably as a clean base for the "add tokens" experiment -- confirm.
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")



In [7]:
# Load the LC-QuAD-style Wikidata dataset and preview its `query` column.
# The preview shows mixed keyword casing ("select distinct" vs. "SELECT").
lcquad = pd.read_csv("datasets/lcqald_wikidata.csv")
lcquad["query"].head()

0     select distinct var_obj where bra_open wd_Q18...
1    SELECT var_answer WHERE bra_open wd_Q169794 wd...
2    ASK WHERE bra_open wd_Q174843 wdt_P106 wd_Q180...
3    SELECT var_answer WHERE bra_open wd_Q675176 wd...
4    select distinct var_answer where bra_open wd_Q...
Name: query, dtype: object