In [None]:
# Example STL formula strings used to exercise the tokenizer in the cells below.
sequences = [
    '( not ( x_1 <= 0.2988 ) until[11,21] x_0 <= -0.7941 )',
    '( eventually[10,21] ( ( x_0 <= 0.9229 or not ( always[5,inf] ( x_0 <= 0.4263 ) ) ) ) or x_0 >= 0.6745 )',
    '( ( eventually ( ( x_1 <= 0.5132 and x_0 <= -0.457 ) ) until x_2 <= 0.257 ) and x_1 <= 0.1586 )',
    'always[0,3] ( ( always[14,20] ( x_1 >= -0.6583 ) until[12,18] x_1 <= -0.284 ) )',
    '( not ( ( ( x_0 <= -1.0103 and eventually[1,6] ( x_1 <= 0.1484 ) ) and always[1,14] ( x_2 <= -1.7752 ) ) ) or not ( x_1 <= 1.0504 ) )'
]

# Implementazione del tokenizer (prima versione)

In [None]:
import json
import os
import re
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer
from transformers.utils import logging
# omitted sentencepiece!

# Module-level logger obtained through the transformers logging helper.
logger = logging.get_logger(__name__)

# File names attached to the tokenizer class as `vocab_files_names`.
# The sentencepiece-related entries are None — presumably because sentencepiece
# is not used in this notebook (see the note on the imports); TODO confirm.
VOCAB_FILES_NAMES = {
    "source_spm": None,
    "target_spm": None,
    "vocab": "tokenizer_files/tokenizer.json",
    "target_vocab_file": None,
    "tokenizer_config_file": "tokenizer_files/tokenizer_config.json",
}

# NOTE(review): not referenced anywhere in the visible notebook; looks like a
# leftover from a sentencepiece-based tokenizer — confirm before removing.
SPIECE_UNDERLINE = "▁"

In [None]:
class STLTokenizer(PreTrainedTokenizer):
    """
    Regex-based tokenizer for STL formula strings.

    Pipeline: every space is turned into an explicit pad token, the text is
    split with a fixed regular expression, and numbers / variables are then
    broken into single characters. Token <-> id conversion uses the JSON
    vocabulary loaded in ``__init__``.

    NOTE(review): characters that the token pattern does not match are dropped
    by ``findall``; they never reach ``code`` and are therefore never mapped to
    the unknown token.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    # Compiled once; they decide which matched tokens are split per character.
    _NUMBER_RE = re.compile(r"-?\d+\.\d+|-?\d+")
    _VARIABLE_RE = re.compile(r"x_\d+")

    def __init__(
        self,
        vocab: str,
        unk_token: str = "unk",
        eos_token: str = "/s",
        pad_token: str = "pad",
        model_max_length: int = 512,
        **kwargs,
    ):
        """
        Args:
            vocab: Path to a JSON file mapping token strings to integer ids.
            unk_token: Token whose id is used for out-of-vocabulary tokens.
            eos_token: End-of-sequence token.
            pad_token: Token that stands in for a space inside a sequence.
            model_max_length: Stored on the instance; not enforced here.
            **kwargs: Accepted for signature compatibility; currently unused.
        """
        self.vocab = self.load_json(vocab)
        self.unk_token = unk_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.model_max_length = model_max_length

        # Forward (token -> id) and reverse (id -> token) lookup tables.
        self.encoder = self.vocab
        self.decoder = {v: k for k, v in self.encoder.items()}

        # NOTE(review): the base-class initialiser is deliberately left disabled,
        # exactly as in the original draft.
        # super().__init__(unk_token=unk_token, eos_token=eos_token, pad_token=pad_token, model_max_length=model_max_length, **kwargs)

    def load_json(self, path: str) -> Dict:
        """Read and return the JSON document stored at ``path`` (decoded as UTF-8)."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def preprocess_sequence(self, sequence, pad_token=None):
        """
        Replace every space in the sequence with a space-delimited pad token.

        Args:
            sequence (str): The input sequence.
            pad_token (str, optional): Token to insert in place of each space.
                Defaults to the tokenizer's own ``pad_token``.

        Returns:
            str: The sequence with each ``' '`` replaced by ``' <pad_token> '``.
        """
        if pad_token is None:
            pad_token = self.pad_token
        return sequence.replace(' ', f' {pad_token} ')

    def split_number(self, number: str) -> List[str]:
        """
        Split a number into single-character tokens
        (e.g. 2.0375 -> ['2', '.', '0', '3', '7', '5']).
        """
        return list(number)

    def split_variable(self, variable: str) -> List[str]:
        """
        Split a variable such as x_0 into single-character tokens
        (e.g. x_0 -> ['x', '_', '0']).
        """
        return list(variable)

    def tokenize(self, text: str, vocab: Optional[dict] = None) -> List[str]:
        """
        Tokenize an STL formula string.

        Args:
            text (str): The string to tokenize.
            vocab (dict, optional): Unused. The token inventory is fixed by the
                regular expression below; the parameter is kept only so that
                existing two-argument calls keep working.

        Returns:
            List[str]: The extracted tokens, with numbers and variables split
            into single characters.
        """
        pad = self.pad_token

        # Make spaces explicit so they survive tokenization as pad tokens.
        text = self.preprocess_sequence(text, pad)

        # The pad alternative comes first so that a pad token containing
        # operator characters would not be pre-empted by a shorter alternative.
        token_pattern = re.compile(
            '(' + re.escape(pad) + '|'  # Space placeholder
            r'always|eventually|until|'  # Temporal operators
            r'and|or|not|'  # Logical operators
            r'x_\d+|'  # Variables (e.g., x_0)
            r'-?\d+\.\d+|-?\d+|'  # Numbers (integers or decimals with sign)
            r'<=|>=|<|>|=|'  # Relational operators
            r'\[|\]|\(|\)|,|inf)'  # Special symbols
        )

        tokens = token_pattern.findall(text)

        # Numbers and variables are expanded character by character; every
        # other token is kept whole.
        expanded_tokens = []
        for token in tokens:
            if self._NUMBER_RE.match(token):
                expanded_tokens.extend(self.split_number(token))
            elif self._VARIABLE_RE.match(token):
                expanded_tokens.extend(self.split_variable(token))
            else:
                expanded_tokens.append(token)
        return expanded_tokens

    def code(self, tokens):
        """Map tokens to vocabulary ids, using the unknown-token id for misses."""
        return [self.encoder.get(token, self.encoder[self.unk_token]) for token in tokens]

    def decode(self, token_ids):
        """
        Turn a list of ids back into a string.

        Ids missing from the vocabulary become the unknown token; the tokens are
        concatenated and every occurrence of the pad token is turned back into a
        space.
        """
        tokens = [self.decoder.get(token_id, self.unk_token) for token_id in token_ids]
        return "".join(tokens).replace(self.pad_token, ' ')

In [None]:
# Build the tokenizer from the v2 vocabulary file.
# NOTE(review): VOCAB_FILES_NAMES and the later cells point at tokenizer.json,
# not tokenizer-v2.json — confirm which file is intended.
tokenizer = STLTokenizer(vocab = "tokenizer_files/tokenizer-v2.json")

In [None]:
# Inspect the token -> id mapping that was loaded from disk.
print(tokenizer.vocab)

In [None]:
# Tokenize the first example formula. The second argument is declared as a
# dict, so pass the loaded vocabulary rather than a file path (the regex
# tokenizer does not consult it either way).
test = tokenizer.tokenize(sequences[0], tokenizer.vocab)
print(test)

In [None]:
# Convert the tokens to their vocabulary ids.
encoded = tokenizer.code(test)

In [None]:
# Round trip: ids back to a formula string.
decoded = tokenizer.decode(encoded)
print(decoded)

In [None]:
# Original formula, for visual comparison with the decoded output above.
print(sequences[0])

# TO-DO

Capire se è necessario salvare l'output della codifica (`code`) e della decodifica (`decode`) come dizionari in un file `json`.

In [None]:
import re

# Vocabulary: token string -> integer id
vocab = {'unk': 0, 'pad': 1, '/s': 2, 's': 3, '(': 4, ')': 5, 'always': 6, 'eventually': 7, 
         'until': 8, 'and': 9, 'or': 10, 'not': 11, '>=': 12, '<=': 13, '>': 14, '<': 15, 
         '=': 16, 'x': 17, '_': 18, '[': 19, ']': 20, ',': 21, 'inf': 22, '-': 23, '.': 24, 
         '0': 25, '1': 26, '2': 27, '3': 28, '4': 29, '5': 30, '6': 31, '7': 32, '8': 33, '9': 34}

def tokenize_vocabulary_based(sequence, vocab):
    """
    Greedy longest-match tokenization driven by a vocabulary.

    At each position the longest substring that is a key of ``vocab`` is
    emitted as a token. If nothing matches, ``'unk'`` is emitted and a single
    character is skipped.

    Args:
        sequence (str): Text to tokenize.
        vocab: Container of valid token strings (e.g. a token -> id dict).

    Returns:
        list[str]: The tokens in order of appearance.
    """
    # No candidate longer than the longest vocabulary entry can match, so the
    # search window is capped instead of rescanning to the end of the sequence.
    longest = max(map(len, vocab), default=0)

    tokens = []
    i = 0
    while i < len(sequence):
        upper = min(len(sequence), i + longest)
        # Try the longest candidate first and shrink from the right.
        for j in range(upper, i, -1):
            subtoken = sequence[i:j]
            if subtoken in vocab:
                tokens.append(subtoken)
                i = j
                break
        else:
            # Nothing in the vocabulary starts here: emit 'unk', skip one char.
            tokens.append('unk')
            i += 1
    return tokens

# Sequence to tokenize (spaces already replaced by the pad token)
sequence = "(padnotpad(padx_1pad<=pad0.2988pad)paduntil[11,21]padx_0pad<=pad-0.7941pad)"

# Tokenization
tokenized_sequence = tokenize_vocabulary_based(sequence, vocab)
print("Tokenized sequence:", tokenized_sequence)

# Map each token to its id, falling back to the 'unk' id
token_ids = [vocab.get(token, vocab['unk']) for token in tokenized_sequence]
print("Token IDs:", token_ids)

In [None]:
# Round-trip check: decode the ids with the tokenizer instance built earlier.
tokenizer.decode(token_ids)

# NEW 

In [33]:
import re
import json
from typing import Any, Dict, List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)


def load_json(path: str) -> Union[Dict, List]:
    """
    Load a JSON document from disk.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON content (a dict or a list at the top level).
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

class STLTokenizer(PreTrainedTokenizer):
    """
    Vocabulary-driven tokenizer for STL formula strings.

    Spaces are replaced by the pad token, then the text is split by greedy
    longest-match against the keys of the loaded vocabulary. Substrings that
    match no vocabulary entry produce the unknown token, one character at a
    time.
    """

    def __init__(self, vocab_path: str, unk_token: str = "unk", pad_token: str = "pad", bos_token: str = "/s", eos_token: str = "s"):
        """
        Args:
            vocab_path: Path to a JSON file mapping token strings to ids.
            unk_token: Token used for text that matches no vocabulary entry.
            pad_token: Token that stands in for a space inside a sequence.
            bos_token: Beginning-of-sequence token.
            eos_token: End-of-sequence token.

        NOTE(review): the bos/eos defaults look swapped with respect to the
        usual "<s>" / "</s>" naming — confirm against the model that consumes
        these ids before relying on them.
        NOTE(review): the base-class initialiser is not called here.
        """
        self.vocab = load_json(vocab_path)
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.id_to_token = {v: k for k, v in self.vocab.items()}  # Reverse mapping

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    def prepad_sequence(self, sequence, undo = False):
        """
        Swap spaces and pad tokens in a sequence.

        Args:
            sequence (str): The input sequence.
            undo (bool): When False, every space becomes the pad token; when
                True, every pad token becomes a space.

        Returns:
            str: The sequence with the substitution applied.
        """
        if undo:
            return sequence.replace(f'{self.pad_token}', ' ')
        else:
            return sequence.replace(' ', f'{self.pad_token}')

    def tokenize(self, text: str) -> List[str]:
        """
        Split ``text`` into vocabulary tokens by greedy longest match.

        Args:
            text: Formula string; spaces are converted to pad tokens first.

        Returns:
            The tokens in order. Each character that starts no vocabulary entry
            yields one unknown token.
        """
        text = self.prepad_sequence(text)

        # No candidate longer than the longest vocabulary entry can match, so
        # the search window is capped instead of rescanning to the end of text.
        longest = max(map(len, self.vocab), default=0)

        tokens = []
        i = 0
        while i < len(text):
            upper = min(len(text), i + longest)
            # Try the longest candidate first and shrink from the right.
            for j in range(upper, i, -1):
                subtoken = text[i:j]
                if subtoken in self.vocab:
                    tokens.append(subtoken)
                    i = j
                    break
            else:
                tokens.append(self.unk_token)
                i += 1
        return tokens

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Map tokens to ids, using the unknown-token id for missing tokens.

        A single token string yields a single id; a list yields a list.
        """
        if isinstance(tokens, str):
            # A bare string is one token, not an iterable of characters.
            return self.vocab.get(tokens, self.vocab[self.unk_token])
        return [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]

    def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
        """
        Map ids to tokens, using the unknown token for missing ids.

        A single integer id yields a single token; a list yields a list.
        """
        if isinstance(ids, int):
            return self.id_to_token.get(ids, self.unk_token)
        return [self.id_to_token.get(i, self.unk_token) for i in ids]

    def decode(self, token_ids: List[int], skip_special_tokens: bool = False) -> str:
        """
        Turn a list of ids back into a formula string.

        Args:
            token_ids: Ids to decode.
            skip_special_tokens: Accepted for signature compatibility; it is
                not consulted, and pad tokens are always turned into spaces.

        Returns:
            The concatenated tokens with pad tokens replaced by spaces.
        """
        tokens = self.convert_ids_to_tokens(token_ids)
        decoded = "".join(tokens)
        return self.prepad_sequence(decoded, undo = True)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary to ``<save_directory>/[<prefix>-]vocab.json``.

        The directory must already exist.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        vocab_file = f"{save_directory}/{filename_prefix + '-' if filename_prefix else ''}vocab.json"
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2, ensure_ascii=False)
        return (vocab_file,)

    def get_vocab(self) -> dict:
        """Return a copy of the token -> id mapping, so callers cannot mutate the tokenizer's own table."""
        return dict(self.vocab)


# Example usage (the constructor takes the path to the vocabulary JSON file):
# tokenizer = STLTokenizer('tokenizer_files/tokenizer.json')

In [34]:
# Instantiate the vocabulary-driven tokenizer from the JSON vocabulary file.
tokenizer = STLTokenizer('tokenizer_files/tokenizer.json')

In [35]:
# Tokenization
# Pre-padded form of the same formula, kept for reference:
# sequence = "(padnotpad(padx_1pad<=pad0.2988pad)paduntil[11,21]padx_0pad<=pad-0.7941pad)"
sequence = "( not ( x_1 <= 0.2988 ) until[11,21] x_0 <= -0.7941 )"

tokens = tokenizer.tokenize(sequence)
print("Tokenized sequence:", tokens)

Tokenized sequence: ['(', 'pad', 'not', 'pad', '(', 'pad', 'x', '_', '1', 'pad', '<=', 'pad', '0', '.', '2', '9', '8', '8', 'pad', ')', 'pad', 'until', '[', '1', '1', ',', '2', '1', ']', 'pad', 'x', '_', '0', 'pad', '<=', 'pad', '-', '0', '.', '7', '9', '4', '1', 'pad', ')']


In [36]:
# Map the tokens to their vocabulary ids.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)

Token IDs: [4, 1, 11, 1, 4, 1, 17, 18, 26, 1, 13, 1, 25, 24, 27, 34, 33, 33, 1, 5, 1, 8, 19, 26, 26, 21, 27, 26, 20, 1, 17, 18, 25, 1, 13, 1, 23, 25, 24, 32, 34, 29, 26, 1, 5]


In [27]:
# Round trip: the decoded string should equal the original formula.
decoded_sequence = tokenizer.decode(token_ids)
print("Decoded sequence:", decoded_sequence)

Decoded sequence: ( not ( x_1 <= 0.2988 ) until[11,21] x_0 <= -0.7941 )


In [28]:
# Display the token -> id mapping held by the tokenizer.
tokenizer.get_vocab()

{'unk': 0,
 'pad': 1,
 '/s': 2,
 's': 3,
 '(': 4,
 ')': 5,
 'always': 6,
 'eventually': 7,
 'until': 8,
 'and': 9,
 'or': 10,
 'not': 11,
 '>=': 12,
 '<=': 13,
 '>': 14,
 '<': 15,
 '=': 16,
 'x': 17,
 '_': 18,
 '[': 19,
 ']': 20,
 ',': 21,
 'inf': 22,
 '-': 23,
 '.': 24,
 '0': 25,
 '1': 26,
 '2': 27,
 '3': 28,
 '4': 29,
 '5': 30,
 '6': 31,
 '7': 32,
 '8': 33,
 '9': 34}