In [1]:
from transformers import AutoConfig, AutoModel
from configuration import STLConfig
from modeling_stldec import STLDec
from handcoded_tokenizer import STLTokenizer

In [2]:
AutoConfig.register("stl-dec", STLConfig)
AutoModel.register(STLConfig, STLDec)

In [3]:
config = STLConfig()
model = AutoModel.from_config(config)
tokenizer = STLTokenizer('tokenizer_files/tokenizer.json')

In [4]:
sequence = "( not ( x_1 <= 0.2988 ) until[11,21] x_0 <= -0.7941 )"
tokenizer = STLTokenizer('tokenizer_files/tokenizer.json')
token_ids = tokenizer.encode(sequence)
# decoded_sequence = tokenizer.decode(token_ids)

# print("Original sequence: ", sequence)
print("Encoded sequence: ", token_ids)

Encoded sequence:  [2, 1, 4, 1, 11, 1, 4, 1, 17, 18, 26, 1, 13, 1, 25, 24, 27, 34, 33, 33, 1, 5, 1, 8, 19, 26, 26, 21, 27, 26, 20, 1, 17, 18, 25, 1, 13, 1, 23, 25, 24, 32, 34, 29, 26, 1, 5, 1, 3]


In [5]:
import argparse
import json
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [6]:
import os
import math
import logging
import random
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import get_scheduler
from accelerate import Accelerator
from tqdm.notebook import tqdm
from itertools import chain
from datasets.utils.logging import set_verbosity_warning, set_verbosity_info

# Initialize logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [7]:
# Example configuration variables for Jupyter notebook
args = {
    'dataset_name': None,  # or a custom dataset path
    'train_file': 'formulas_with_embeddings.csv',
    'validation_file': None,
    'output_dir': './output',
    'model_name_or_path': 'stl-dec',
    'tokenizer_name': 'stl-dec',
    'block_size': 128,
    'batch_size': 8,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 3,
    'learning_rate': 5e-5,
    'weight_decay': 0.01,
    'num_warmup_steps': 0,
    'max_train_steps': None,
    'seed': 42,
    'with_tracking': False,
    'hub_model_id': 'stl-dec',
    'push_to_hub': True,
    'trust_remote_code': True,
    'overwrite_cache': False,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'checkpointing_steps': 'epoch',  # or 'steps' with an int value
    'resume_from_checkpoint': None,
    'hub_token': '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',
}

# Initialize the accelerator
accelerator = Accelerator()

# Send telemetry for resource tracking (assuming you have this function)
# send_example_telemetry("run_clm_no_trainer", args)

# Set seed
if args['seed'] is not None:
    torch.manual_seed(args['seed'])

In [None]:
if args['push_to_hub']:
    print("yes")
else:
    print("no")

In [8]:
# Handle the repository creation
if accelerator.is_main_process:
    if args['push_to_hub']:
        # Retrieve of infer repo_name
        repo_name = args["hub_model_id"]
        if repo_name is None:
            repo_name = Path(args["output_dir"]).absolute().name
        # Create repo and retrieve repo_id
        api = HfApi()
        repo_id = api.create_repo(repo_name, exist_ok=True, token=args["hub_token"]).repo_id

        with open(os.path.join(args["output_dir"], ".gitignore"), "w+") as gitignore:
            if "step_*" not in gitignore:
                gitignore.write("step_*\n")
            if "epoch_*" not in gitignore:
                gitignore.write("epoch_*\n")
    elif args["output_dir"] is not None:
        os.makedirs(args["output_dir"], exist_ok=True)
accelerator.wait_for_everyone()

In [9]:
data_files = {}
data_args = {}

if args["train_file"] is not None:
    data_files["train"] = args["train_file"]
    file_extension = args["train_file"].split(".")[-1]
if args["validation_file"] is not None:
    data_files["validation"] = args["validation_file"]
    file_extension = args["validation_file"].split(".")[-1]
raw_datasets = load_dataset(file_extension, data_files=data_files, **data_args)

In [10]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Formula', 'Embedding'],
        num_rows: 1000
    })
})

In [33]:
def tokenize_function(examples):
    return tokenizer.encode(examples)

In [47]:
# everything is put together
# since we have encoded also BOS (2) and EOS (3) tokens, 
# it does not matter how long this string is
tokenized_datasets = tokenize_function(raw_datasets['train']['Formula'][:2])

In [70]:
def segment_list(input_list, block_size, padding_token=1):
    # Crea una lista di blocchi segmentati con padding se necessario
    # Segmenta la lista in blocchi
    segmented = [input_list[i:i + block_size] for i in range(0, len(input_list), block_size)]
    
    # Aggiungi padding all'ultimo blocco se necessario
    if len(segmented[-1]) < block_size:
        segmented[-1] = segmented[-1] + [padding_token] * (block_size - len(segmented[-1]))
    
    return segmented

In [59]:
# Se il parametro block_size non è stato specificato (cioè args.block_size è None), 
# il codice imposta block_size al valore massimo di lunghezza del modello (tokenizer.model_max_length)

if args["block_size"] is None:
    block_size = tokenizer.model_max_length
    if block_size > config.max_position_embeddings:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx."
            )
            block_size = min(1024, config.max_position_embeddings)
else:
    if args["block_size"] > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model "
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
    block_size = min(args["block_size"], tokenizer.model_max_length)

block_size = args['block_size']

In [67]:
len(segment_list(tokenized_datasets, block_size))

3

In [68]:
len(tokenized_datasets)

268

In [72]:
# segment_list(tokenized_datasets, block_size)[2]

In [73]:
lm_datasets = segment_list(tokenized_datasets, block_size)

In [74]:
train_dataset = lm_datasets

# DataLoader creation
train_dataloader = DataLoader(train_dataset, batch_size=args['per_device_train_batch_size'], shuffle=True)

# Optimizer setup
optimizer = torch.optim.AdamW(model.parameters(), lr=args['learning_rate'], weight_decay=args['weight_decay'])

# Scheduler setup
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args['gradient_accumulation_steps'])
if args['max_train_steps'] is None:
    args['max_train_steps'] = args['num_train_epochs'] * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name='linear', optimizer=optimizer, num_warmup_steps=args['num_warmup_steps'],
    num_training_steps=args['max_train_steps']
)

# Prepare everything with accelerator
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

In [76]:
# Training Loop
for epoch in range(args['num_train_epochs']):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch}")
    total_loss = 0
    for step, batch in enumerate(progress_bar):
        with accelerator.accumulate(model):
            print(batch)
            # outputs = model(**batch)
            # loss = outputs.loss
            # total_loss += loss.item()
            # accelerator.backward(loss)
            # optimizer.step()
            # lr_scheduler.step()
            # optimizer.zero_grad()

        progress_bar.set_postfix(loss=total_loss / (step + 1))

Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s]

[tensor([26,  1,  2], device='cuda:0'), tensor([20,  5,  1], device='cuda:0'), tensor([ 1,  1, 19], device='cuda:0'), tensor([17,  5,  0], device='cuda:0'), tensor([18,  1,  4], device='cuda:0'), tensor([27,  5,  1], device='cuda:0'), tensor([1, 1, 4], device='cuda:0'), tensor([13,  5,  1], device='cuda:0'), tensor([ 1,  0, 17], device='cuda:0'), tensor([25, 20, 18], device='cuda:0'), tensor([24,  1, 26], device='cuda:0'), tensor([30,  3,  1], device='cuda:0'), tensor([30,  1, 13], device='cuda:0'), tensor([30,  1,  1], device='cuda:0'), tensor([29,  1, 25], device='cuda:0'), tensor([ 1,  1, 24], device='cuda:0'), tensor([ 5,  1, 25], device='cuda:0'), tensor([ 1,  1, 30], device='cuda:0'), tensor([ 5,  1, 30], device='cuda:0'), tensor([ 1,  1, 32], device='cuda:0'), tensor([10,  1,  1], device='cuda:0'), tensor([ 1,  1, 10], device='cuda:0'), tensor([4, 1, 1], device='cuda:0'), tensor([ 1,  1, 17], device='cuda:0'), tensor([ 7,  1, 18], device='cuda:0'), tensor([19,  1, 26], device='c

Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s]

[tensor([ 1,  2, 26], device='cuda:0'), tensor([ 5,  1, 20], device='cuda:0'), tensor([ 1, 19,  1], device='cuda:0'), tensor([ 5,  0, 17], device='cuda:0'), tensor([ 1,  4, 18], device='cuda:0'), tensor([ 5,  1, 27], device='cuda:0'), tensor([1, 4, 1], device='cuda:0'), tensor([ 5,  1, 13], device='cuda:0'), tensor([ 0, 17,  1], device='cuda:0'), tensor([20, 18, 25], device='cuda:0'), tensor([ 1, 26, 24], device='cuda:0'), tensor([ 3,  1, 30], device='cuda:0'), tensor([ 1, 13, 30], device='cuda:0'), tensor([ 1,  1, 30], device='cuda:0'), tensor([ 1, 25, 29], device='cuda:0'), tensor([ 1, 24,  1], device='cuda:0'), tensor([ 1, 25,  5], device='cuda:0'), tensor([ 1, 30,  1], device='cuda:0'), tensor([ 1, 30,  5], device='cuda:0'), tensor([ 1, 32,  1], device='cuda:0'), tensor([ 1,  1, 10], device='cuda:0'), tensor([ 1, 10,  1], device='cuda:0'), tensor([1, 1, 4], device='cuda:0'), tensor([ 1, 17,  1], device='cuda:0'), tensor([ 1, 18,  7], device='cuda:0'), tensor([ 1, 26, 19], device='c

Epoch 2:   0%|          | 0/1 [00:00<?, ?it/s]

[tensor([ 2, 26,  1], device='cuda:0'), tensor([ 1, 20,  5], device='cuda:0'), tensor([19,  1,  1], device='cuda:0'), tensor([ 0, 17,  5], device='cuda:0'), tensor([ 4, 18,  1], device='cuda:0'), tensor([ 1, 27,  5], device='cuda:0'), tensor([4, 1, 1], device='cuda:0'), tensor([ 1, 13,  5], device='cuda:0'), tensor([17,  1,  0], device='cuda:0'), tensor([18, 25, 20], device='cuda:0'), tensor([26, 24,  1], device='cuda:0'), tensor([ 1, 30,  3], device='cuda:0'), tensor([13, 30,  1], device='cuda:0'), tensor([ 1, 30,  1], device='cuda:0'), tensor([25, 29,  1], device='cuda:0'), tensor([24,  1,  1], device='cuda:0'), tensor([25,  5,  1], device='cuda:0'), tensor([30,  1,  1], device='cuda:0'), tensor([30,  5,  1], device='cuda:0'), tensor([32,  1,  1], device='cuda:0'), tensor([ 1, 10,  1], device='cuda:0'), tensor([10,  1,  1], device='cuda:0'), tensor([1, 4, 1], device='cuda:0'), tensor([17,  1,  1], device='cuda:0'), tensor([18,  7,  1], device='cuda:0'), tensor([26, 19,  1], device='c

In [None]:


    # Evaluation
    model.eval()
    eval_loss = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            eval_loss += outputs.loss.item()

    eval_loss /= len(eval_dataloader)
    perplexity = math.exp(eval_loss) if eval_loss < 100 else float('inf')
    print(f"Epoch {epoch} evaluation loss: {eval_loss}, perplexity: {perplexity}")

    # Save checkpoint
    if args['checkpointing_steps'] == 'epoch' and (epoch + 1) % 1 == 0:
        output_dir = os.path.join(args['output_dir'], f"epoch_{epoch}")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

# Final model saving
model.save_pretrained(args['output_dir'])
tokenizer.save_pretrained(args['output_dir'])

print("Training completed!")
