In [1]:
# Checkpoint identifier passed to the tokenizer/model `from_pretrained` calls below.
base_model = 'google/flan-t5-large'
# Directory under which the tokenized "train" and "eval" splits are saved and later reloaded.
dataset_path = "wikisql_tok_dataset"

import logging
import os
import shutil

import numpy as np
import torch
from datasets import load_dataset
from datasets import load_from_disk
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Load the tokenizer and the T5 conditional-generation model for the checkpoint named by `base_model`.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration.from_pretrained(base_model)

# Disabled alternative: the same setup with a BART checkpoint.
#CKPT = 'facebook/bart-large'
#from transformers import AutoTokenizer, BartTokenizer, BartModel, BartForConditionalGeneration
#tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
#model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Training data is the full train split concatenated with the full validation split;
# the test split is kept separate and is later saved and used as the evaluation set.
train_data = load_dataset('wikisql', split="train[:100%]+validation[:100%]")
# Disabled alternative spelling of the split expression above.
#train_data = load_dataset('wikisql', split="train+validation")
test_data = load_dataset('wikisql', split='test')

def format_dataset(example):
    """Turn one WikiSQL record into a prompt/target text pair.

    Args:
        example: Mapping with a 'question' string and an 'sql' mapping that
            contains a 'human_readable' string.

    Returns:
        dict with 'input' (the question prefixed with 'translate to SQL: ')
        and 'target' (the human-readable SQL text).
    """
    prompt = 'translate to SQL: ' + example['question']
    sql_text = example['sql']['human_readable']
    return {'input': prompt, 'target': sql_text}

# Replace every original column with the 'input'/'target' text pair built by format_dataset.
train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)
test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)

def remove_dir(dir_path):
    """Delete the directory tree at ``dir_path`` on a best-effort basis.

    Errors are never propagated to the caller; a one-line status message is
    printed instead. The message now reflects what actually happened rather
    than always claiming the folder was deleted.

    Args:
        dir_path: Path of the directory to remove.
    """
    try:
        shutil.rmtree(dir_path)
    except FileNotFoundError:
        # Nothing to clean up; this is the normal case on a first run.
        print(f"Folder '{dir_path}' does not exist; nothing to delete.")
    except Exception as e:
        # Stay best-effort (no re-raise), but report the failure truthfully.
        print(f"Folder '{dir_path}' could not be deleted: {e}")
    else:
        print(f"Folder '{dir_path}' has been deleted.")

  table = cls._concat_blocks(blocks, axis=0)


In [2]:
def print_param_precision(model):
    """Print how the model's parameters are distributed over dtypes.

    One line is printed per dtype, in first-seen order: the dtype, the number
    of elements in millions, and that dtype's percentage of all elements.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` exposes ``dtype`` and ``numel()``.
    """
    counts_by_dtype = {}
    for _name, param in model.named_parameters():
        counts_by_dtype[param.dtype] = counts_by_dtype.get(param.dtype, 0) + param.numel()

    grand_total = sum(counts_by_dtype.values())

    for dtype, count in counts_by_dtype.items():
        print(f"{dtype}, {count / 10**6:.4f} M, {count / grand_total*100:.2f} %")

def print_parameters(model):
    """Print the total number of parameter elements of ``model`` in millions.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``.
    """
    n_params = 0
    for param in model.parameters():
        n_params += param.numel()
    print(f"Total parameters: {n_params/10**6:.4f} M")

# Device label used only in the printout below. Nothing in this cell moves the model
# to that device, so the label does not by itself say where the weights live.
device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
# The footprint value is divided by 1024**2 and labelled as MB.
print(f"{device_map} Memory Used: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_parameters(model)
print("\nData types:")
print_param_precision(model)

cuda:0 Memory Used: 2987.4805 MB

Parameters:
Total parameters: 783.1501 M

Data types:
torch.float32, 783.1501 M, 100.00 %


In [3]:
# Display the dataset summary (columns and row count) after formatting.
train_data

Dataset({
    features: ['input', 'target'],
    num_rows: 64776
})

In [4]:
def map_to_length(x):
    """Annotate one example with token counts and length-threshold flags.

    Uses the notebook-level ``tokenizer``. For both the 'input' and 'target'
    texts it stores the number of token ids ('input_len' / 'out_len') and, for
    each threshold in 256, 128, 64, a 0/1 flag telling whether the count
    exceeds that threshold ('input_longer_<n>' / 'out_longer_<n>').

    Args:
        x: Example mapping with 'input' and 'target' strings; modified in place.

    Returns:
        The same mapping ``x`` with the extra keys added.
    """
    thresholds = (256, 128, 64)

    x["input_len"] = len(tokenizer(x["input"]).input_ids)
    for limit in thresholds:
        x[f"input_longer_{limit}"] = int(x["input_len"] > limit)

    x["out_len"] = len(tokenizer(x["target"]).input_ids)
    for limit in thresholds:
        x[f"out_longer_{limit}"] = int(x["out_len"] > limit)

    return x

# Number of leading training examples used for the length statistics.
sample_size = 10000
# Annotate the first `sample_size` examples with token lengths, using 4 worker processes.
data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)

def compute_and_print_stats(x):
    """Print mean token lengths and over-threshold rates for a full-sample batch.

    Does nothing unless the batch holds exactly ``sample_size`` examples
    (``sample_size`` is the notebook-level constant). Every printed value is a
    column sum divided by ``sample_size``; the values after the "%-" labels are
    therefore fractions of the sample, not multiplied by 100.

    Args:
        x: Batch mapping of column name to list of per-example values, as
            produced by ``map_to_length``.

    Returns:
        None.
    """
    if len(x["input_len"]) != sample_size:
        return

    stat_columns = (
        "input_len",
        "input_longer_256",
        "input_longer_128",
        "input_longer_64",
        "out_len",
        "out_longer_256",
        "out_longer_128",
        "out_longer_64",
    )
    averages = [sum(x[column]) / sample_size for column in stat_columns]

    template = "Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}"
    print(template.format(*averages))

# Run the statistics printer in batched mode; batch_size=-1 is intended to hand it the
# whole sample as a single batch, which is the only case in which it prints.
output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Input Mean: 19.8971, %-Input > 256:0.0,  %-Input > 128:0.0, %-Input > 64:0.0002 Output Mean:20.0403, %-Output > 256:0.0, %-Output > 128:0.0002, %-Output > 64:0.0005


In [5]:
def convert_to_features(example_batch, tok=None, max_length=128):
    """Tokenize a batch of 'input'/'target' texts into model features.

    Both texts are truncated and padded to ``max_length``. Padding positions in
    the labels are replaced with -100, the value the loss ignores, so the
    model is not trained to predict padding. The encoder attention mask is
    returned as well so padded input positions can be masked out.

    Args:
        example_batch: Mapping with 'input' and 'target' lists of strings.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
            It must be callable on a list of strings and expose ``pad_token_id``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        per-example lists of length ``max_length``.
    """
    tok = tokenizer if tok is None else tok

    input_encodings = tok(example_batch['input'], truncation=True, padding="max_length", max_length=max_length)
    target_encodings = tok(example_batch['target'], truncation=True, padding="max_length", max_length=max_length)

    # Mask label padding so it does not contribute to the training loss.
    pad_id = tok.pad_token_id
    labels = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in target_encodings['input_ids']
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }

In [6]:
# Tokenize both splits and drop the raw text columns.
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

# Expose the model inputs that the tokenization step actually produced. 'attention_mask'
# is included whenever it exists so padded positions can be masked out during training;
# selecting only from existing columns keeps set_format from failing on a missing one.
columns = [name for name in ('input_ids', 'attention_mask', 'labels') if name in train_data.column_names]

train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

# Persist the tokenized splits; the test split is stored under the name "eval".
train_data.save_to_disk(os.path.join(dataset_path,"train"))
test_data.save_to_disk(os.path.join(dataset_path,"eval"))

Saving the dataset (0/1 shards):   0%|          | 0/64776 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15878 [00:00<?, ? examples/s]

In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import TrainerCallback, EarlyStoppingCallback, is_tensorboard_available
import os

In [8]:
# Directory that receives checkpoints and the final model; it is wiped first so each
# run starts from an empty folder.
training_output = "trainoutput-wikisql"
remove_dir(training_output)

# Evaluation and checkpointing both happen once per epoch; load_best_model_at_end
# relies on those two strategies being the same.
training_args = Seq2SeqTrainingArguments(
    output_dir=training_output,
    per_device_train_batch_size=32,
    num_train_epochs=5, # Below 5 will result in failed inference.
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    report_to="tensorboard", #bypass MLflow
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    # NOTE(review): fp16 presumably requires a CUDA device — confirm before running on CPU.
    fp16=True, #lower VRAM utilization
    #bf16=True, #not working for every GPU
)

Folder 'trainoutput-wikisql' has been deleted.


In [9]:
def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with ``metric``.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element holds the generated ids.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions and
        references.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some model outputs carry extra tensors; the generated ids come first.
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
    # Map it to the pad token in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # NOTE(review): `nltk` and `metric` are not imported or defined in the visible cells;
    # they must be provided (e.g. a ROUGE metric object) before evaluation runs, otherwise
    # the lines below raise NameError.
    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

In [10]:
def custom_rewrite_logs(d, mode):
    """Extract the loss entry relevant to ``mode`` and re-key it as 'combined/loss'.

    For mode 'eval' the source key is 'eval_loss', for 'test' it is
    'test_loss', and for 'train' it is 'loss'. Every other entry is dropped,
    and any other mode yields an empty dict.

    Args:
        d: Mapping of log names to values.
        mode: One of 'eval', 'test' or 'train'.

    Returns:
        A new dict that is either empty or holds the single key 'combined/loss'.
    """
    prefix_by_mode = {'eval': "eval_", 'test': "test_"}
    rewritten = {}
    for key, value in d.items():
        prefix = prefix_by_mode.get(mode)
        if prefix is not None and key.startswith(prefix):
            metric_name = key[len(prefix):]
        elif mode == 'train':
            metric_name = key
        else:
            continue
        if metric_name == 'loss':
            rewritten["combined/" + metric_name] = value
    return rewritten


# Module-level logger used by the callback to report values it cannot log as scalars.
logger = logging.getLogger(__name__)


class CombinedTensorBoardCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that sends the training and evaluation loss to
    [TensorBoard](https://www.tensorflow.org/tensorboard) through two separate writers
    ("train" and "eval") under the shared tag "combined/loss".

    Args:
        tb_writers (`dict`, *optional*):
            Mapping from mode name ("train", "eval") to the writer to use for that mode.
            Writers are instantiated on demand if not set.
    """

    def __init__(self, tb_writers=None):
        if not is_tensorboard_available():
            raise RuntimeError(
                "TensorBoardCallback requires tensorboard to be installed. Either update your PyTorch version or"
                " install tensorboardX."
            )
        # Prefer the writer shipped with PyTorch and fall back to tensorboardX.
        try:
            from torch.utils.tensorboard import SummaryWriter  # noqa: F401

            self._SummaryWriter = SummaryWriter
        except ImportError:
            try:
                from tensorboardX import SummaryWriter

                self._SummaryWriter = SummaryWriter
            except ImportError:
                self._SummaryWriter = None
        self.tb_writers = tb_writers

    def _init_summary_writer(self, args, log_dir=None):
        """Create the "train" and "eval" writers below ``log_dir`` (default: ``args.logging_dir``)."""
        log_dir = log_dir or args.logging_dir
        if self._SummaryWriter is not None:
            self.tb_writers = dict(train=self._SummaryWriter(log_dir=os.path.join(log_dir, 'train')),
                                   eval=self._SummaryWriter(log_dir=os.path.join(log_dir, 'eval')))

    def on_train_begin(self, args, state, control, **kwargs):
        """Create the writers if needed and record the run arguments and model config."""
        if not state.is_world_process_zero:
            return

        log_dir = None

        if state.is_hyper_param_search:
            trial_name = state.trial_name
            if trial_name is not None:
                log_dir = os.path.join(args.logging_dir, trial_name)

        if self.tb_writers is None:
            self._init_summary_writer(args, log_dir)

        if self.tb_writers is None:
            # No SummaryWriter implementation could be imported; nothing to write to.
            return

        for k, tbw in self.tb_writers.items():
            tbw.add_text("args", args.to_json_string())
            if "model" in kwargs:
                model = kwargs["model"]
                if hasattr(model, "config") and model.config is not None:
                    model_config_json = model.config.to_json_string()
                    tbw.add_text("model_config", model_config_json)
            # Version of TensorBoard coming from tensorboardX does not have this method.
            if hasattr(tbw, "add_hparams"):
                tbw.add_hparams(args.to_sanitized_dict(), metric_dict={})

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write the loss found in ``logs`` to the writer of the matching mode."""
        if not state.is_world_process_zero:
            return

        if self.tb_writers is None:
            self._init_summary_writer(args)

        if self.tb_writers is None:
            return

        logs = {} if logs is None else logs

        for tbk, tbw in self.tb_writers.items():
            # The writer key doubles as the mode that selects which loss entry to keep.
            logs_new = custom_rewrite_logs(logs, mode=tbk)
            for k, v in logs_new.items():
                if isinstance(v, (int, float)):
                    tbw.add_scalar(k, v, state.global_step)
                else:
                    logger.warning(
                        "Trainer is attempting to log a value of "
                        f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
                        "This invocation of Tensorboard's writer.add_scalar() "
                        "is incorrect so we dropped this attribute."
                    )
            tbw.flush()

    def on_train_end(self, args, state, control, **kwargs):
        """Close all writers, if any were created, and forget them."""
        if self.tb_writers:
            for tbw in self.tb_writers.values():
                tbw.close()
        self.tb_writers = None

In [11]:
# NOTE(review): the patience of 5 evaluations equals num_train_epochs=5 with per-epoch
# evaluation, so early stopping presumably cannot trigger before training ends — confirm intent.
early_stopping = EarlyStoppingCallback(early_stopping_patience= 5, 
                                    early_stopping_threshold= 0.001)
# Reload the tokenized splits that were saved under `dataset_path`.
train_dataset = load_from_disk(os.path.join(dataset_path, "train"))
eval_dataset = load_from_disk(os.path.join(dataset_path, "eval"))

# CombinedTensorBoardCallback is passed as a class here, not as an instance.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks= [early_stopping,CombinedTensorBoardCallback]
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
#trainer.evaluate()

In [None]:
import torch.profiler

class ProfCallback(TrainerCallback):
    """Trainer callback that advances a profiler once per training step."""

    def __init__(self, prof):
        # Profiler object whose step() is called after every training step.
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the profiler by one step."""
        self.prof.step()
        
# Train under the PyTorch profiler, recording CPU and CUDA activity. The schedule skips
# the first 3 steps, then runs wait=1 / warmup=1 / active=2 cycles, twice. Traces are
# written to `training_output` via the TensorBoard trace handler.
# Re-running this cell registers another ProfCallback on the same trainer.
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU,
                                        torch.profiler.ProfilerActivity.CUDA], 
                            schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2),
                            on_trace_ready=torch.profiler.tensorboard_trace_handler(training_output),
                            profile_memory=True,
                            with_stack=True,
                            record_shapes=True) as prof:
    trainer.add_callback(ProfCallback(prof=prof))
    trainer.train()

Epoch,Training Loss,Validation Loss


[W kineto_shim.cpp:372] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:372] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:372] Profiler is not initialized: skipping step() invocation
STAGE:2023-11-30 00:52:12 2184:2184 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-30 00:52:14 2184:2184 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-30 00:52:14 2184:2184 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
STAGE:2023-11-30 00:52:31 2184:2184 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-30 00:52:33 2184:2184 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-30 00:52:33 2184:2184 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [None]:
# Persist the trained model through the trainer (no explicit target path is passed).
trainer.save_model()

In [None]:
# Save the tokenizer files into the directory that is also the trainer's output_dir.
tokenizer.save_pretrained(training_output)

In [None]:
# Generate a model card for the run through the trainer.
trainer.create_model_card()

In [None]:
#clear kernel

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
from datasets import load_dataset

# Directory holding the fine-tuned model and tokenizer. It is written out literally
# (same value as the training output directory) so this cell also runs on a fresh
# kernel, where no variable from the training cells exists.
ft_model = "trainoutput-wikisql"
# To evaluate an untuned checkpoint instead, point ft_model at a base checkpoint id:
#base_model = 't5-small'
#ft_model = base_model
tokenizer = AutoTokenizer.from_pretrained(ft_model)
model = T5ForConditionalGeneration.from_pretrained(ft_model)

# Raw (untokenized) test split used for the spot checks below.
test_data = load_dataset('wikisql', split='test')

def translate_to_sql(text):
    """Generate SQL text for a prompt with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Prompt string, expected to already carry the 'translate to SQL: ' prefix.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # truncation=True is required for max_length to actually cap the input length.
    inputs = tokenizer(text, padding='longest', truncation=True, max_length=128, return_tensors='pt')
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)
    # Pass the attention mask so any padded positions are ignored by the encoder.
    output = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=128)

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Spot-check every 20th test example from index 200 up to (not including) 300:
# print the prompt, the model's prediction and the reference SQL.
for idx in range(200, 300, 20):
    sample = test_data[idx]
    prompt = 'translate to SQL: ' + sample['question']
    print(prompt)
    print('Predict. :' + translate_to_sql(prompt))
    print('Expected: ' + sample['sql']['human_readable'])
    print('=================================\n')
    

# Free-form prompt that is not taken from the dataset; the cell displays the generated SQL.
text = "translate to SQL: How many model with BERT architecture are in the HuggingFace Hub?"
# Disabled alternative prompt.
#text = "translate to SQL: The stenhousemuir team had how many highest attendances?"
translate_to_sql(text)