In [None]:
# --- Experiment configuration -------------------------------------------------
# Checkpoint to fine-tune. Smaller alternatives: 't5-base' (220 million
# parameters) and 't5-small' (60 million parameters).
base_model = 't5-large'  # 770 million parameters

# Dataset identifier handed to `load_dataset`, followed by the local folders
# used for the tokenized splits and for the trainer output.
dataset = 'wikisql'
dataset_path, training_output = "wikisql_tok_dataset", "trainoutput-wikisql"

# Per-device batch sizes (training, evaluation) and the number of epochs.
batch_size = eval_size = 32
epoch_num = 3

from datasets import load_dataset, load_from_disk
import shutil
import os
import torch
import numpy as np
import evaluate
import torch
from transformers import (AutoTokenizer,
                          GenerationConfig,
                          T5ForConditionalGeneration,
                          Seq2SeqTrainer, 
                          Seq2SeqTrainingArguments, 
                          TrainerCallback,
                          EarlyStoppingCallback,
                          is_tensorboard_available,
                          DataCollatorForSeq2Seq)

# Set the TOKENIZERS_PARALLELISM switch to 'false' for this process.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Tokenizer and pretrained weights both come from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration.from_pretrained(base_model)

# Generation settings start from the model config and are then overridden.
gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.min_length = 1
gen_cfg.max_new_tokens = 128

# Load the complete train / validation / test splits, in that order.
train_data, eval_data, test_data = (
    load_dataset(dataset, split=split_spec)
    for split_spec in ("train[:100%]", "validation[:100%]", "test[:100%]")
)

def format_dataset(example):
    """Turn one raw example into an ``input`` / ``target`` text pair.

    The question is prefixed with the task prompt ``'translate to SQL: '`` and
    the target is the string stored at ``example['sql']['human_readable']``.
    """
    prompt = 'translate to SQL: ' + example['question']
    sql_text = example['sql']['human_readable']
    return {'input': prompt, 'target': sql_text}

# Replace every original column of a split with the prompt/target pair built by
# `format_dataset`; the same conversion is applied to each split in turn.
def _as_text_pairs(split):
    return split.map(format_dataset, remove_columns=split.column_names)

train_data = _as_text_pairs(train_data)
test_data = _as_text_pairs(test_data)
eval_data = _as_text_pairs(eval_data)

def remove_dir(dir_path):
    """Delete the directory tree at ``dir_path`` on a best-effort basis.

    The function never raises. The outcome is reported on stdout, with a
    distinct message for each case so a failed deletion is no longer reported
    as a success.

    Args:
        dir_path: Path of the folder to remove recursively.
    """
    try:
        shutil.rmtree(dir_path)
    except FileNotFoundError:
        # Nothing to clean up; this is the normal case on a first run.
        print(f"Folder '{dir_path}' does not exist, nothing to delete.")
    except Exception as e:
        # Deliberately non-fatal, but say what really happened.
        print(f"Folder '{dir_path}' could not be deleted: {e}")
    else:
        print(f"Folder '{dir_path}' has been deleted.")

In [None]:
def print_param_precision(model):
    """Print, per parameter dtype, the element count in millions and its share.

    One line is printed for each dtype found in ``model.named_parameters()``,
    in order of first appearance. Nothing is printed for a model without
    parameters.
    """
    counts_by_dtype = {}
    for _name, param in model.named_parameters():
        counts_by_dtype[param.dtype] = counts_by_dtype.get(param.dtype, 0) + param.numel()

    grand_total = sum(counts_by_dtype.values())
    for dtype, count in counts_by_dtype.items():
        print(f"{dtype}, {count / 10**6:.4f} M, {count / grand_total*100:.2f} %")

def print_parameters(model):
    """Print the total number of elements in ``model.parameters()``, in millions."""
    param_count = 0
    for param in model.parameters():
        param_count += param.numel()
    print(f"Total parameters: {param_count/10**6:.4f} M")

# Device label: "cuda:0" when CUDA is available, otherwise "cpu".
device_map = "cpu" if not torch.cuda.is_available() else "cuda:0"

# Report the model footprint (bytes converted to MB), then the parameter count
# and the dtype breakdown.
footprint_mb = model.get_memory_footprint() / 1024**2
print(f"{device_map} Memory Used: {footprint_mb:.4f} MB")

print("\nParameters:")
print_parameters(model)

print("\nData types:")
print_param_precision(model)

In [None]:
# Bare expression: the notebook renders the repr of `train_data` as this cell's output.
train_data

In [None]:
# Find the entry with the least amount of padding.
tokenizer2 = AutoTokenizer.from_pretrained(base_model)
train_data2 = load_dataset(dataset, split="train[:100%]")

# `format_dataset` from the setup cell is reused here rather than redefined.
formatted_dataset = train_data2.map(format_dataset, remove_columns=train_data2.column_names)
# Pad every prompt to the longest one so all rows share the same length. The
# tokenizer loaded in this cell is the one actually used.
tokenized_dataset = tokenizer2(formatted_dataset["input"], padding=True, truncation=True, return_tensors="pt")

# Count padding through the attention mask (0 on padded positions) instead of
# comparing token ids with a hard-coded 0.
padding_counts = (tokenized_dataset["attention_mask"] == 0).sum(dim=1)
padded_length = tokenized_dataset["input_ids"].shape[1]

# `torch.argmin` returns the first index on ties, like the previous strict `<` scan.
min_zeros_entry = int(torch.argmin(padding_counts))
min_zeros_count = int(padding_counts[min_zeros_entry])
non_zeros_count = padded_length - min_zeros_count

# Print the result. The length comes from the tensor shape, not from a
# variable left over from a loop.
print(f"The entry with the least number of zero padding is at index {min_zeros_entry}")
print(f"Total non-padded items of the tensor index {min_zeros_entry} is {non_zeros_count}")
print(f"Length of tensor at index {min_zeros_entry}: {padded_length}")

In [None]:
def convert_to_features(example_batch, max_length=96):
    """Tokenize a batch of prompt/target pairs into fixed-length model features.

    Args:
        example_batch: Mapping with an ``'input'`` list (prompts) and a
            ``'target'`` list (expected SQL strings), as produced by
            ``format_dataset`` when mapped in batched mode.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` for the encoder and
        ``'labels'`` for the decoder. Padding positions in ``'labels'`` are set
        to -100 so they are ignored by the loss.
    """
    # `padding="max_length"` already pads to `max_length`; the deprecated
    # `pad_to_max_length` flag was redundant and has been dropped.
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['input'], truncation=True, padding="max_length", max_length=max_length
    )
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target'], truncation=True, padding="max_length", max_length=max_length
    )

    # Mask the label padding; otherwise the loss is also computed on pad tokens.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encodings['input_ids']
    ]

    encodings = {
        'input_ids': input_encodings['input_ids'],
        # Lets the encoder ignore the padded tail of each prompt.
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }

    return encodings

In [None]:
# Start from an empty dataset folder so stale splits are never mixed in.
remove_dir(dataset_path)

# Tokenize every split with the same feature conversion, dropping the text columns.
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)
eval_data = eval_data.map(convert_to_features, batched=True, remove_columns=eval_data.column_names)

# Expose the attention mask next to ids and labels whenever the conversion
# produced one. A hard-coded ['input_ids', 'labels'] list would hide it and the
# encoder would attend to padding.
columns = [name for name in ('input_ids', 'attention_mask', 'labels') if name in train_data.column_names]

# Same formatting and on-disk layout for all three splits.
for split_name, split in (("train", train_data), ("test", test_data), ("eval", eval_data)):
    split.set_format(type='torch', columns=columns)
    split.save_to_disk(os.path.join(dataset_path, split_name))

In [None]:
# Clear the output directory before configuring the run.
remove_dir(training_output)

training_args = Seq2SeqTrainingArguments(
    # --- output / bookkeeping ---
    output_dir=training_output,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # --- optimisation ---
    num_train_epochs=epoch_num,  # author's note: below 5 will result in failed inference
    per_device_train_batch_size=batch_size,  # author's note: above 64 results in no training log and higher loss
    per_device_eval_batch_size=eval_size,  # author's note: lower due to the eval set being smaller than the train set
    # Half precision stays off for T5-large, see
    # https://discuss.huggingface.co/t/t5-variants-return-training-loss-0-and-validation-loss-nan-while-fine-tuning/30839
    # (bf16=True was tried too; author's note: not working for every GPU).
    fp16=False,
    # --- evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- generation during evaluation ---
    predict_with_generate=True,  # author's note: False will increase VRAM and potentially OOM
    generation_config=gen_cfg,
    generation_max_length=128,
    generation_num_beams=4,
    # --- logging ---
    report_to="tensorboard",  # author's note: bypass MLflow
    logging_dir=f"{training_output}/logs",
    logging_strategy="steps",
    logging_steps=500,
)

In [None]:
from datasets import load_metric
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the `evaluate` package. It is kept because `compute_metrics` relies
# on the `.mid` aggregate of this metric's output; confirm before migrating.
rouge = load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for generated sequences.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids) as arrays.

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    # -100 marks ignored positions and is not a valid token id, so it is mapped
    # to the pad token before decoding. `np.where` builds new arrays, which
    # leaves `pred.label_ids` untouched, and the same cleanup is applied to the
    # predictions.
    pred_ids = np.where(pred.predictions != -100, pred.predictions, pad_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    # Special tokens (padding included) are stripped while decoding.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
def custom_rewrite_logs(d, mode):
    """Extract the loss for one mode and re-key it as ``combined/loss``.

    Args:
        d: Log dictionary; iterated through ``d.items()``.
        mode: ``'train'`` reads key ``loss``, ``'eval'`` reads ``eval_loss`` and
            ``'test'`` reads ``test_loss``. Any other mode matches nothing.

    Returns:
        ``{"combined/loss": value}`` when the matching key is present,
        otherwise an empty dict.
    """
    if mode == 'train':
        wanted_key = 'loss'
    elif mode in ('eval', 'test'):
        wanted_key = mode + '_loss'
    else:
        wanted_key = None

    return {
        "combined/loss": value
        for key, value in d.items()
        if wanted_key is not None and key == wanted_key
    }


class CombinedTensorBoardCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that sends the loss to [TensorBoard](https://www.tensorflow.org/tensorboard).

    Only the loss is forwarded (see `custom_rewrite_logs`), always under the tag `combined/loss`.
    Each mode has its own writer, logging into the `train` and `eval` sub-directories of the
    logging directory.

    Args:
        tb_writers (`dict` mapping mode name to `SummaryWriter`, *optional*):
            The writers to use. They are instantiated on first use if not set.
    """

    def __init__(self, tb_writers=None):
        """Resolve a `SummaryWriter` implementation and store the optional writers.

        Raises:
            RuntimeError: If no TensorBoard integration is available.
        """
        if not is_tensorboard_available():
            raise RuntimeError(
                "TensorBoardCallback requires tensorboard to be installed. Either update your PyTorch version or"
                " install tensorboardX."
            )
        # Prefer the PyTorch writer and fall back to tensorboardX.
        try:
            from torch.utils.tensorboard import SummaryWriter  # noqa: F401
        except ImportError:
            try:
                from tensorboardX import SummaryWriter
            except ImportError:
                SummaryWriter = None
        self._SummaryWriter = SummaryWriter
        self.tb_writers = tb_writers

    def _init_summary_writer(self, args, log_dir=None):
        """Create the `train` and `eval` writers under `log_dir` (default: `args.logging_dir`)."""
        log_dir = log_dir or args.logging_dir
        if self._SummaryWriter is not None:
            self.tb_writers = dict(train=self._SummaryWriter(log_dir=os.path.join(log_dir, 'train')),
                                   eval=self._SummaryWriter(log_dir=os.path.join(log_dir, 'eval')))

    def on_train_begin(self, args, state, control, **kwargs):
        """Create the writers if needed and record the run arguments and model config."""
        if not state.is_world_process_zero:
            return

        log_dir = None

        if state.is_hyper_param_search:
            trial_name = state.trial_name
            if trial_name is not None:
                log_dir = os.path.join(args.logging_dir, trial_name)

        if self.tb_writers is None:
            self._init_summary_writer(args, log_dir)

        # No writer class could be imported, so there is nothing to write to.
        if self.tb_writers is None:
            return

        for k, tbw in self.tb_writers.items():
            tbw.add_text("args", args.to_json_string())
            if "model" in kwargs:
                model = kwargs["model"]
                if hasattr(model, "config") and model.config is not None:
                    model_config_json = model.config.to_json_string()
                    tbw.add_text("model_config", model_config_json)
            # Version of TensorBoard coming from tensorboardX does not have this method.
            if hasattr(tbw, "add_hparams"):
                tbw.add_hparams(args.to_sanitized_dict(), metric_dict={})

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write the loss found in `logs` to the writer of the matching mode."""
        # Local import: the notebook defines no module-level `logger`, so the
        # warning below used to raise NameError.
        import logging

        if not state.is_world_process_zero:
            return

        if self.tb_writers is None:
            self._init_summary_writer(args)

        if self.tb_writers is None:
            return

        for tbk, tbw in self.tb_writers.items():
            # `logs` defaults to None; treat that as "nothing to log".
            logs_new = custom_rewrite_logs(logs or {}, mode=tbk)
            for k, v in logs_new.items():
                if isinstance(v, (int, float)):
                    tbw.add_scalar(k, v, state.global_step)
                else:
                    logging.getLogger(__name__).warning(
                        "Trainer is attempting to log a value of "
                        f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
                        "This invocation of Tensorboard's writer.add_scalar() "
                        "is incorrect so we dropped this attribute."
                    )
            tbw.flush()

    def on_train_end(self, args, state, control, **kwargs):
        """Close every writer and forget them."""
        # Writers never exist on processes that returned early in `on_train_begin`.
        if self.tb_writers is None:
            return
        for tbw in self.tb_writers.values():
            tbw.close()
        self.tb_writers = None

In [None]:
# Tokenized splits written to disk by the preprocessing cell.
train_dataset = load_from_disk(os.path.join(dataset_path, "train"))
eval_dataset = load_from_disk(os.path.join(dataset_path, "eval"))

early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.055,
    early_stopping_patience=6,
)

# The TensorBoard callback is passed as a class, exactly as before.
trainer_callbacks = [early_stopping, CombinedTensorBoardCallback]

# `compute_metrics` is intentionally not passed (author's note: it slows down
# training and evaluation).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=trainer_callbacks,
)

In [None]:
#trainer.evaluate()

In [None]:
import torch.profiler
import time

class ProfCallback(TrainerCallback):
    """Trainer callback that advances a profiler once per training step."""

    def __init__(self, prof):
        # Any object exposing a `step()` method; stored as-is.
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        """Tell the stored profiler that one step has finished."""
        profiler = self.prof
        profiler.step()

start_time = time.time()

# Profiler settings, named individually so the `with` line stays readable.
profiled_activities = [
    torch.profiler.ProfilerActivity.CPU,
    torch.profiler.ProfilerActivity.CUDA,
]
profile_schedule = torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2)
trace_handler = torch.profiler.tensorboard_trace_handler(training_output)

with torch.profiler.profile(
    activities=profiled_activities,
    schedule=profile_schedule,
    on_trace_ready=trace_handler,
    profile_memory=True,
    with_stack=True,
    record_shapes=True,
) as prof:
    # The callback steps the profiler at the end of every training step.
    trainer.add_callback(ProfCallback(prof=prof))
    trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

# The same message goes to the notebook output and to a text file.
timing_message = f"Training took {elapsed_time:.2f} seconds"
print(timing_message)
with open("training_time.txt", "w") as file:
    file.write(timing_message)

In [None]:
# Persist the fine-tuned model through the trainer (no explicit path is passed here).
trainer.save_model()

In [None]:
# Store the tokenizer in the training output directory; the inference cell loads it from there.
tokenizer.save_pretrained(training_output)

In [None]:
# Ask the trainer to generate a model card for this run.
trainer.create_model_card()

In [None]:
#Reset kernel if OOM

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import torch
from torch import cuda

# Directory holding the fine-tuned model and tokenizer.
ft_model = "trainoutput-wikisql"

# Printed for information only: the model below is loaded on the CPU regardless.
device_map = "cpu" if not torch.cuda.is_available() else "cuda:0"
print(device_map)

tokenizer = AutoTokenizer.from_pretrained(ft_model)
# GPU alternative: AutoModelForSeq2SeqLM.from_pretrained(ft_model, device_map=device_map)
model = AutoModelForSeq2SeqLM.from_pretrained(ft_model, device_map="cpu")

test_data = load_dataset('wikisql', split='test')

def translate_to_sql(text):
    """Generate text for ``text`` with the fine-tuned model.

    Args:
        text: Prompt string (the callers in this notebook prefix the question
            with ``'translate to SQL: '``).

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # `truncation=True` is required for `max_length` to cap the prompt length.
    # Tensors follow the model's device instead of a hard-coded "cpu".
    inputs = tokenizer(
        text, padding='longest', truncation=True, max_length=128, return_tensors='pt'
    ).to(model.device)

    # Pass the attention mask so any padded positions are ignored by the encoder.
    output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=96,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

def print_param_precision(model):
    """Print, per parameter dtype, the element count in millions and its share.

    One line is printed for each dtype found in ``model.named_parameters()``,
    in order of first appearance. Nothing is printed for a model without
    parameters.
    """
    counts_by_dtype = {}
    for _name, param in model.named_parameters():
        counts_by_dtype[param.dtype] = counts_by_dtype.get(param.dtype, 0) + param.numel()

    grand_total = sum(counts_by_dtype.values())
    for dtype, count in counts_by_dtype.items():
        print(f"{dtype}, {count / 10**6:.4f} M, {count / grand_total*100:.2f} %")

def print_parameters(model):
    """Print the total number of elements in ``model.parameters()``, in millions."""
    param_count = 0
    for param in model.parameters():
        param_count += param.numel()
    print(f"Total parameters: {param_count/10**6:.4f} M")

# Report the parameter count and dtype breakdown of the model loaded in this cell.
print("\nParameters:")
print_parameters(model)
print("\nData types:")
print_param_precision(model)

  from .autonotebook import tqdm as notebook_tqdm


cuda:0

Parameters:
Total parameters: 737.6681 M

Data types:
torch.float32, 737.6681 M, 100.00 %


In [2]:
import time

start_time = time.time()

# Every second test example from index 10 up to (not including) 20.
for i in range(10, 20, 2):
    example = test_data[i]
    question = example['question']
    print(f'Test Instruction: {question}')
    # The prediction is computed after the instruction line is printed, as before.
    prediction = translate_to_sql('translate to SQL: ' + question)
    print(f'Model Prediction: {prediction}')
    print(f"Expected Answer: {example['sql']['human_readable']}")
    print('=================================\n')

end_time = time.time()
elapsed_time = end_time - start_time

# The same message goes to the notebook output and to a text file.
timing_message = f"Inference took {elapsed_time:.2f} seconds"
print(timing_message)
with open("Inference_time.txt", "w") as file:
    file.write(timing_message)

Test Instruction: How many different nationalities do the players of New Jersey Devils come from?




Model Prediction: COUNT Nationality FROM table WHERE NHL team = New Jersey Devils
Expected Answer: SELECT COUNT Nationality FROM table WHERE NHL team = New Jersey Devils

Test Instruction: What is the nationality of the player from Vancouver Canucks?
Model Prediction: SELECT Nationality FROM table WHERE NHL team = Vancouver Canucks
Expected Answer: SELECT Nationality FROM table WHERE NHL team = Vancouver Canucks

Test Instruction: When were the ships launched that were laid down on september 1, 1964?
Model Prediction: SELECT Launched FROM table WHERE Laid Down = september 1, 1964
Expected Answer: SELECT Launched FROM table WHERE Laid down = September 1, 1964

Test Instruction: List the # for ships commissioned on september 30, 1967.
Model Prediction: SELECT # FROM table WHERE Commissioned = september 30, 1967
Expected Answer: SELECT # FROM table WHERE Commissioned = September 30, 1967

Test Instruction:  What could a spanish coronel be addressed as in the commonwealth military?
Model P