In [None]:
# Run configuration for the notebook.
# Checkpoint name handed to from_pretrained for both the model and the tokenizer;
# uncomment one of the alternatives below to switch to a smaller variant.
base_model = 't5-large' #770 million param
#base_model = 't5-base' #220 million param
#base_model = 't5-small' #60 million param
# Dataset identifier passed to load_dataset.
dataset = 'wikisql'
# Directory that receives the tokenized train/test/eval splits (cleared before each save).
dataset_path = "wikisql_tok_dataset"
# Presumably the trainer's output directory; not referenced in the visible cells - TODO confirm.
training_output = "trainoutput-wikisql"
# Presumably the train / eval batch sizes and the number of epochs for a later
# training cell; none of the three is referenced in the visible cells - TODO confirm.
batch_size = 32
eval_size = 32
epoch_num = 3

from datasets import load_dataset, load_from_disk
import shutil
import os
import torch
import numpy as np
import evaluate
import torch
from transformers import (AutoTokenizer,
                          GenerationConfig,
                          T5ForConditionalGeneration,
                          Seq2SeqTrainer, 
                          Seq2SeqTrainingArguments, 
                          TrainerCallback,
                          EarlyStoppingCallback,
                          is_tensorboard_available,
                          DataCollatorForSeq2Seq)

# Set before any tokenizer is instantiated; presumably to silence the tokenizers
# fork/parallelism warning - TODO confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Load the pretrained checkpoint and its matching tokenizer from the same name.
model = T5ForConditionalGeneration.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Generation settings derived from the model config, then overridden:
# at most 128 newly generated tokens and a minimum length of 1.
# gen_cfg is not used in the visible cells; presumably consumed at train/eval time.
gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.max_new_tokens = 128
gen_cfg.min_length = 1

# Load each split in full; the "[:100%]" slice can be lowered to work on a subset.
train_data = load_dataset(dataset, split="train[:100%]")
eval_data = load_dataset(dataset, split="validation[:100%]")
test_data = load_dataset(dataset, split="test[:100%]")

def format_dataset(example):
    """Turn one raw example into a prompt/target text pair.

    Args:
        example: Mapping with a 'question' string and an 'sql' mapping that
            holds the query text under 'human_readable'.

    Returns:
        dict with 'input' (the question prefixed with the task instruction)
        and 'target' (the human-readable SQL string).
    """
    prompt = 'translate to SQL: ' + example['question']
    sql_text = example['sql']['human_readable']
    return {'input': prompt, 'target': sql_text}

# Rewrite every split into 'input' / 'target' text pairs, dropping the raw
# columns. Processing order is train, test, eval.
train_data, test_data, eval_data = (
    split.map(format_dataset, remove_columns=split.column_names)
    for split in (train_data, test_data, eval_data)
)

def remove_dir(dir_path):
    """Delete a directory tree on a best-effort basis and report what happened.

    Args:
        dir_path: Path of the directory to remove.

    Returns:
        None. The outcome is printed to stdout.

    This function never raises: a missing directory or a failed deletion is
    reported instead, so callers can use it to clear an output folder that may
    not exist yet.
    """
    try:
        shutil.rmtree(dir_path)
    except FileNotFoundError:
        # Nothing to clean up; do not claim a deletion that never happened.
        print(f"Folder '{dir_path}' does not exist; nothing to delete.")
    except Exception as e:
        # Still best-effort, but surface the reason instead of reporting success.
        print(f"Folder '{dir_path}' could not be deleted: {e}")
    else:
        print(f"Folder '{dir_path}' has been deleted.")

In [None]:
def print_param_precision(model):
    """Print how the model's parameters are distributed across dtypes.

    One line is printed per dtype, in order of first appearance, giving the
    element count in millions and its percentage of all parameter elements.

    Args:
        model: Object exposing named_parameters(), yielding (name, parameter)
            pairs whose parameters provide .dtype and .numel().
    """
    numel_by_dtype = {}
    for _name, param in model.named_parameters():
        numel_by_dtype[param.dtype] = numel_by_dtype.get(param.dtype, 0) + param.numel()

    grand_total = sum(numel_by_dtype.values())

    for dtype, count in numel_by_dtype.items():
        print(f"{dtype}, {count / 10**6:.4f} M, {count / grand_total*100:.2f} %")

def print_parameters(model):
    """Print the model's total parameter element count, in millions.

    Args:
        model: Object exposing parameters(), an iterable of items that
            provide .numel().
    """
    element_count = 0
    for param in model.parameters():
        element_count += param.numel()
    print(f"Total parameters: {element_count/10**6:.4f} M")

# Preferred device string: the first CUDA device when available, else the CPU.
# Not otherwise used in the visible cells; presumably consumed later - TODO confirm.
device_map = "cuda:0" if torch.cuda.is_available() else "cpu"

# Label the footprint with the device the weights actually sit on. The model is
# not moved anywhere in the visible cells, so labelling with `device_map` would
# report "cuda:0" for weights that are still in host memory.
print(f"{model.device} Memory Used: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_parameters(model)
print("\nData types:")
print_param_precision(model)

In [None]:
# Display the training split as a quick sanity check of its columns and row count.
train_data

In [None]:
# Find the training prompt that needs the least padding, i.e. the longest
# tokenized input (presumably to help pick max_length for the tokenization step).
# A separate tokenizer and a fresh copy of the raw split keep this probe from
# touching the objects used by the main pipeline.
tokenizer2 = AutoTokenizer.from_pretrained(base_model)
train_data2 = load_dataset(dataset, split="train[:100%]")

# format_dataset is the function defined in the setup cell; it is reused here
# rather than redefined.
formatted_dataset = train_data2.map(format_dataset, remove_columns=train_data2.column_names)

# padding=True pads every row to the longest row in the batch, so all rows of
# the resulting tensor share one length.
tokenized_dataset = tokenizer2(formatted_dataset["input"], padding=True, truncation=True, return_tensors="pt")

input_id_matrix = tokenized_dataset["input_ids"]
padded_length = input_id_matrix.shape[1]

# Count padding per row using the tokenizer's own pad id instead of assuming 0.
pad_counts = (input_id_matrix == tokenizer2.pad_token_id).sum(dim=1)

# argmin returns the first index among ties, matching a first-strict-minimum scan.
min_zeros_entry = int(torch.argmin(pad_counts))
min_zeros_count = int(pad_counts[min_zeros_entry])
non_zeros_count = padded_length - min_zeros_count

# Print the result
print(f"The entry with the least number of zero padding is at index {min_zeros_entry}")
print(f"Total non-padded items of the tensor index {min_zeros_entry} is {non_zeros_count}")
print(f"Length of tensor at index {min_zeros_entry}: {padded_length}")

In [None]:
def convert_to_features(example_batch, max_length=96):
    """Tokenize a batch of prompt/target pairs into fixed-length id sequences.

    Intended for use with Dataset.map(..., batched=True).

    Args:
        example_batch: Mapping holding parallel lists of strings under
            'input' (prompts) and 'target' (expected outputs).
        max_length: Length every sequence is truncated or padded to.
            Defaults to 96, the value this notebook has always used.

    Returns:
        dict with 'input_ids' (tokenized prompts) and 'labels' (tokenized
        targets), each a list of id lists of length `max_length`.
    """
    # padding="max_length" already yields fixed-width rows, so the deprecated
    # pad_to_max_length flag is dropped; calling the tokenizer directly replaces
    # the legacy batch_encode_plus entry point.
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=max_length)
    input_encodings = tokenizer(example_batch['input'], **encode_kwargs)
    target_encodings = tokenizer(example_batch['target'], **encode_kwargs)

    # NOTE(review): attention masks are not exported and 'labels' keep the
    # tokenizer's pad ids. If training should ignore padding, the mask must be
    # exported and pad ids in the labels replaced with -100 - confirm against
    # the training cell before changing.
    return {
        'input_ids': input_encodings['input_ids'],
        'labels': target_encodings['input_ids'],
    }

In [None]:
# Clear any previously saved copy before writing the new one.
remove_dir(dataset_path)

# Stage 1: tokenize each split, dropping the text columns it was built from.
train_data, test_data, eval_data = (
    split.map(convert_to_features, batched=True, remove_columns=split.column_names)
    for split in (train_data, test_data, eval_data)
)

# Stage 2: expose only these columns, as torch tensors.
# (Attention masks are not among the exported columns.)
columns = ['input_ids', 'labels']
for split in (train_data, test_data, eval_data):
    split.set_format(type='torch', columns=columns)

# Stage 3: write each split to its own sub-directory of dataset_path.
for subdir, split in (("train", train_data), ("test", test_data), ("eval", eval_data)):
    split.save_to_disk(os.path.join(dataset_path, subdir))