In [1]:
base_model = 't5-small'
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration.from_pretrained(base_model)

#CKPT = 'facebook/bart-large'
#from transformers import AutoTokenizer, BartTokenizer, BartModel, BartForConditionalGeneration
#tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
#model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

from datasets import load_dataset

train_data = load_dataset('wikisql', split="train[:100%]+validation[:100%]")
#train_data = load_dataset('wikisql', split="train+validation")
test_data = load_dataset('wikisql', split='test')

def format_dataset(example):
 return {'input': 'translate to SQL: ' + example['question'], 'target': example['sql']['human_readable']}

train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)
test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)

import shutil

def remove_dir(dir_path):
    try:
        shutil.rmtree(dir_path)
        print(f"Folder '{dir_path}' has been deleted.")
    except Exception as e:
        # Ignore errors, you can print a message if needed
        print(f"Folder '{dir_path}' has been deleted.")
        


  table = cls._concat_blocks(blocks, axis=0)


In [2]:
import torch

def print_param_precision(model):
  dtypes = {}
  for _, p in model.named_parameters():
      dtype = p.dtype
      if dtype not in dtypes:
          dtypes[dtype] = 0
      dtypes[dtype] += p.numel()
  total = 0
  for k, v in dtypes.items():
      total += v
  for k, v in dtypes.items():
      print(f"{k}, {v / 10**6:.4f} M, {v / total*100:.2f} %")

def print_parameters(model):
  # Count the total parameters
  total_params = sum(p.numel() for p in model.parameters())
  print(f"Total parameters: {total_params/10**6:.4f} M")

device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"{device_map} Memory Used: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_parameters(model)
print("\nData types:")
print_param_precision(model)

cuda:0 Memory Used: 230.8145 MB

Parameters:
Total parameters: 60.5066 M

Data types:
torch.float32, 60.5066 M, 100.00 %


In [3]:
train_data

Dataset({
    features: ['input', 'target'],
    num_rows: 64776
})

In [4]:
# map article and summary len to dict as well as if sample is longer than 512 tokens
def map_to_length(x):
  x["input_len"] = len(tokenizer(x["input"]).input_ids)
  x["input_longer_256"] = int(x["input_len"] > 256)
  x["input_longer_128"] = int(x["input_len"] > 128)
  x["input_longer_64"] = int(x["input_len"] > 64)
  x["out_len"] = len(tokenizer(x["target"]).input_ids)
  x["out_longer_256"] = int(x["out_len"] > 256)
  x["out_longer_128"] = int(x["out_len"] > 128)
  x["out_longer_64"] = int(x["out_len"] > 64)
  return x

sample_size = 10000
data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)

def compute_and_print_stats(x):
  if len(x["input_len"]) == sample_size:
    print(
        "Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}".format(
            sum(x["input_len"]) / sample_size,
            sum(x["input_longer_256"]) / sample_size,
            sum(x["input_longer_128"]) / sample_size,
            sum(x["input_longer_64"]) / sample_size,   
            sum(x["out_len"]) / sample_size,
            sum(x["out_longer_256"]) / sample_size,
            sum(x["out_longer_128"]) / sample_size,
            sum(x["out_longer_64"]) / sample_size,
        )
    )

output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Input Mean: 19.8508, %-Input > 256:0.0,  %-Input > 128:0.0, %-Input > 64:0.0002 Output Mean:20.0403, %-Output > 256:0.0, %-Output > 128:0.0002, %-Output > 64:0.0005


In [5]:
# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], truncation = True, padding="max_length", max_length=128)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], truncation = True, padding="max_length", max_length=128)
   
    encodings = {
        'input_ids': input_encodings['input_ids'],
        'labels': target_encodings['input_ids']
    }

    return encodings

In [6]:
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

#columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']
columns = ['input_ids', 'labels']

train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback     

In [8]:
training_output = "trainoutput-wikisql"
remove_dir(training_output) 

training_args = Seq2SeqTrainingArguments(
    output_dir=training_output,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    #do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    report_to="tensorboard", #bypass MLflow
    overwrite_output_dir=True
    #fp16=True, 
)

Folder 'trainoutput-wikisql' has been deleted.


In [9]:
from datasets import load_metric
rouge = load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = load_metric("rouge")


In [10]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.evaluate()

In [11]:
import torch.profiler

class ProfCallback(TrainerCallback):
    def __init__(self, prof):
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        self.prof.step()
        
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU,
                                        torch.profiler.ProfilerActivity.CUDA], 
                            schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2),
                            on_trace_ready=torch.profiler.tensorboard_trace_handler(training_output),
                            profile_memory=True,
                            with_stack=True,
                            record_shapes=True) as prof:
    trainer.add_callback(ProfCallback(prof=prof))
    trainer.train()

Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,0.0997,0.080498,0.4059,0.3324,0.3585
2,0.0834,0.070028,0.5613,0.4757,0.5068
3,0.076,0.065481,0.6347,0.5491,0.5806
4,0.0742,0.063457,0.6212,0.5373,0.5682
5,0.0701,0.062872,0.6783,0.5951,0.626


[W kineto_shim.cpp:372] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:372] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:372] Profiler is not initialized: skipping step() invocation
STAGE:2023-11-29 09:45:06 5900:5900 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-29 09:45:06 5900:5900 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-29 09:45:06 5900:5900 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
STAGE:2023-11-29 09:45:10 5900:5900 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-11-29 09:45:11 5900:5900 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-11-29 09:45:11 5900:5900 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [12]:
trainer.save_model()

In [13]:
tokenizer.save_pretrained(training_output)

('trainoutput-wikisql/tokenizer_config.json',
 'trainoutput-wikisql/special_tokens_map.json',
 'trainoutput-wikisql/spiece.model',
 'trainoutput-wikisql/added_tokens.json',
 'trainoutput-wikisql/tokenizer.json')

In [14]:
trainer.create_model_card()

In [None]:
#clear kernel

In [15]:
ft_model = "trainoutput-wikisql"
#ft_model = 't5-base'
base_model = 't5-small'
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained(ft_model)
model = T5ForConditionalGeneration.from_pretrained(ft_model)
from datasets import load_dataset

test_data = load_dataset('wikisql', split='test')

def translate_to_sql(text):
    inputs = tokenizer(text, padding='longest', max_length=128, return_tensors='pt')
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    #output = model.generate(input_ids, attention_mask=attention_mask, max_length=64)
    output = model.generate(input_ids, max_length=128)
    
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [16]:
for i in range(200,300,20):
  print('translate to SQL: ' + test_data[i]['question'])
  print('Predict. :' + translate_to_sql('translate to SQL: ' + test_data[i]['question']))
  print('Expected: ' + test_data[i]['sql']['human_readable'])
  print('=================================\n')
    

text = "translate to SQL: How many model with BERT architecture are in the HuggingFace Hub?"
#text = "translate to SQL: The stenhousemuir team had how many highest attendances?"
translate_to_sql(text)

translate to SQL: Who was the winning driver when the grand Prix was at Belgian Grand Prix?
Predict. :SELECT Winning driver FROM table WHERE Grand Prix = Belgian Grand Prix
Expected: SELECT Winning Driver FROM table WHERE Grand Prix = Belgian Grand Prix

translate to SQL: Which races did Paul Greifzu win?
Predict. :SELECT Races FROM table WHERE Winner = Paul Greifzu
Expected: SELECT Race Name FROM table WHERE Winning driver = Paul Greifzu

translate to SQL: What is the % of total capacity when the generators is 4048?
Predict. :SELECT % of total capacity FROM table WHERE generators = 4048
Expected: SELECT % of total Capacity FROM table WHERE Number of Generators = 4048

translate to SQL: What is the acronym for the school whose website is ul.edu.lb
Predict. :SELECT Acronyme FROM table WHERE Website = ul.edu.lb
Expected: SELECT Acronym FROM table WHERE Website = ul.edu.lb

translate to SQL: What is the population in the city of Pomorskie?
Predict. :SELECT COUNT Population in Pomorskie
Ex

'SELECT COUNT Model FROM table WHERE BERT architecture = HuggingFace Hub'

In [None]:
!deepspeed --hostfile {myhostfile} \
--launcher pdsh \
--num_gpus {worker_gpu} --num_nodes {NUM_WORKERS} \
--master_addr {MASTER_IP} \
--ssh_port {sshd_port} {train_script} \
--model_id google/flan-t5-large \
--dataset_path data \
--epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 8 \
--generation_max_length 129 \
--lr 1e-4 \
--deepspeed {deepspeed_cfg}

In [None]:
!deepspeed --hostfile {myhostfile} \
--launcher pdsh \
--num_gpus {worker_gpu} --num_nodes {NUM_WORKERS} \
--master_addr {MASTER_IP} \
--ssh_port {sshd_port} {train_script} \
--deepspeed --deepspeed_config {deepspeed_cfg}

In [None]:
def remove_dir(dir_path):
    try:
        shutil.rmtree(dir_path)
        print(f"Folder '{dir_path}' has been deleted.")
    except Exception as e:
        # Ignore errors, you can print a message if needed
        print(f"Folder '{dir_path}' has been deleted.")
        
base_model = "bloom-1b1"
base_model_name = "bloom-1b1"
merged_model = "merged_bloom-1b1"
training_output = "training_bloom-1b1"
remove_dir(training_output) 
remove_dir(merged_model)
remove_dir(trainlogs)

In [None]:
main_cmd = subprocess.Popen([f'bash -c "{cml_cmd}" '], shell=True)
main_cmd.communicate()

In [None]:
# Please restart the iPython kernel manually.

In [None]:
!deepspeed --hostfile {myhostfile} \
--launcher pdsh \
--num_gpus {worker_gpu} --num_nodes {NUM_WORKERS} \
--master_addr {MASTER_IP} \
--ssh_port {sshd_port} run_translation.py \
--model_name_or_path t5-large --per_device_train_batch_size 1 \
--deepspeed {deepspeed_cfg} \
--output_dir output_dir --overwrite_output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro