In [2]:
from datasets import load_dataset, load_metric, concatenate_datasets
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import DataLoader

In [3]:
device_name = torch.cuda.get_device_name()
print(device_name)

Tesla V100-PCIE-32GB


In [4]:
device = torch.device('cuda')

# Feature Extraction

In [5]:
train_data = load_dataset('wikisql', split='train')
val_data = load_dataset('wikisql', split='validation')
test_data = load_dataset('wikisql', split='test')

In [7]:
START_TOK = '[SOS] '
def format_dataset(example):
     return {'input': START_TOK+example['question'], 'target': example['sql']['human_readable']}

In [8]:
train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)
val_data = val_data.map(format_dataset, remove_columns=val_data.column_names)
test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)

train_data[0]

{'input': '[SOS] Tell me what the notes are for South Australia ',
 'target': 'SELECT Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA'}

# Tokenization

In [9]:
CHECKPOINT = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(CHECKPOINT)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

### Finding appropriate Max_Length

In [10]:
# map article and summary len to dict as well as if sample is longer than 512 tokens
def map_to_length(x):
    x["input_len"] = len(tokenizer(x["input"]).input_ids)
    x["input_longer_128"] = int(x["input_len"] > 128)
    x["input_longer_64"] = int(x["input_len"] > 64)
    x["input_longer_32"] = int(x["input_len"] > 32)

    x["out_len"] = len(tokenizer(x["target"]).input_ids)
    x["out_longer_128"] = int(x["out_len"] > 128)
    x["out_longer_64"] = int(x["out_len"] > 64)
    x["out_longer_32"] = int(x["out_len"] > 32)
    return x

In [11]:
train_stats = train_data.map(map_to_length, num_proc=4)

Map (num_proc=4):   0%|          | 0/56355 [00:00<?, ? examples/s]

In [12]:
val_stats = val_data.map(map_to_length, num_proc=4)

Map (num_proc=4):   0%|          | 0/8421 [00:00<?, ? examples/s]

In [13]:
test_stats = test_data.map(map_to_length, num_proc=4)

Map (num_proc=4):   0%|          | 0/15878 [00:00<?, ? examples/s]

In [14]:
all_merged = concatenate_datasets([train_stats,
                                   val_stats,
                                  test_stats])

##### Some Analysis on lengths

In [15]:
def compute_and_print_stats(x, sample_size):
    if len(x["input_len"]) == sample_size:
        print(
            "Input Max: {}, Input Mean: {:.5f}, Input>32:{},  Input>128:{:.5f}, Input>64:{:.5f} \nOutput Max: {}, Output Mean:{:.5f}, Output>32:{}, Output>128:{:.5f}, Output>64:{:.5f}".format(
                max(x["input_len"]),
                sum(x["input_len"]) / sample_size,
                sum(x["input_longer_32"]) / sample_size,
                sum(x["input_longer_128"]) / sample_size,
                sum(x["input_longer_64"]) / sample_size,
                max(x["out_len"]),
                sum(x["out_len"]) / sample_size,
                sum(x["out_longer_32"]) / sample_size,
                sum(x["out_longer_128"]) / sample_size,
                sum(x["out_longer_64"]) / sample_size,
            )
        )

In [16]:
# All Data
output = all_merged.map(
  lambda x: compute_and_print_stats(x, all_merged.shape[0]), 
  batched=True,
  batch_size=-1,
)

Map:   0%|          | 0/80654 [00:00<?, ? examples/s]

Input Max: 106, Input Mean: 21.46123, Input>32:0.057914052619832866,  Input>128:0.00000, Input>64:0.00041 
Output Max: 149, Output Mean:16.98596, Output>32:0.015857861978327174, Output>128:0.00002, Output>64:0.00032


In [17]:
# Train Data
output = train_stats.map(
  lambda x: compute_and_print_stats(x, train_stats.shape[0]), 
  batched=True,
  batch_size=-1,
)

Map:   0%|          | 0/56355 [00:00<?, ? examples/s]

Input Max: 106, Input Mean: 21.44598, Input>32:0.05681838346198208,  Input>128:0.00000, Input>64:0.00041 
Output Max: 149, Output Mean:16.98566, Output>32:0.015420104693461095, Output>128:0.00004, Output>64:0.00035


In [18]:
# Val Data
output = val_stats.map(
  lambda x: compute_and_print_stats(x, val_stats.shape[0]), 
  batched=True,
  batch_size=-1,
)

Map:   0%|          | 0/8421 [00:00<?, ? examples/s]

Input Max: 83, Input Mean: 21.49792, Input>32:0.057712860705379405,  Input>128:0.00000, Input>64:0.00036 
Output Max: 78, Output Mean:16.87341, Output>32:0.014843842774017338, Output>128:0.00000, Output>64:0.00012


### Tokenizing and Padding

In [19]:
BUFFER = 2 # start end tokens
MAX_LENGTH = 64 + BUFFER

In [20]:
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], padding='max_length', max_length=MAX_LENGTH, truncation=True)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], padding='max_length', max_length=MAX_LENGTH, truncation=True)
    
    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }


    return encodings

In [21]:
finaltrain_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names, num_proc=4)
finalval_data = val_data.map(convert_to_features, batched=True, remove_columns=val_data.column_names, num_proc=4)


Map (num_proc=4):   0%|          | 0/56355 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/8421 [00:00<?, ? examples/s]

In [22]:
columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

In [23]:
finaltrain_data.set_format(type='torch', columns=columns, device=device)
finalval_data.set_format(type='torch', columns=columns, device=device)


In [24]:
finaltrain_data[1]['input_ids'], finaltrain_data[0]['input_ids'].shape

(tensor([    0, 10975,   104,  3196,   742,   653,    16,     5,   595,   651,
           147,     5,    92,   651,   880,    11,   502,  1466,   116,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1], device='cuda:0'),
 torch.Size([66]))

In [25]:
tokenizer.decode(finaltrain_data[0]['input_ids'])

'<s>[SOS] Tell me what the notes are for South Australia </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

# Training

In [26]:
model_dir = "/home/athani.sh/Config_path"

In [27]:
args = Seq2SeqTrainingArguments(model_dir,
                               dataloader_pin_memory=False,
                               fp16=True,  # Use mixed precision training (requires GPU with Tensor Cores)
                                per_device_train_batch_size=100,  # Adjust batch size based on your GPU memory
                                per_device_eval_batch_size=100,
                                gradient_accumulation_steps=2,  # Accumulate gradients to increase effective batch size
                                evaluation_strategy="steps",
                                eval_steps=500,
                                logging_steps=100,
                                save_steps=500,
                                save_total_limit=2,  # Limit the total number of checkpoints
                                load_best_model_at_end=True,)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [28]:
model = BartForConditionalGeneration.from_pretrained(CHECKPOINT, device_map=device)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=finaltrain_data,
    eval_dataset=finalval_data,
)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [29]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.1106,0.094992


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=846, training_loss=0.462212065432934, metrics={'train_runtime': 1102.2449, 'train_samples_per_second': 153.382, 'train_steps_per_second': 0.768, 'total_flos': 6644156469350400.0, 'train_loss': 0.462212065432934, 'epoch': 3.0})

In [30]:
trainer.save_model('/home/athani.sh/Model_path')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


## Generating SQL

In [31]:
def translate_to_sql(local_model, text):
    inputs = tokenizer(text, padding='longest', max_length=MAX_LENGTH, truncation=True, return_tensors='pt')
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    output = local_model.generate(input_ids, attention_mask=attention_mask, max_length=64)


    return tokenizer.decode(output[0], skip_special_tokens=True)

def generate_sql_on_test(data, local_model):
    length = data.shape[0]
    query = data['input']
    expected = data['target']
        
    for i in range(length):
        print(f"QUERY - {query[i]}")
        translated = translate_to_sql(local_model, query[i])
        print(f"Prediction - {translated}")
        print(f"Expected = {expected[i]}")
        print("="*50)

In [32]:
generate_sql_on_test(test_data.select(range(10)), model.to("cpu"))

QUERY - [SOS] What is terrence ross' nationality
Prediction - SELECT Nationality FROM table WHERE Player = Terrence Ross
Expected = SELECT Nationality FROM table WHERE Player = Terrence Ross
QUERY - [SOS] What clu was in toronto 1995-96
Prediction - SELECT Clu FROM table WHERE Year = 1995-96 AND City = Toronto
Expected = SELECT School/Club Team FROM table WHERE Years in Toronto = 1995-96
QUERY - [SOS] which club was in toronto 2003-06
Prediction - SELECT Club FROM table WHERE Venue = toronto 2003-06
Expected = SELECT School/Club Team FROM table WHERE Years in Toronto = 2003-06
QUERY - [SOS] how many schools or teams had jalen rose
Prediction - SELECT COUNT School/Team FROM table WHERE Player = Jalen Rose
Expected = SELECT COUNT School/Club Team FROM table WHERE Player = Jalen Rose
QUERY - [SOS] Where was Assen held?
Prediction - SELECT Location FROM table WHERE Team = Assen
Expected = SELECT Round FROM table WHERE Circuit = Assen
QUERY - [SOS] What was the number of race that Kevin Cur