In [1]:
from pprint import pprint
import json
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # To order device based on pci bus id
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" # As the name suggests these are the VISIBLE GPUS, you need to make use of them using cuda:0/1
# Don't set the below variables, they are always relative
# os.environ["HF_DATASETS_CACHE"] = "/$HOME/.cache/huggingface/datasets" 
# os.environ["TRANSFORMERS_CACHE"] = "/$HOME/.cache/huggingface/hub" 

In [2]:
import torch
torch.cuda.device_count()

4

## Data

### HF Spider
Note that Hugging Face's spider dataset does not work with Picard T5 because it does not serialize the DB schemas.

import datasets

dataset = datasets.load_dataset("spider")
ds_train = dataset["train"] # To get the training rows
pprint(ds_train[0:1].keys())
pprint(ds_train[0])

### Yale Spider
The original Spider dataset was downloaded from their [homepage](https://yale-lily.github.io/spider).
The 2 important files are described on their [github](https://github.com/taoyds/spider) as follows:
1. **train.json/dev.json**
    - `question`: the natural language question
    - `question_toks`: the natural language question tokens
    - `db_id`: the database id to which this question is addressed.
    - `query`: the SQL query corresponding to the question.
    - `query_toks`: the SQL query tokens corresponding to the question.
    - `sql`: parsed results of this SQL query using process_sql.py. Please refer to parsed_sql_examples.sql in thepreprocess directory for the detailed documentation.
2. **tables.json** contains the schema of all tables
    - `db_id`: database id
    - `table_names_original`: original table names stored in the database.
    - `table_names`: cleaned and normalized table names. We make sure the table names are meaningful. [to be changed]
    - `column_names_original`: original column names stored in the database. Each column looks like: [0, "id"]. 0 is the index of table names in table_names, which is city in this case. "id" is the column name.
    - `column_names`: cleaned and normalized column names. We make sure the column names are meaningful. [to be changed]
    - `column_types`: data type of each column
    - `foreign_keys`: foreign keys in the database. [3, 8] means column indices in the column_names. These two columns are foreign keys of two different tables.
    - `primary_keys`: primary keys in the database. Each number is the index of column_names.

In [3]:
spider_dir_path = "./data/spider"

print(os.listdir(spider_dir_path))
print()

train_json_filename = "train_spider.json"
print(f"---{train_json_filename}---")
with open(os.path.join(spider_dir_path, train_json_filename)) as f:
    train_json = json.load(f)
print(train_json[0].keys())
print()

tables_json_filename = "tables.json"
print(f"---{tables_json_filename}---")
with open(os.path.join(spider_dir_path, tables_json_filename)) as f:
    tables_json = json.load(f)
print(tables_json[0].keys())
print()

dev_json_filename = "dev.json"
print(f"---{dev_json_filename}---")
with open(os.path.join(spider_dir_path, dev_json_filename)) as f:
    dev_json = json.load(f)
print(dev_json[0].keys())


['train_others.json', 'dev.json', 'README.txt', 'database', '.DS_Store', 'train_spider.json', 'tables.json', 'train_gold.sql', 'dev_gold.sql']

---train_spider.json---
dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql'])

---tables.json---
dict_keys(['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original'])

---dev.json---
dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql'])


Tables json is a list, so we load in memory as a dictionary (hashtable) for quicker access.

In [4]:
tables_dict_by_db = {}
for table in tables_json:
    tables_dict_by_db[table["db_id"]] = table 
print("Length of both data structures match:", len(tables_json) == len(tables_dict_by_db))

Length of both data structures match: True


Here is an example of one entry.

In [5]:
list(tables_dict_by_db.keys())[99] #movie_1
movie_1 = tables_dict_by_db["movie_1"]
print("Databse:", movie_1["db_id"])
for key in movie_1:
    # Skip db_id since it is a string
    if key == "db_id":
        continue
    print(f"---{key}---")
    item_collated = "|".join(str(item) for item in movie_1[key])
    print(item_collated)


Databse: movie_1
---column_names---
[-1, '*']|[0, 'movie id']|[0, 'title']|[0, 'year']|[0, 'director']|[1, 'reviewer id']|[1, 'name']|[2, 'reviewer id']|[2, 'movie id']|[2, 'rating stars']|[2, 'rating date']
---column_names_original---
[-1, '*']|[0, 'mID']|[0, 'title']|[0, 'year']|[0, 'director']|[1, 'rID']|[1, 'name']|[2, 'rID']|[2, 'mID']|[2, 'stars']|[2, 'ratingDate']
---column_types---
text|number|text|number|text|number|text|number|number|number|time
---foreign_keys---
[7, 5]|[8, 1]
---primary_keys---
1|5
---table_names---
movie|reviewer|rating
---table_names_original---
Movie|Reviewer|Rating


We now try to serialize each database's schema in accordance with Tscholak (who in turn bases it of Shaw)

In [6]:
#TODO find out how to describe fields
delimiter = " | "
def serialize_spider_db(db):

    # First group column names by their table id
    columns = db["column_names"]
    column_strings = {}
    for column in columns:
        table_idx = column[0]
        if table_idx not in column_strings:
            column_strings[table_idx] = [] 
        # Note that the white spaces in column names were replaced with underscores (arbitrarily I suppose)
        column_strings[table_idx].append(column[1].replace(" ","_"))
    
    # Next combine table name with column names
    tables = db["table_names"]
    table_strings = [db["db_id"]]
    for table_idx in range(len(tables)):
        table_name = tables[table_idx]
        columns_serialized = ", ".join(column_strings[table_idx])
        table_serialized = table_name + " : " + columns_serialized
        table_strings.append(table_serialized)
    
    # Lastly combine all serialized table names together with the db id
    schema_serialized = delimiter.join(table_strings)
    return schema_serialized

serialize_spider_db(movie_1) # 'movie_1 | movie : movie id, title, year, director | reviewer : reviewer id, name | rating : reviewer id, movie id, rating stars, rating date'

'movie_1 | movie : movie_id, title, year, director | reviewer : reviewer_id, name | rating : reviewer_id, movie_id, rating_stars, rating_date'

We will no create some utility functions that will help convert the spider dataset into a testing set.

In [7]:
def test_2_preprocessed(test_case):
    db_id = test_case['db_id']
    db_schema = tables_dict_by_db[db_id]
    db_schema_serialized = serialize_spider_db(db_schema)
    question = test_case["question"]
    return delimiter.join([question, db_schema_serialized])

def test_2_answer(test_case):
    return test_case['query']

def convert_to_training(test_cases : list):
    # This takes in a list and outputs two lists
    if not isinstance(test_cases, list):
        test_cases = [test_cases]

    training_set = {
        "input" : [],
        "output": [],
    }

    def helper(test_case):
        training_set["input"].append(test_2_preprocessed(test_case))
        training_set["output"].append(test_2_answer(test_case))

    for test_case in test_cases:
        helper(test_case)

    return training_set


train_example = train_json[0]
print("Example input:", test_2_preprocessed(train_example))
print("Example output:", test_2_answer(train_example))
print("Example training data:")
pprint(convert_to_training(train_example))

Example input: How many heads of the departments are older than 56 ? | department_management | department : department_id, name, creation, ranking, budget_in_billions, num_employees | head : head_id, name, born_state, age | management : department_id, head_id, temporary_acting
Example output: SELECT count(*) FROM head WHERE age  >  56
Example training data:
{'input': ['How many heads of the departments are older than 56 ? | '
           'department_management | department : department_id, name, '
           'creation, ranking, budget_in_billions, num_employees | head : '
           'head_id, name, born_state, age | management : department_id, '
           'head_id, temporary_acting'],
 'output': ['SELECT count(*) FROM head WHERE age  >  56']}


And to get the final variables for training.

In [8]:
training_set = convert_to_training(train_json)
training_inputs = training_set['input']
training_outputs = training_set['output']
pprint(training_inputs[:2])
pprint(training_outputs[:2])

['How many heads of the departments are older than 56 ? | '
 'department_management | department : department_id, name, creation, ranking, '
 'budget_in_billions, num_employees | head : head_id, name, born_state, age | '
 'management : department_id, head_id, temporary_acting',
 'List the name, born state and age of the heads of departments ordered by '
 'age. | department_management | department : department_id, name, creation, '
 'ranking, budget_in_billions, num_employees | head : head_id, name, '
 'born_state, age | management : department_id, head_id, temporary_acting']
['SELECT count(*) FROM head WHERE age  >  56',
 'SELECT name ,  born_state ,  age FROM head ORDER BY age']


In [9]:
from datasets import Dataset
ins = [{'in':x[0],'out':x[1]} for x in zip(training_inputs, training_outputs)]
dataset = Dataset.from_list(ins)
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['in', 'out'],
        num_rows: 6300
    })
    test: Dataset({
        features: ['in', 'out'],
        num_rows: 700
    })
})

## Model
Loading model into RAM.

### Picard + T5
From Tscholak

import torch 
torch.cuda.set_device(2)
torch.cuda.current_device()

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("tscholak/cxmefzzi")
model = AutoModelForSeq2SeqLM.from_pretrained("tscholak/cxmefzzi").to("cuda")

### Baseline
Try to follow the HF tutorial with tscholak

#### Preprocessing With a Tokenizer
The example sentence was taken from the HF website.

raw_inputs = [
    "How many singers do we have? | concert_singer | stadium : stadium_id, location, name, capacity, highest, lowest, average | singer : singer_id, name, country, song_name, song_release_year, age, is_male | concert : concert_id, concert_name, theme, stadium_id, year | singer_in_concert : concert_id, singer_id",
]
# raw_inputs = map(test_2_preprocessed, train_json[0:50])
inputs = tokenizer(list(raw_inputs), padding=True, return_tensors="pt")
print(inputs["input_ids"].size())

#### Running Inference
The model generates the desired response. It seems that the SQL tokens are generally lowercase.

outputs = model.generate(inputs.input_ids, max_new_tokens=512)
# outputs = model.generate(**inputs, decoder_input_ids=decoder_inputs.input_ids, max_new_tokens=1024)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
response

### T5-Small
For practice we will try to re-train T5 using the spider dataset. Picard based itself of the [T5ForConditionalGeneration](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration).

I roughly followed the [Hugging Face fine-tuning pre-trained models guide](https://huggingface.co/docs/transformers/training#finetune-a-pretrained-model).

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Config
from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map, load_checkpoint_and_dispatch
from huggingface_hub import hf_hub_download

model_name = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto") # Device_map splits the load over multiple GPUs, this seems to be quite new
model.hf_device_map

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

{'shared': 0,
 'decoder.embed_tokens': 0,
 'encoder.embed_tokens': 0,
 'lm_head': 0,
 'encoder.block.0': 0,
 'encoder.block.1': 0,
 'encoder.block.2': 0,
 'encoder.block.3': 0,
 'encoder.block.4': 0,
 'encoder.block.5': 1,
 'encoder.block.6': 1,
 'encoder.block.7': 1,
 'encoder.block.8': 1,
 'encoder.block.9': 1,
 'encoder.block.10': 1,
 'encoder.block.11': 1,
 'encoder.final_layer_norm': 1,
 'encoder.dropout': 1,
 'decoder.block.0': 1,
 'decoder.block.1': 1,
 'decoder.block.2': 2,
 'decoder.block.3': 2,
 'decoder.block.4': 2,
 'decoder.block.5': 2,
 'decoder.block.6': 2,
 'decoder.block.7': 2,
 'decoder.block.8': 2,
 'decoder.block.9': 3,
 'decoder.block.10': 3,
 'decoder.block.11': 3,
 'decoder.final_layer_norm': 3,
 'decoder.dropout': 3}

The Dataset class supports chaining use of maps (like a monad). We tokenize the raw input strings and split into a train and test set.

Note that dataset returns data as lists, you have to [explicitly](https://discuss.huggingface.co/t/dataset-map-return-only-list-instead-torch-tensors/15767/2) set the format to tensors. This function returns nothing.

In [11]:
def tokenize_wrapper(examples):
    return tokenizer(examples["in"], text_target=examples["out"], padding='max_length', truncation=True, return_tensors="pt")
tokenized_datasets = dataset.map(tokenize_wrapper).remove_columns(["in", "out"])

tokenized_datasets.set_format("pt", columns=["input_ids"], output_all_columns=True)

small_train_dataset = tokenized_datasets["train"].shuffle()#.select(range(100))
small_test_dataset = tokenized_datasets["test"].shuffle()#.select(range(100))
tokenized_datasets

Map:   0%|          | 0/6300 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6300
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 700
    })
})

### Training on Single GPU
Note that the model is located across the vram of multiple GPUs, but I'm not sure if the training itself leverages that.

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir = "checkpoints/text2sql-t5small-spider")

import numpy as np
import evaluate
# Make the Metric
# dont forget to pip uninstall once we change to a proper metric, you should probably leave sk learn in

metric = evaluate.load("accuracy")

def compute_metric(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Make the Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    compute_metrics=compute_metric,
)

And lastly to train

In [13]:
trainer.train()



ValueError: too many values to unpack (expected 2)

### Accelerator for Training

To work with Accelerator, we have to use the native pyTorch APIs. HuggingFace provides a guide for using [native PyTorch](https://huggingface.co/docs/transformers/training#train-in-native-pytorch), and a reference for usinge these to [accelerate](https://huggingface.co/docs/transformers/accelerate#prepare-to-accelerate).

In [None]:
# # DataLoader
# from torch.utils.data import DataLoader
# from accelerate import Accelerator

# train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
# test_dataloader = DataLoader(small_test_dataset, batch_size=8)

# accelerator = Accelerator(project_dir=f"checkpoints/text2sql-{model_name}-spider-accelerate")
# type(small_train_dataset["input_ids"][0:2])
# # Optimizer and Learning Rate shceduler
# from torch.optim import AdamW

# optimizer = AdamW(model.parameters(), lr=5e-5)


# from transformers import get_scheduler

# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
# )
# train_dataloader, test_dataloader, model, optimizer = accelerator.prepare(
#     train_dataloader, test_dataloader, model, optimizer
# )
# batch.keys()
# batch["input_ids"].size()
# # Logging maybe try WandB?
# from tqdm.auto import tqdm

# progress_bar = tqdm(range(num_training_steps))

# model.train()
# for epoch in range(num_epochs):
#     for batch in train_dataloader:
#         # batch = {k: v.to(0) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         accelerator.backward(loss)

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

In [None]:
# import evaluate

# metric = evaluate.load("accuracy") # Idk what is an accuracy metric
# model.eval()
# for batch in eval_dataloader:
#     batch = {k: v.to(0) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])

# metric.compute() 

## Inferencing

This is the example given on the Picard HF website. The expected output is `SELECT COUNT (*) FROM singer`

In [None]:

raw_inputs = [
    "How many singers do we have? | concert_singer | stadium : stadium_id, location, name, capacity, highest, lowest, average | singer : singer_id, name, country, song_name, song_release_year, age, is_male | concert : concert_id, concert_name, theme, stadium_id, year | singer_in_concert : concert_id, singer_id",
]
# raw_inputs = map(test_2_preprocessed, train_json[0:50])
inputs = tokenizer(list(raw_inputs), padding=True, return_tensors="pt")
print(inputs["input_ids"].size())

In [None]:
inputs.to(0)
outputs = model.generate(inputs.input_ids, max_new_tokens=512)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
response

In [None]:
# model_inputs = tokenizer(training_inputs[0:100], text_target=training_outputs[0:100], padding=True, return_tensors="pt")

# model_inputs.keys()
# # training
# inputs = tokenizer(training_inputs[0:100], padding=True, return_tensors="pt")
# labels = tokenizer(training_outputs[0:100], padding=True, return_tensors="pt")
# outputs = model(input_ids=inputs.input_ids, labels=labels.input_ids)
# loss = outputs.loss
# logits = outputs.logits