<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FineTuning_T2SQL_GNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Pytorch & other libraries
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

#FlashAttention only supports Ampere GPUs or newer. #NEED A100 , L4  IN GOOGLE COLAB
!pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# Uncomment only if you're using A100 GPU
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet

!pip install mistral_inference -q

!pip install trl==0.8.6 -q


!pip install torch-geometric -q
!pip install sqlparse networkx -q

!pip install bitsandbytes -q

In [4]:
import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

#print(access_token_write)

login(
  token=access_token_write,
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
     pipeline,
)
from trl import SFTTrainer

In [6]:
# set device
device = 'cuda'

In [7]:
torch.__version__

'2.3.1+cu121'

In [8]:
!python --version
!nvcc --version
!nvidia-smi

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Sat Jul 13 04:37:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   48C    P8              17W /  72W |      4MiB / 23034MiB |      0%      Default |
|                       

MISTRAL

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format


from huggingface_hub import login


login(
  token=access_token_write,
  add_to_git_credential=True
)

print()

# Hugging Face model id
model_id = "mistralai/Mistral-7B-Instruct-v0.3" #24 JUNE 2024

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True)
tokenizer.padding_side = 'right' # to prevent warnings

# We redefine the pad_token and pad_token_id with out of vocabulary token (unk_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GNN

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch_geometric.data import Data, Batch # Import Batch here
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool # Import global_mean_pool here
import torch.optim as optim
from tqdm.auto import tqdm
import evaluate
import numpy as np


# 1. Graph Construction
def sql_to_graph(question, schema, answer):
    # TODO: Replace with actual conversion logic (this is the most crucial part)
    # Example: You might use SQL parsing libraries and heuristics to extract entities and relationships
    nodes = ["SELECT", "*", "FROM", "table1", "WHERE", "column1", ">", "5"]
    edges = [(0, 1), (0, 3), (3, 5), (5, 6), (6, 7)]
    node_features = torch.eye(len(nodes))
    edge_features = torch.ones(len(edges), 1)

    # Attempt to convert the answer to an integer. If it fails, assume it's a string and assign a default value.
    try:
        answer_tensor = torch.tensor([int(answer)])
    except ValueError:
        answer_tensor = torch.tensor([0])  # Replace 0 with a suitable default value or encoding for string answers

    answer_tokens = answer.split()
    answer_tensor = torch.tensor([0] * len(answer_tokens))  # Replace 0 with appropriate token indices

    return Data(x=node_features, edge_index=torch.tensor(edges).t().contiguous(), y=answer_tensor)

class SQLGraphDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        entry = self.data[index]
        question = entry["question"]
        schema = entry["context"]
        answer = entry["answer"]
        return sql_to_graph(question, schema, answer)

# 1. Placeholder Conversion Function
def convert_to_gnn(dataset):
    # TODO: Replace with actual conversion logic
    # This should iterate through the dataset and call sql_to_graph for each entry

    dataset_gnn = []
    for i in tqdm(range(len(dataset)), desc="Converting to GNN"):
        question = dataset[i]["question"]
        schema = dataset[i]["context"]
        answer = dataset[i]["answer"]
        graph = sql_to_graph(question, schema, answer)
        dataset_gnn.append(graph)
    return dataset_gnn

# 2. GNN Model

from torch_geometric.nn import GATConv

class SQLGNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=8):
        super(SQLGNN, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=0.6)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=0.6)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch)  # Global Mean Pooling for graph-level representation
        return x

# 3. Load and Prepare Data
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=42).select(range(12500))

# Convert to GNN format
dataset_gnn = convert_to_gnn(dataset)

# Split dataset
train_size = int(0.8 * len(dataset_gnn))
train_dataset = dataset_gnn[:train_size]
val_dataset = dataset_gnn[train_size:]

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=Batch.from_data_list) # Use the correct collate function
val_loader = DataLoader(val_dataset, batch_size=4, collate_fn=Batch.from_data_list) # Use the correct collate function

# 4. Initialize Model, Loss, and Optimizer
input_dim = train_dataset[0].num_node_features
hidden_dim = 64
output_dim = 128 # Assuming a generation task for simplicity
model = SQLGNN(input_dim, hidden_dim, output_dim)


# For generation:
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding in loss calculation

# For classification:
# criterion = torch.nn.BCEWithLogitsLoss()  # Or other suitable loss

optimizer = optim.Adam(model.parameters(), lr=0.001)

# 5. Training and Evaluation Functions
bleu = evaluate.load("bleu")
def train(model, loader, optimizer, epoch, num_epochs):
    model.train()
    total_loss = 0
    loop = tqdm(loader, total=len(loader), desc=f"Epoch {epoch+1}/{num_epochs}")
    for data in loop:
        optimizer.zero_grad()
        out = model(data) ## GNN MODEL

        #mistral_output = model(data)
        #input_ids = torch.randint(0, tokenizer.vocab_size, (mistral_output.size(0), 10))

        # Prepare Mistral model inputs (assuming you want to generate text)
        #mistral_inputs = tokenizer(input_ids, return_tensors="pt")


      # Adjust target labels to match the output after pooling
        # Calculate the actual batch size based on the number of graphs in the batch
        batch_size = data.batch.max().item() + 1
        # Flatten the target labels and select only the relevant ones
        target = data.y.view(-1)[:batch_size]

        # Generation task loss calculation
        loss = criterion(out, target)  # Remove the view(-1, out.size(-1))

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item(), avg_loss=total_loss / (loop.n + 1))

def evaluate(model, loader, epoch):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_references = []

    loop = tqdm(loader, desc="Evaluating")
    with torch.no_grad():
        for data in loop:
            out = model(data)

            # Adjust target labels to match the output after pooling
            # Calculate the actual batch size based on the number of graphs in the batch
            batch_size = data.batch.max().item() + 1
            # Flatten the target labels and select only the relevant ones
            target = data.y.view(-1)[:batch_size]

            # Generation task loss calculation
            loss = criterion(out, target)  # Remove the view(-1, out.size(-1))

  # Decode predictions and references back to strings
    all_predictions_decoded = [[str(p)] for p in all_predictions]  # Wrap each prediction in a list
    all_references_decoded = [[str(r)] for r in all_references]  # Wrap each reference in a list

    bleu_scores = [bleu.compute(predictions=p, references=r)['bleu'] for p, r in zip(all_predictions_decoded, all_references_decoded)]



    #bleu_scores = [bleu.compute(predictions=[p], references=[r])['bleu'] for p, r in zip(all_predictions, all_references)]
    avg_bleu = np.mean(bleu_scores)

    #return total_loss / len(loader), avg_bleu
    return total_loss / len(loader), avg_bleu, None, None  # Return two extra None values

print('\n\n')
print('Training & Evaluation')
print('\n')

# Training Loop (with tqdm progress bar for epochs)
num_epochs = 3
for epoch in tqdm(range(num_epochs), desc="Overall Training Progress"):
    train(model, train_loader, optimizer, epoch, num_epochs)
    val_loss, val_bleu, _, _ = evaluate(model, val_loader, epoch)
    print(f"Epoch {epoch+1}/{num_epochs}: Val Loss = {val_loss:.4f}, Val BLEU = {val_bleu:.4f}")
    print('\n')


Converting to GNN:   0%|          | 0/12500 [00:00<?, ?it/s]




Training & Evaluation




Overall Training Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3:   0%|          | 0/2500 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/625 [00:00<?, ?it/s]

Epoch 1/3: Val Loss = 0.0000, Val BLEU = nan




Epoch 2/3:   0%|          | 0/2500 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/625 [00:00<?, ?it/s]

Epoch 2/3: Val Loss = 0.0000, Val BLEU = nan




Epoch 3/3:   0%|          | 0/2500 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/625 [00:00<?, ?it/s]

Epoch 3/3: Val Loss = 0.0000, Val BLEU = nan


