<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/NEWSOLUTION_FINE_TUNING_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for a Google Colab runtime.
# NOTE(review): the commands below run in order and several packages are
# installed more than once; a later command can change the version resolved by
# an earlier one, so do not reorder without re-testing.

# Dataset loading (datasets) and graph utilities (networkx)
!pip install datasets networkx -q

# PyTorch Geometric, which provides the GAT model used later in the notebook
!pip install torch_geometric -q


# Install PyTorch and TensorBoard
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers accelerate evaluate bitsandbytes --quiet

# FlashAttention only supports Ampere-or-newer GPUs; on Google Colab that
# means an A100 or L4 runtime.
!pip install -U flash-attn --no-build-isolation --quiet


# Parameter-efficient fine-tuning (peft) and TRL plus its build helpers
! pip install peft --quiet
! pip install trl ninja packaging --quiet

# NOTE(review): flash-attn is already installed unconditionally above, so the
# commented-out command below is redundant.
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet

!pip install mistral_inference -q

# trl was installed unpinned above; this pins it to 0.8.6.
!pip install trl==0.8.6 -q


# Repeat installs of packages already requested above (torch-geometric,
# networkx); sqlparse is the only new package here.
!pip install torch-geometric -q
!pip install sqlparse networkx -q

# Repeat install of bitsandbytes (already requested with the Hugging Face libraries)
!pip install bitsandbytes -q


# Remove torchvision and reinstall it without the pip cache
# (presumably to match the installed torch build; confirm).
!pip uninstall -y torchvision -q
!pip install torchvision --no-cache-dir -q
import evaluate

!pip install -U sentence-transformers

In [None]:
import json

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import DataLoader, Dataset
from torch_geometric.nn import GAT
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from trl import setup_chat_format


In [None]:
def _load_spacy_pipeline(model_name):
    """Return the spaCy pipeline *model_name*, downloading it on first use."""
    try:
        return spacy.load(model_name)
    except OSError:
        # spaCy raises OSError when the model package is not installed yet.
        spacy.cli.download(model_name)
    return spacy.load(model_name)


# English pipeline used for dependency parsing of the questions
nlp = _load_spacy_pipeline("en_core_web_sm")


# 1. Load and Prepare Data
dataset = load_dataset("b-mc2/sql-create-context", split="train").shuffle(seed=42)

# train_test_split() only produces two splits ("train" and "test"), so the
# three-way split is done in two steps: 70% train, then the remaining 30% is
# halved into validation and test (15% each).
_train_vs_holdout = dataset.train_test_split(test_size=0.3, seed=42)
_val_vs_test = _train_vs_holdout["test"].train_test_split(test_size=0.5, seed=42)
train_dataset = _train_vs_holdout["train"]
val_dataset = _val_vs_test["train"]
test_dataset = _val_vs_test["test"]

# Alternative: load previously exported splits from disk instead.
#train_dataset = load_dataset("json", data_files="/content/drive/MyDrive/datasets/train_dataset_gnn.json", split="train")
#test_dataset = load_dataset("json", data_files="/content/drive/MyDrive/datasets/test_dataset_gnn.json", split="train")

# 2. Load Mistral Model and Tokenizer
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

### MODEL CONFIG
# Fetch only the configuration; instantiating the whole model just to read
# `.config` would load a second, unquantized copy of the weights.
config = AutoConfig.from_pretrained(model_id)
config.output_hidden_states = True  # GraphModel consumes the hidden states
config.use_cache = False  # the KV cache is not needed for training

# Load the quantized model. The dtype is set once, via `torch_dtype` below.
mistral_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    config=config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.padding_side = "left"  # Pad on the left side

# Tokenizer Setup: reuse the unknown token as the padding token
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Set chat template to OAI chatML (if needed).
# The model loaded above is bound to `mistral_model`; no variable called
# `model` exists at this point, so the original call raised NameError.
# NOTE(review): setup_chat_format may overwrite the pad token configured
# above — confirm against the installed trl version.
mistral_model, tokenizer = setup_chat_format(mistral_model, tokenizer)


# Pick the compute device once: CUDA when present, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"device: {device}")



# 3. Create PyTorch Dataset for Text-to-SQL
class TextToSQLDataset(Dataset):
    """Map-style dataset yielding tokenized question/answer pairs plus a graph.

    Each item is a dict with:
      * ``input_ids`` / ``attention_mask``: the tokenized ``question`` (1-D).
      * ``labels``: the tokenized ``answer`` (1-D).
      * ``edge_index``: a ``(2, num_edges)`` long tensor built from the spaCy
        dependency parse of the question.

    NOTE(review): edge indices are spaCy token positions while ``input_ids``
    are tokenizer (subword) positions; the two are not aligned. Edges that
    would point past the last input token are dropped so downstream graph
    layers never see an out-of-range node id.
    """

    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _dependency_edge_index(text, num_nodes):
        """Return the ``(2, E)`` dependency edge tensor for *text*."""
        doc = nlp(text)
        # One edge per token (token -> syntactic head); the final token is
        # skipped, as in the original implementation.
        edges = [
            [token.i, token.head.i]
            for token in doc
            if token.i < len(doc) - 1
        ]
        if not edges:
            # torch.tensor([]) would have shape (0,), not the (2, 0) layout
            # graph layers expect.
            return torch.empty((2, 0), dtype=torch.long)
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
        in_range = (edge_index < num_nodes).all(dim=0)
        return edge_index[:, in_range].contiguous()

    def __getitem__(self, idx):
        item = self.dataset[idx]
        #text = item['question'] + " " + item['context']
        text = item['question']
        target_text = item['answer']

        tokenized_input = self.tokenizer(text, truncation=True, padding=True, return_tensors="pt")
        tokenized_target = self.tokenizer(target_text, truncation=True, padding=True, return_tensors="pt")

        input_ids = tokenized_input['input_ids'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': tokenized_input['attention_mask'].flatten(),
            'labels': tokenized_target['input_ids'].flatten(),
            'edge_index': self._dependency_edge_index(text, input_ids.numel()),
        }

# Wrap every split in the TextToSQLDataset adapter
train_dataset, val_dataset, test_dataset = (
    TextToSQLDataset(split, tokenizer)
    for split in (train_dataset, val_dataset, test_dataset)
)


# 4. GAT (Graph Attention Network) Layer
class GATLayer(torch.nn.Module):
    """Thin wrapper around ``torch_geometric.nn.GAT``.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
        num_heads: Number of attention heads; head outputs are averaged
            (``concat=False``) so the output width stays ``out_features``.
        num_layers: Number of stacked GAT message-passing layers.
    """

    def __init__(self, in_features, out_features, num_heads=8, num_layers=1):
        super().__init__()
        # `num_layers` is a required argument of torch_geometric's GAT model;
        # omitting it raises TypeError at construction time.
        self.gat = GAT(
            in_features,
            out_features,
            num_layers=num_layers,
            heads=num_heads,
            concat=False,
        )

    def forward(self, x, edge_index):
        """Apply graph attention to node features *x* over *edge_index*."""
        return self.gat(x, edge_index)

# 5. Model integrating GAT
class GraphModel(torch.nn.Module):
    """Encoder -> GAT -> mean-pool -> linear head.

    Args:
        encoder: Module called with ``input_ids`` whose output exposes either
            ``last_hidden_state`` or ``hidden_states``.
        hidden_size: Width of the encoder's hidden states (default 4096).
        vocab_size: Output size of the head; defaults to ``len(tokenizer)`` so
            that tokens added after pre-training are covered too.

    NOTE(review): ``forward`` treats all tokens of the call as the nodes of a
    single graph and returns one pooled logit vector. It does not compute a
    loss, so a training loop that expects a "loss" entry must supply its own.
    """

    def __init__(self, encoder, hidden_size=4096, vocab_size=None):
        super().__init__()
        self.encoder = encoder
        self.gat = GATLayer(hidden_size, hidden_size)
        if vocab_size is None:
            vocab_size = len(tokenizer)
        self.lm_head = torch.nn.Linear(hidden_size, vocab_size)

    def pool(self, x, batch):
        """Mean over all nodes; *batch* is accepted but ignored."""
        return torch.mean(x, dim=0, keepdim=True)

    def forward(self, **inputs):
        """Return ``{"logits": tensor}`` for one tokenized example.

        Reads ``input_ids`` and ``edge_index`` (required) and
        ``attention_mask`` / ``batch`` (optional) from *inputs*.
        """
        encoder_device = next(self.encoder.parameters()).device
        head_weight = self.lm_head.weight  # reference dtype/device for the graph part

        input_ids = inputs['input_ids']
        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)  # add the batch dimension
        encoder_kwargs = {
            'input_ids': input_ids.to(encoder_device),
            'output_hidden_states': True,
        }
        attention_mask = inputs.get('attention_mask')
        if attention_mask is not None:
            if attention_mask.dim() == 1:
                attention_mask = attention_mask.unsqueeze(0)
            encoder_kwargs['attention_mask'] = attention_mask.to(encoder_device)

        outputs = self.encoder(**encoder_kwargs)
        # Causal-LM outputs carry no `last_hidden_state`; fall back to the
        # final entry of `hidden_states` in that case.
        hidden = getattr(outputs, 'last_hidden_state', None)
        if hidden is None:
            hidden = outputs.hidden_states[-1]

        # Flatten (batch, seq, hidden) to (nodes, hidden) and match the GAT /
        # head parameters' dtype and device instead of hopping through the CPU.
        nodes = hidden.reshape(-1, hidden.size(-1)).to(
            device=head_weight.device, dtype=head_weight.dtype
        )
        edge_index = inputs['edge_index'].to(head_weight.device)

        graph_out = self.gat(nodes, edge_index)
        batch = inputs.get('batch')
        if batch is None:
            batch = torch.zeros(
                graph_out.size(0), dtype=torch.long, device=graph_out.device
            )
        pooled = self.pool(graph_out, batch)
        return {"logits": self.lm_head(pooled)}

# 6. Model Setup, PEFT/LoRA
# LoRA adapters are attached to the attention projections of the Mistral
# encoder. The previous target, 'gat', names the custom graph module, which is
# not a layer type LoRA can wrap; the GAT and the output head are small enough
# to be trained directly as ordinary parameters of GraphModel.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Prepare the 4-bit model for training, then inject the adapters. The causal-LM
# PEFT wrapper is applied to the language model itself, not to GraphModel,
# which does not implement the causal-LM interface the wrapper relies on.
lora_encoder = get_peft_model(
    prepare_model_for_kbit_training(mistral_model),
    peft_config,
)
model = GraphModel(lora_encoder)

# 7. Evaluation Metric (Semantic Similarity)
sentence_transformer_model = SentenceTransformer('all-mpnet-base-v2')


def compute_metrics(eval_pred):
    """Return the mean cosine similarity between decoded predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            raw floating-point logits (reduced with argmax over the last
            axis) or already-selected token ids.

    Returns:
        ``{"semantic_similarity": float}``: the average similarity of each
        prediction with its own reference.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    if np.issubdtype(predictions.dtype, np.floating):
        # Logits cannot be decoded directly; pick the most likely token id.
        predictions = predictions.argmax(axis=-1)
    if predictions.ndim == 1:
        predictions = predictions[:, None]  # one token per example

    labels = np.asarray(labels)
    # Negative ids (e.g. -100 padding) are not valid tokens; map them to the
    # pad token so decoding with skip_special_tokens drops them.
    labels = np.where(labels < 0, tokenizer.pad_token_id, labels)
    if labels.ndim == 1:
        labels = labels[:, None]

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_references = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute embeddings for predictions and references
    prediction_embeddings = sentence_transformer_model.encode(decoded_predictions, convert_to_tensor=True)
    reference_embeddings = sentence_transformer_model.encode(decoded_references, convert_to_tensor=True)

    # The diagonal of the pairwise matrix holds prediction[i] vs reference[i].
    cosine_similarities = util.cos_sim(prediction_embeddings, reference_embeddings)
    similarities = torch.diag(cosine_similarities).cpu().numpy()

    # Return average similarity as the metric (plain float for logging)
    return {"semantic_similarity": float(np.mean(similarities))}


# 9. Training Arguments and Trainer
training_args = TrainingArguments(
    "graph-T2SQL",
    logging_dir="graph-T2SQL",
    per_device_train_batch_size=1,
    # Each example carries its own graph (edge_index), so evaluation must use
    # single-example batches just like training.
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    push_to_hub=False,
    dataloader_pin_memory=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_semantic_similarity",
    report_to="tensorboard",
    # The model's forward takes **inputs, so the Trainer cannot infer which
    # columns it needs; without this it would strip input_ids/edge_index.
    remove_unused_columns=False,
)


# Optimize only parameters that actually receive gradients (frozen / quantized
# base weights are excluded).
# NOTE(review): lr=0.01 is kept from the original; it is high for fine-tuning.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_parameters, lr=0.01, weight_decay=5e-4)

In [None]:
torch.cuda.empty_cache()
# `model` is the custom graph wrapper, which has neither a Hugging Face config
# object nor gradient-checkpointing support; both settings belong to the
# underlying Mistral language model.
mistral_model.config.use_cache = False
mistral_model.gradient_checkpointing_enable()  # enable gradient checkpointing

In [None]:
# 10. Create Trainer instance
from transformers import EarlyStoppingCallback


def collate_single_graph(batch):
    """Turn a one-example batch into keyword arguments for the model.

    Sequence tensors get a leading batch dimension; ``edge_index`` is passed
    through unchanged because it already describes the whole graph.

    Raises:
        ValueError: if *batch* does not contain exactly one example (graphs
            from different examples are not merged here).
    """
    if len(batch) != 1:
        raise ValueError(
            f"collate_single_graph expects a batch of exactly 1 example, got {len(batch)}"
        )
    example = batch[0]
    return {
        key: value if key == "edge_index" else value.unsqueeze(0)
        for key, value in example.items()
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # An identity collator would hand the model a list instead of a mapping.
    data_collator=collate_single_graph,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# TODO: check whether trl's SFTTrainer (with max_seq_length = 2048 for the
# model and dataset packing) can replace Trainer here. Reference notebook:
# https://github.com/frank-morales2020/MLxDL/blob/main/FineTuning_LLM_Mistral_7B_Instruct_v0_1_for_text_to_SQL_EVALDATA.ipynb

# 11. Train the model
trainer.train()

# 12. Evaluate on the test set
test_results = trainer.evaluate(test_dataset)
print(f'Test Semantic Similarity: {test_results["eval_semantic_similarity"]:.4f}')