## Trying the SQLCoder Model on a Local PC

In [None]:
! pip install -q -U bitsandbytes
! pip install -q -U git+https://github.com/huggingface/transformers.git 
! pip install -q -U git+https://github.com/huggingface/peft.git
! pip install -q -U git+https://github.com/huggingface/accelerate.git
! pip install -q datasets


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
# Report GPU visibility: first line is True when PyTorch can use a CUDA device,
# second line is the CUDA version of the installed PyTorch build (None if absent).
cuda_ready = torch.cuda.is_available()
cuda_build = torch.version.cuda
print(cuda_ready)
print(cuda_build)

False
None


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset
import torch
import os

### Load model

In [3]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint to fine-tune. The prompt template used later in this notebook is the
# Llama-3 SQLCoder format and the LoRA adapters target `q_proj`/`v_proj`, so the
# hub id must be the Llama-3 based SQLCoder checkpoint.
model_id = 'defog/llama-3-sqlcoder-8b'
# Local directory used as an on-disk cache for the downloaded weights
local_model_dir = './sqlcoder'

# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

def _has_local_weights(model_dir):
    """Return True when `model_dir` exists and contains saved model weight files."""
    if not os.path.isdir(model_dir):
        return False
    # 'model-' covers sharded safetensors files (model-0000x-of-0000y.safetensors)
    weight_prefixes = ('pytorch_model', 'model.safetensors', 'model-')
    return any(name.startswith(weight_prefixes) for name in os.listdir(model_dir))


def load_or_download_model():
    """
    Load the causal LM from `local_model_dir` if weights are cached there,
    otherwise download `model_id` from the Hugging Face Hub and cache it.

    Reads the module-level `model_id`, `local_model_dir` and `device`.

    Returns:
        The loaded model. If the primary attempt raises, the error is printed
        and the model is reloaded on CPU in float32; an exception raised by
        that fallback load propagates to the caller.
    """
    dtype = torch.float16 if device == 'cuda' else torch.float32

    try:
        # The offload folder is passed in both branches, so make sure it exists for both
        os.makedirs("./offload", exist_ok=True)

        if _has_local_weights(local_model_dir):
            print('Loading model from local directory...')
            return AutoModelForCausalLM.from_pretrained(
                local_model_dir,
                device_map="auto",
                torch_dtype=dtype,
                offload_folder="./offload"
            )

        print('Downloading model from Hugging Face Hub...')
        os.makedirs(local_model_dir, exist_ok=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=dtype,
            offload_folder="./offload",
            low_cpu_mem_usage=True
        )

        # Caching is best-effort: a failed write must not discard the model
        # that was just loaded successfully.
        try:
            model.save_pretrained(local_model_dir, safe_serialization=True)
        except Exception as save_error:
            print(f"Warning: could not save model to {local_model_dir}: {save_error}")
        return model

    except Exception as e:
        print(f"Error: {str(e)}")
        print("Falling back to basic CPU loading...")
        # Choose the source by the presence of weight files, not by the mere
        # existence of the directory: the directory is created before the
        # download starts and may therefore be empty.
        fallback_source = local_model_dir if _has_local_weights(local_model_dir) else model_id
        return AutoModelForCausalLM.from_pretrained(
            fallback_source,
            device_map="cpu",
            torch_dtype=torch.float32
        )

# Load model and tokenizer
model = load_or_download_model()

# Saving the model to local_model_dir does not write tokenizer files, so only
# read the tokenizer locally when its files are really there; otherwise fetch
# it from the Hub and cache it next to the weights.
if os.path.exists(os.path.join(local_model_dir, 'tokenizer_config.json')):
    tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    os.makedirs(local_model_dir, exist_ok=True)
    tokenizer.save_pretrained(local_model_dir)

# Verify
print("\nModel loaded successfully!")
print(f"Device: {next(model.parameters()).device}")
print(f"Memory footprint: {model.get_memory_footprint()/1024**3:.2f} GB")

Using device: cpu
Loading model from local directory...


Loading checkpoint shards: 100%|██████████| 7/7 [00:00<00:00, 699.78it/s]
Some parameters are on the meta device because they were offloaded to the cpu and disk.



Model loaded successfully!
Device: cpu
Memory footprint: 29.92 GB


### Fine-tuning: prepare the model for LoRA fine-tuning by enabling gradient checkpointing (to save VRAM) and freezing all of the base model weights

In [4]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the model (per this section's heading: done to save memory)
model.gradient_checkpointing_enable()
# Run PEFT's k-bit preparation helper and rebind `model` to its return value
# (per this section's heading, this is where the base weights get frozen).
# NOTE(review): the model above was loaded without a quantization config —
# confirm that the k-bit preparation step is intended for a non-quantized model.
model = prepare_model_for_kbit_training(model)

### QLoRA Setup

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing `named_parameters()` that yields
            (name, parameter) pairs, where each parameter provides
            `numel()` and a `requires_grad` flag.

    The percentage is reported as 0.0 for a model without parameters instead
    of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division: a parameterless model has nothing to train
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_percent}"
    )

In [6]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters: rank-8 adapters on the attention query/value projections
config = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'v_proj'],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
)

# Wrap the base model with the LoRA adapter, then report how many weights are trainable
model = get_peft_model(model, config)
print_trainable_parameters(model)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


trainable params: 3407872 || all params: 8033669120 || trainable%: 0.04241987003816259


### Loading Dataset

In [7]:
# Load the text-to-SQL dataset, keep only financial-services rows whose task is
# analytics/reporting, and retain just the three columns used for fine-tuning.
data = (
    load_dataset('gretelai/synthetic_text_to_sql')
    .filter(lambda row: row['domain'] == 'financial services' and row['sql_task_type'] == 'analytics and reporting')
    .select_columns(['sql_prompt', 'sql_context', 'sql'])
)

print(data)

DatasetDict({
    train: Dataset({
        features: ['sql_prompt', 'sql_context', 'sql'],
        num_rows: 1077
    })
    test: Dataset({
        features: ['sql_prompt', 'sql_context', 'sql'],
        num_rows: 66
    })
})


### Applying Prompt Templates to Dataset

In [8]:
# Prompt skeleton; `{{ .Prompt }}` is the user question (it appears twice) and
# `{{ .System }}` is the DDL context.
PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Generate a syntatically correct ClickHouse SQL query to answer this question: `{{ .Prompt }}`

DDL statements:
{{ .System }}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>The following SQL query best answers the question `{{ .Prompt }}`:
```sql
"""

def apply_prompt_template(user_prompt, system_prompt):
    """
    Fills PROMPT_TEMPLATE, which follows the SQLCoder prompt layout, refer to
    https://huggingface.co/defog/llama-3-sqlcoder-8b#prompt

    Args:
        user_prompt: natural-language question, substituted for every `{{ .Prompt }}`.
        system_prompt: DDL statements, substituted for `{{ .System }}`.

    Returns:
        The rendered prompt string.

    Substitution is single-pass: placeholder-looking text inside either
    argument is kept literally and is never expanded again.
    """
    # Split on the question placeholder first, fill the DDL placeholder only in
    # the template's own segments, then stitch the segments together with the
    # question. Inserted text is therefore never scanned for placeholders.
    segments = [
        segment.replace('{{ .System }}', system_prompt)
        for segment in PROMPT_TEMPLATE.split('{{ .Prompt }}')
    ]
    return user_prompt.join(segments)


def _render_prompt_column(row):
    """Return the row with `sql_prompt` replaced by the fully rendered prompt."""
    rendered = apply_prompt_template(row['sql_prompt'], row['sql_context'])
    return {**row, 'sql_prompt': rendered}


# Render every prompt, then drop the DDL column that is now embedded in it
data = data.map(_render_prompt_column).remove_columns('sql_context')
print(data)

DatasetDict({
    train: Dataset({
        features: ['sql_prompt', 'sql'],
        num_rows: 1077
    })
    test: Dataset({
        features: ['sql_prompt', 'sql'],
        num_rows: 66
    })
})


### Tokenizing Dataset
During training, all sequences in a batch must have the same length. Sequences shorter than `max_length` are padded to match it, while longer ones are truncated. These extra padding tokens increase computation time despite providing no useful information.

In [9]:
# The tokenizer needs a pad token for fixed-length batches; reuse EOS for it
tokenizer.pad_token = tokenizer.eos_token


def tokenize(samples, max_length=300):
    """
    Turn a batch of (prompt, SQL) pairs into fixed-length causal-LM examples.

    For a causal LM the labels must be the same token sequence as the inputs,
    so each example is encoded as `prompt tokens + SQL tokens + EOS`. The
    labels copy that sequence, with -100 (the value the loss ignores) on every
    prompt and padding position, so only the SQL answer and its EOS contribute
    to the loss.

    Args:
        samples: batch dict with the list-valued columns 'sql_prompt' and 'sql'.
        max_length: fixed length of every returned sequence. Longer sequences
            are cut on the right and shorter ones are right-padded. If a prompt
            alone fills the window, that row carries no supervised tokens.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'; each is a list
        holding one list of length `max_length` per input row.
    """
    prompt_batch = tokenizer(samples['sql_prompt'])['input_ids']
    # No special tokens here: the answer continues the prompt sequence
    answer_batch = tokenizer(samples['sql'], add_special_tokens=False)['input_ids']

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    input_ids, attention_mask, labels = [], [], []
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        # Close the answer with EOS so the model learns where the SQL ends
        answer_ids = answer_ids + [eos_id]

        sequence = (prompt_ids + answer_ids)[:max_length]
        targets = ([-100] * len(prompt_ids) + answer_ids)[:max_length]
        n_pad = max_length - len(sequence)

        input_ids.append(sequence + [pad_id] * n_pad)
        # Padding is tracked by position, not by token id, because the pad
        # token is the same id as the EOS that ends each answer.
        attention_mask.append([1] * len(sequence) + [0] * n_pad)
        labels.append(targets + [-100] * n_pad)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


data = data.map(tokenize, batched=True, remove_columns=['sql_prompt','sql'])
print(data)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1077
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 66
    })
})


### Training

In [18]:
! pip install --upgrade transformers accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-win_amd64.whl.metadata (5.1 kB)
Downloading bitsandbytes-0.45.4-py3-none-win_amd64.whl (75.4 MB)
   ---------------------------------------- 0.0/75.4 MB ? eta -:--:--
   ---------------------------------------- 0.8/75.4 MB 4.8 MB/s eta 0:00:16
   - -------------------------------------- 2.1/75.4 MB 5.6 MB/s eta 0:00:14
   - -------------------------------------- 3.1/75.4 MB 5.6 MB/s eta 0:00:13
   -- ------------------------------------- 4.5/75.4 MB 6.0 MB/s eta 0:00:12
   -- ------------------------------------- 5.5/75.4 MB 5.5 MB/s eta 0:00:13
   ---- ----------------------------------- 7.6/75.4 MB 6.3 MB/s eta 0:00:11
   ----- ---------------------------------- 9.7/75.4 MB 6.9 MB/s eta 0:00:10
   ------ --------------------------------- 12.3/75.4 MB 7.7 MB/s eta 0:00:09
   ------- -------------------------------- 14.7/75.4 MB 8.1 MB/s eta 0:00:08
   --------- ------------------------------ 17.0/75.4 MB 8.5 MB/s eta 0


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Faced challenges while training locally, so training was moved to Google Colab

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)

# 1. Quantization settings: 8-bit weights (author's note: more stable than
#    4-bit on Windows), with `lm_head` excluded from the int8 conversion
quant_config = BitsAndBytesConfig(
    llm_int8_skip_modules=["lm_head"],
    load_in_8bit=True,
)

# 2. Switch to a smaller checkpoint (author's note: Phi-2 is more memory efficient)
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse EOS as the padding token
tokenizer.pad_token = tokenizer.eos_token

# 3. Load the model with the quantization config and memory-saving options
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# 4. Configure training with RAM conservation
#    Batch size 1 with 4 accumulation steps gives 4 sequences per optimizer step.
#    Checkpoints are written to ./phi2-finetuned every 100 steps; external
#    experiment reporting is switched off.
training_args = TrainingArguments(
    output_dir="./phi2-finetuned",
    per_device_train_batch_size=1,  # Critical for 16GB RAM
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,    # Saves memory
    optim="adafactor",              # More stable than paged_adamw_8bit
    learning_rate=2e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    report_to="none",
    fp16=True  # Use mixed precision; NOTE(review): presumably needs a CUDA device — the CUDA check earlier in this notebook printed False, confirm on the target runtime
)

# 5. Create data collator (mlm=False: causal-LM batches, no masked-LM objective)
# NOTE(review): with mlm=False this collator presumably derives labels from
# input_ids — confirm whether the precomputed `labels` column is meant to be used.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 6. Initialize trainer
# NOTE(review): `train_dataset` is not assigned anywhere in the visible notebook;
# the tokenized splits live in `data` (e.g. data['train']). Those splits were
# tokenized with the tokenizer loaded earlier, while this cell rebinds
# `tokenizer` to phi-2's — the text must be re-tokenized before it is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# 7. Start training with memory guard
try:
    print("Starting training...")
    trainer.train()
except Exception as e:
    # Deliberate best-effort handling for interactive use: report the failure,
    # free memory, and leave the kernel usable.
    print(f"Training failed: {e}")
    print("Emergency cleanup...")
    # Drop the references first and collect garbage; the CUDA cache can only
    # hand back blocks that no live tensor is still using.
    del model, trainer
    import gc; gc.collect()
    torch.cuda.empty_cache()
    print("Try reducing batch size further or sequence length")

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`