<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/fine_tuning_phi4_vision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers bitsandbytes -q
!pip install  trl peft datasets -q
!pip install accelerate -q
!pip install colab-env --quiet

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the Phi-4 model and tokenizer with 4-bit quantization
model_name = "microsoft/phi-4"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Explicitly set low_cpu_mem_usage=True (or False if you don't want it)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True # Explicitly setting low_cpu_mem_usage to true.
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the pad_token_id for the model explicitly to avoid the warning
model.generation_config.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

tokenizer.pad_token = tokenizer.eos_token
model.pad_token_id = model.config.eos_token_id

low_cpu_mem_usage=False  # Explicitly set to False

def generate_text(prompt, max_length=512):
  """
  Generates text using the 4-bit quantized Phi-4 model.

  Args:
    prompt: The input text to start the generation.
    max_length: The maximum length of the generated text.

  Returns:
    The generated text.
  """

  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = model.generate(**inputs, max_length=max_length)
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated_text

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

In [7]:
import warnings

warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import bitsandbytes as bnb
from trl import SFTTrainer


# Define LoRA configuration
# LoRA config based on gemini session 22/06/2024
lora_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.05,
    r=128,
    bias="none",
    target_modules=["qkv_proj", "o_proj"],  # Try targeting these linear layers within the attention mechanism
    task_type="CAUSAL_LM",
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load the FGVC Aircraft dataset (only the 'train' split)
dataset = load_dataset("Multimodal-Fatima/FGVC_Aircraft_train", split="train")

# Split the dataset into train and test sets
train_test_dataset = dataset.train_test_split(test_size=0.2)  # Use 80% for training, 20% for testing

# Access the train and test splits
train_dataset = train_test_dataset['train']
test_dataset = train_test_dataset['test']  # Access the 'test' split from the dictionary

#def generate_captions(images, batch_size=4):  # Adjust batch_size as needed
#    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", batch_size=batch_size)
#    return captioner(images)

#def preprocess_function(examples):
#    examples['text'] = generate_captions(examples['image'])
#S    return examples



In [12]:
# Preprocessing (using an image captioning model and tokenizer)
def preprocess_function(examples):
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    examples['text'] = []
    examples['input_ids'] = []
    examples['attention_mask'] = []
    max_length = 64 # set your desired max_length
    for img in examples['image']:
        try:
            caption = captioner(img)[0]['generated_text']
            examples['text'].append(caption)
            inputs = tokenizer(
                caption,
                return_tensors="pt",
                padding="max_length",  # Pad to max_length
                truncation=True,       # Truncate to max_length
                max_length=max_length   # Set max_length
            )
            examples['input_ids'].append(inputs['input_ids'][0])
            examples['attention_mask'].append(inputs['attention_mask'][0])
        except (IndexError, KeyError, TypeError) as e:
            print(f"Error processing image: {e}")
            print(f"Captioner output: {captioner(img)}")
            examples['text'].append("")
            # Handle empty captions by adding padding tokens to reach max_length
            inputs = tokenizer(
                "",
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
            examples['input_ids'].append(inputs['input_ids'][0])
            examples['attention_mask'].append(inputs['attention_mask'][0])
    return examples

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2667 [00:00<?, ? examples/s]

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Map:   0%|          | 0/667 [00:00<?, ? examples/s]

Device set to use cuda:0


In [None]:
training_args = TrainingArguments(
    output_dir="./lora_fine_tuned_phi-4_quantized",
    per_device_train_batch_size=4,
    learning_rate=1e-3,
    num_train_epochs=3,
    fp16=True,
    push_to_hub=True,
    hub_model_id="frankmorales2020/lora_fine_tuned_phi-4_quantized_vision",
    evaluation_strategy="epoch",
    run_name="my_fine_tuned_phi-4_run", # Set a specific run name
    report_to="none",  # Disable all integrations with reporting libraries
    remove_unused_columns=True  # Set to True to remove unused columns, including 'image'
)


trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    #peft_config=peft_config,
    #max_seq_length=max_seq_length,
    tokenizer=tokenizer,
)


# start training
trainer.train()

  trainer = SFTTrainer(


Epoch,Training Loss,Validation Loss
1,2.6251,2.031918
2,1.8613,1.754392
