<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/fine_tuning_phi4_vision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers bitsandbytes -q
!pip install  trl peft datasets -q
!pip install accelerate -q
!pip install colab-env --quiet

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the Phi-4 model and tokenizer with 4-bit quantization
model_name = "microsoft/phi-4"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Explicitly set low_cpu_mem_usage=True (or False if you don't want it)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True # Explicitly setting low_cpu_mem_usage to true.
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the pad_token_id for the model explicitly to avoid the warning
model.generation_config.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

tokenizer.pad_token = tokenizer.eos_token
model.pad_token_id = model.config.eos_token_id

low_cpu_mem_usage=False  # Explicitly set to False

def generate_text(prompt, max_length=512):
  """
  Generates text using the 4-bit quantized Phi-4 model.

  Args:
    prompt: The input text to start the generation.
    max_length: The maximum length of the generated text.

  Returns:
    The generated text.
  """

  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = model.generate(**inputs, max_length=max_length)
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated_text

In [4]:
import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

Mounted at /content/gdrive


In [5]:
import warnings

warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import bitsandbytes as bnb
from trl import SFTTrainer


# Define LoRA configuration
# LoRA config based on gemini session 22/06/2024
lora_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.05,
    r=128,
    bias="none",
    target_modules=["qkv_proj", "o_proj"],  # Try targeting these linear layers within the attention mechanism
    task_type="CAUSAL_LM",
)


# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load the FGVC Aircraft dataset (only the 'train' split)
dataset = load_dataset("Multimodal-Fatima/FGVC_Aircraft_train", split="train")

# Split the dataset into train and test sets
train_test_dataset = dataset.train_test_split(test_size=0.2)  # Use 80% for training, 20% for testing

# Access the train and test splits
train_dataset = train_test_dataset['train']
test_dataset = train_test_dataset['test']  # Access the 'test' split from the dictionary

#def generate_captions(images, batch_size=4):  # Adjust batch_size as needed
#    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", batch_size=batch_size)
#    return captioner(images)

#def preprocess_function(examples):
#    examples['text'] = generate_captions(examples['image'])
#S    return examples



In [None]:
# Preprocessing (using an image captioning model and tokenizer)
def preprocess_function(examples):
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    examples['text'] = []
    examples['input_ids'] = []
    examples['attention_mask'] = []
    max_length = 64 # set your desired max_length
    for img in examples['image']:
        try:
            caption = captioner(img)[0]['generated_text']
            examples['text'].append(caption)
            inputs = tokenizer(
                caption,
                return_tensors="pt",
                padding="max_length",  # Pad to max_length
                truncation=True,       # Truncate to max_length
                max_length=max_length   # Set max_length
            )
            examples['input_ids'].append(inputs['input_ids'][0])
            examples['attention_mask'].append(inputs['attention_mask'][0])
        except (IndexError, KeyError, TypeError) as e:
            print(f"Error processing image: {e}")
            print(f"Captioner output: {captioner(img)}")
            examples['text'].append("")
            # Handle empty captions by adding padding tokens to reach max_length
            inputs = tokenizer(
                "",
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
            examples['input_ids'].append(inputs['input_ids'][0])
            examples['attention_mask'].append(inputs['attention_mask'][0])
    return examples

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

In [9]:
training_args = TrainingArguments(
    output_dir="./lora_fine_tuned_phi-4_quantized",
    per_device_train_batch_size=4,
    learning_rate=1e-3,
    num_train_epochs=3,
    fp16=True,
    push_to_hub=True,
    hub_model_id="frankmorales2020/lora_fine_tuned_phi-4_quantized_vision",
    run_name="my_fine_tuned_phi-4_run", # Set a specific run name
    report_to="none",  # Disable all integrations with reporting libraries
    remove_unused_columns=True,  # Set to True to remove unused columns, including 'image'
    load_best_model_at_end=True,   #NEW FROM HERE
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    #load_best_model_at_end=True,

)



trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    #peft_config=peft_config,
    #max_seq_length=max_seq_length,
    tokenizer=tokenizer, #processing_class
)


# start training
trainer.train()

  trainer = SFTTrainer(


Step,Training Loss,Validation Loss
500,2.5659,2.029854
1000,2.136,2.334942
1500,1.7284,1.854036
2000,1.4827,1.528747


TrainOutput(global_step=2001, training_loss=1.9777080506935292, metrics={'train_runtime': 934.0091, 'train_samples_per_second': 8.566, 'train_steps_per_second': 2.142, 'total_flos': 4.382347075190784e+16, 'train_loss': 1.9777080506935292, 'epoch': 3.0})

## EVALUATION

Restart the engine

In [10]:
!pip install transformers bitsandbytes -q
!pip install  trl peft datasets -q
!pip install accelerate -q
!pip install colab-env --quiet

In [11]:
!pip install evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import evaluate
from transformers import AutoTokenizer, pipeline
from peft import PeftModel
import numpy as np
from datasets import load_dataset

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")

# Load the base model
# Explicitly set low_cpu_mem_usage=True
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-4",
    quantization_config=bnb_config,
    low_cpu_mem_usage=True
)

# Load the locally fine-tuned model with LoRA adapter, applying it to the base model
model = PeftModel.from_pretrained(
    base_model,  # Pass the base model instance
    "/content/lora_fine_tuned_phi-4_quantized",  # Path to your locally saved model directory
    device_map={"": 0},
)

# Load the FGVC Aircraft dataset (only the 'train' split)
dataset = load_dataset("Multimodal-Fatima/FGVC_Aircraft_train", split="train")

# Split the dataset into train and test sets
train_test_dataset = dataset.train_test_split(test_size=0.2)

# Access the test split
test_dataset = train_test_dataset['test']

# Preprocessing (using an image captioning model and tokenizer) - Same as in fine_tuning.py
def preprocess_function(examples):
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    examples['text'] = []
    examples['input_ids'] = []
    examples['attention_mask'] = []
    max_length = 64  # Set your desired max_length
    for img in examples['image']:
        try:
            caption = captioner(img)[0]['generated_text']
            examples['text'].append(caption)
            inputs = tokenizer(
                caption,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
            examples['input_ids'].append(inputs['input_ids'][0])
            examples['attention_mask'].append(inputs['attention_mask'][0])
        except (IndexError, KeyError, TypeError) as e:
            print(f"Error processing image: {e}")
            print(f"Captioner output: {captioner(img)}")
            examples['text'].append("")
            inputs = tokenizer(
                "",
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
            examples['input_ids'].append(inputs['input_ids'][0])
            examples['attention_mask'].append(inputs['attention_mask'][0])
    return examples

test_dataset = test_dataset.map(preprocess_function, batched=True)

# Define the evaluation metric (e.g., BLEU)
metric = evaluate.load("bleu")

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Map:   0%|          | 0/667 [00:00<?, ? examples/s]

Device set to use cuda:0


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [13]:
# Set the pad_token_id for the model explicitly to avoid the warning
model.generation_config.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

tokenizer.pad_token = tokenizer.eos_token
model.pad_token_id = model.config.eos_token_id



# Generate predictions
def generate_predictions(examples):
    try:
        inputs = tokenizer(examples['text'], return_tensors="pt", padding=True, truncation=True).to("cuda:0")
        outputs = model.generate(**inputs)
        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print('\n')
        print("Generated predictions:", predictions)  # Print the generated predictions
        return {'predictions': predictions}
    except Exception as e:
        print(f"Error generating predictions: {e}")
        # Return a list of empty strings with the correct length instead of an empty list
        return {'predictions': [""] * len(examples['text'])}

predictions = test_dataset.map(generate_predictions, batched=True)

# Compute the evaluation metric (using input_ids and attention_mask)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result

# Evaluate the model (without using the Trainer)
try:
    print('\n')
    print("Computing BLEU score...")
    results = metric.compute(predictions=predictions['predictions'], references=[[example['text']] for example in test_dataset])
    print("BLEU score:", results)  # Print the BLEU score
except Exception as e:
    print(f"Error computing BLEU score: {e}")

Map:   0%|          | 0/667 [00:00<?, ? examples/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.




Generated predictions: ['two airplanes are parked on the tarmacl the backgroundcl field a runway the tarmacl background parked the runway at the airport airport airport airport', 'a red and white jet on a runway at an airport airport airport airport backgroundcl parked the airport airport airport airport airport airport airport backgroundclcl', 'a white plane on the ground with an airport airport airport airport background on the airport runway at an airport airport airport airport', 'a fighter jet flying through the sky in an sky at an airport sky at an airport sky at an sky at an airport sky at an', 'a small white plane parked on a runway a field at the background field at the airport airport airport airport airport airport backgroundclcl field at the', 'a blue and white plane on the runway of an airport airport airport airport backgroundcl parked a runway an airport airport airport airport background', 'a large white airplane on a runway at an airport airport airport airport backg