# Fine-tuning an Image Caption model

In [1]:
# Check GPU is detected by CUDA
import torch
print(torch.cuda.is_available())


True


In [2]:
# Check if CPU is supported by IPEX
#import intel_extension_for_pytorch as ipex
#
#print(ipex.cpu.runtime.is_runtime_ext_enabled())


## Download pretrained model

In [3]:
from transformers import BlipProcessor, BlipForConditionalGeneration, default_data_collator, get_linear_schedule_with_warmup
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model, prepare_model_for_int8_training

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
checkpoint = "Salesforce/blip-image-captioning-base"
model_name = checkpoint.split("/")[1]

config = LoraConfig(r=32, lora_alpha=64, target_modules=["qkv"], lora_dropout=0.05, bias="none")

processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint, load_in_8bit=True)#.to(device)
#print(model)

print(model.get_memory_footprint())

model.save_pretrained(f"{model_name}-8bit-pre-peft")
processor.save_pretrained(f"{model_name}-8bit-pre-peft")


2023-11-27 16:55:46.365531: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-27 16:55:46.387230: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-27 16:55:46.387247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-27 16:55:46.387823: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-27 16:55:46.391527: I tensorflow/core/platform/cpu_feature_guar

296122608


## PEFT Prep

In [4]:
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)
print(model.print_trainable_parameters())

print(model.get_memory_footprint())


trainable params: 1,179,648 || all params: 248,624,248 || trainable%: 0.4744702133799918
None
398189024




In [5]:
model.save_pretrained(f"{model_name}-quantized")
processor.save_pretrained(f"{model_name}-quantized")


## Load data

In [6]:
import pandas as pd

df_train = pd.read_csv('ids_train.csv')
df_test = pd.read_csv('ids_test.csv')


In [7]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /home/christian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/christian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


In [9]:
from nltk.tokenize import word_tokenize

# Remove Stopwords
# "Why?" - this shouldn't be necessary but the model is pretty much only
# outputting stopwords; we're removing them to try to force it to not
# do this. This may break semantic understanding slightly but hopefully
# will give a better result.
def remove_stopwords(text):
  tokens = word_tokenize(text)
  tokens = [tok for tok in tokens if tok.lower() not in stop_words]
  return " ".join(tokens)


In [10]:
from datasets import Dataset, Image

def prep_data(df):
  files = [f"media/{media_id}.jpg" for media_id in df['media_id'].to_list()]
  descriptions = [remove_stopwords(text) for text in df['description'].to_list()]

  return Dataset.from_dict({ "image": files, "text": descriptions }).cast_column("image", Image())


In [11]:
ds_train = prep_data(df_train)
ds_test = prep_data(df_test)

ds_train[0]


{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x747 at 0x7F01C11C51E0>,
 'text': 'Landscape two horsemen enter composition left foreground , one falcon standing left hand . Falcons hunt birds background . Framing line bottom .'}

In [12]:
del df_train
del df_test


In [13]:
#from textwrap import wrap
#import matplotlib.pyplot as plt
#import numpy as np
#
#def plot_images(images, captions):
#    plt.figure(figsize=(20, 20))
#    for i in range(len(images)):
#        ax = plt.subplot(1, len(images), i + 1)
#        caption = captions[i]
#        caption = "\n".join(wrap(caption, 12))
#        plt.title(caption)
#        plt.imshow(images[i])
#        plt.axis("off")
#
#sample_images_to_visualize = [np.array(ds_train[i]["image"]) for i in range(5)]
#sample_captions = [ds_train[i]["text"] for i in range(5)]
#plot_images(sample_images_to_visualize, sample_captions)


In [14]:
# https://huggingface.co/docs/transformers/tasks/image_captioning
def transforms(example_batch):
    images = [x.convert("RGB").resize((100,100)) for x in example_batch["image"]]
    captions = [x for x in example_batch["text"]]
    inputs = processor(images=images, text=captions, padding="max_length")
    inputs.update({"labels": inputs["input_ids"]})
    return inputs


ds_train.set_transform(transforms)
ds_test.set_transform(transforms)


## Train

In [15]:
import evaluate
import torch

#rouge = evaluate.load('rouge')
#wer = evaluate.load('wer')
bleu = evaluate.load('bleu')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    #score = rouge.compute(predictions=decoded_predictions, references=decoded_labels)
    #score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
    score = bleu.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"score": score}


In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Model saving
    output_dir=f"{model_name}-wip",
    push_to_hub=False,
    # Hardware support
    #fp16=True,
    #use_cpu=True,
    #use_ipex=True,
    # Basics
    num_train_epochs=5,
    learning_rate=5e-5,
    label_names=["labels"],
    # Other
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    save_total_limit=3,
    evaluation_strategy="no",
    #evaluation_strategy="steps",
    #eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    remove_unused_columns=False,
    #load_best_model_at_end=True,

)


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    compute_metrics=compute_metrics,
)


In [18]:
import gc

gc.collect()

#torch.cuda.empty_cache()


38

In [19]:
trainer.train()




Step,Training Loss
50,13.0478
100,12.6403
150,9.9986
200,9.2945
250,9.0957
300,8.9983
350,8.9212
400,8.8766
450,8.8576
500,8.8199




TrainOutput(global_step=11330, training_loss=8.54155695108008, metrics={'train_runtime': 16657.1441, 'train_samples_per_second': 5.44, 'train_steps_per_second': 0.68, 'total_flos': 5.406712717285786e+19, 'train_loss': 8.54155695108008, 'epoch': 5.0})

In [20]:
model.save_pretrained(f"{model_name}-finetuned")
processor.save_pretrained(f"{model_name}-finetuned")
