# Complete Project Setup

In [4]:
from PIL import Image
import requests
from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig
import torch
import os
import re
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import numpy as np
from tqdm import tqdm
import io
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import datasets
from sklearn.model_selection import train_test_split


## Import Model

In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor bundles the image preprocessing and the tokenizer used below.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Request 8-bit weights through an explicit BitsAndBytesConfig: passing
# `load_in_8bit=True` straight to `from_pretrained` is the deprecated spelling.
model = Blip2ForConditionalGeneration.from_pretrained(
    "ybelkada/blip2-opt-2.7b-fp16-sharded",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

## Process Dataset

In [6]:
# Which prompt table to fine-tune on: 'sd' or 'coco'.
dataset_name = "sd"
# Fraction of rows handed to train_test_split as the held-out test portion.
split_size = 0.2

In [7]:
# Note: For dataset download instructions, please refer to the README.md file

# Parquet file backing each supported value of `dataset_name`.
PARQUET_FILES = {
    'sd': 'sd_ext_prompts.parquet',
    'coco': 'cocoprompts.parquet',
}
if dataset_name not in PARQUET_FILES:
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(PARQUET_FILES)}"
    )

# Keep the DataFrame under its own name. Binding it to `datasets` would shadow the
# `datasets` module imported at the top of the notebook, which the next cell still
# needs for `datasets.Dataset.from_pandas`.
prompts_df = pd.read_parquet(PARQUET_FILES[dataset_name])

train_split, test_split = train_test_split(prompts_df, test_size=split_size, random_state=42)
print(f"Train dataset size: {len(train_split)}")
print(f"Validation dataset size: {len(test_split)}")

In [8]:
# Convert the pandas splits to Hugging Face `Dataset` objects.
# The class is imported directly and under an alias: `Dataset` already names
# torch's Dataset class in this notebook, and a direct import keeps working even
# if the name `datasets` has been rebound to something other than the module.
from datasets import Dataset as HFDataset

# preserve_index=False: the shuffled split index would otherwise be carried along
# as an extra `__index_level_0__` column in every record.
train_dataset = HFDataset.from_pandas(train_split, preserve_index=False)
test_dataset = HFDataset.from_pandas(test_split, preserve_index=False)
  

In [9]:
# Inspect the first training record to check which fields each example carries.
train_dataset[0]

## Training Configuration

In [10]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import pandas as pd

class ImageCaptioningDataset(Dataset):
    """Torch dataset pairing a preprocessed image with its raw caption text.

    The caption is returned untokenized under the ``"text"`` key so that it can
    be tokenized and padded per batch later on.

    Args:
        dataset: Indexable collection of records supporting ``len()`` and
            ``[idx]``; each record provides ``image_path`` and the text column.
        processor: Callable invoked as
            ``processor(images=..., padding="max_length", return_tensors="pt")``
            that returns a mapping of tensors with a leading batch dimension.
        text_column: Optional name of the caption field. When omitted it is
            looked up from the module-level ``dataset_name`` via ``TEXT_COLUMNS``.

    Raises:
        ValueError: from ``__getitem__`` when no text column can be resolved.
    """

    # Caption field used by each supported `dataset_name`.
    TEXT_COLUMNS = {"sd": "Prompt", "coco": "caption"}

    def __init__(self, dataset, processor, text_column=None):
        self.dataset = dataset
        self.processor = processor
        self.text_column = text_column

    def __len__(self):
        return len(self.dataset)

    def _resolve_text_column(self):
        """Return the caption field name, preferring the explicit override."""
        if self.text_column is not None:
            return self.text_column
        try:
            return self.TEXT_COLUMNS[dataset_name]
        except KeyError:
            # Fail loudly here instead of returning an item without "text".
            raise ValueError(
                f"No text column known for dataset_name {dataset_name!r}; "
                f"pass text_column= or use one of {sorted(self.TEXT_COLUMNS)}"
            ) from None

    def __getitem__(self, idx):
        item = self.dataset[idx]
        text_column = self._resolve_text_column()

        # Force three channels (same as generate_cap does) and release the file
        # handle as soon as the pixel data has been copied by convert().
        with Image.open(item['image_path']) as img:
            image = img.convert("RGB")

        encoding = self.processor(images=image, padding="max_length", return_tensors="pt")
        # Remove only the batch dimension; a bare squeeze() would also drop any
        # other dimension that happens to have size 1.
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["text"] = item[text_column]
        return encoding

In [11]:
def collate_fn(batch):
    """Merge per-example dicts into one batch dict.

    Tensor fields are stacked along a new leading dimension. The raw ``"text"``
    field is tokenized for the whole batch with the global ``processor``'s
    tokenizer (padded to the longest entry) and contributes ``input_ids`` and
    ``attention_mask`` instead of a ``"text"`` entry.
    """
    collated = {}
    for field in batch[0]:
        if field == "text":
            tokenized = processor.tokenizer(
                [sample["text"] for sample in batch], padding=True, return_tensors="pt"
            )
            collated["input_ids"] = tokenized["input_ids"]
            collated["attention_mask"] = tokenized["attention_mask"]
            continue
        collated[field] = torch.stack([sample[field] for sample in batch])
    return collated

In [12]:
# LoRA adapter settings: rank 64 with matching alpha, applied to the modules
# named q_proj / k_proj / v_proj / o_proj.
config = LoraConfig(
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=64,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters and report the trainable parameter count.
# Note: this rebinds `model`, so re-running the cell wraps the model again.
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [13]:
# Training loader over the full training split.
train_ft_dataset = ImageCaptioningDataset(train_dataset, processor)
train_dataloader = DataLoader(
    train_ft_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)

# Per-epoch validation runs on `subset_size` records (10% of the test split).
subset_size = int(0.1 * len(test_dataset))
test_ft_dataset = ImageCaptioningDataset(test_dataset.take(subset_size), processor)
test_dataloader = DataLoader(
    test_ft_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)

## Start Training and Save Best Model

In [14]:
def generate_captions_batch(batch, model, processor):
    """Generate one caption per image in ``batch``.

    Only ``batch['pixel_values']`` is read. Generation is capped at
    ``max_length=77`` and the resulting ids are decoded with special tokens
    stripped.

    Returns:
        The decoded captions, as returned by ``processor.batch_decode``.
    """
    return processor.batch_decode(
        model.generate(batch['pixel_values'], max_length=77),
        skip_special_tokens=True,
    )

In [15]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import datetime

# Sentence-embedding model that cal_cossim uses to compare generated captions with references.
judge_model = SentenceTransformer('all-MiniLM-L6-v2')


def cal_cossim(reference:list,generated:list,judge_model)->dict:
    """Score each generated sentence against its reference by embedding cosine similarity.

    Args:
        reference: Reference sentences.
        generated: Generated sentences, aligned index-by-index with ``reference``.
        judge_model: Object whose ``encode(sentences, convert_to_tensor=True)``
            returns one embedding row per input sentence.

    Returns:
        ``{"mean_score": <mean of scores>, "scores": [<float per pair>]}``.
        For empty input ``scores`` is ``[]`` and ``mean_score`` is NaN.

    Raises:
        ValueError: if the two lists differ in length.
    """
    if len(reference) != len(generated):
        raise ValueError(
            f"reference and generated must have the same length, "
            f"got {len(reference)} and {len(generated)}"
        )
    if len(reference) == 0:
        # Nothing to score; avoid np.mean's empty-slice warning.
        return {"mean_score": float("nan"), "scores": []}

    # Encode each side in a single call instead of once per sentence.
    reference_emb = judge_model.encode(list(reference), convert_to_tensor=True)
    generated_emb = judge_model.encode(list(generated), convert_to_tensor=True)

    # Row-wise cosine similarity: one score per (reference, generated) pair.
    scores = torch.nn.functional.cosine_similarity(reference_emb, generated_emb, dim=1).tolist()

    mean_score = np.mean(scores)

    return {"mean_score":mean_score,"scores":scores}


In [16]:
def train_and_evaluate(model,processor, train_dataloader, test_dataloader, optimizer, epochs=10):
    """Fine-tune ``model`` and keep the checkpoint with the best validation similarity.

    Each epoch runs one pass over ``train_dataloader``, then generates captions
    for ``test_dataloader`` and scores them against the decoded reference text
    with ``cal_cossim`` and the global ``judge_model``. Whenever the mean score
    matches or beats the best seen so far, the model is saved together with a
    ``cosine_similarity.txt`` file recording the epoch and score.

    Relies on the module-level names ``device``, ``judge_model``,
    ``dataset_name``, ``train_dataset`` and ``split_size``.

    Args:
        model: Model to train; called with ``input_ids``, ``pixel_values``,
            ``labels`` and ``attention_mask`` and expected to expose ``.loss``.
        processor: Used to decode reference and generated token ids.
        train_dataloader: Yields batches with ``input_ids``, ``pixel_values``
            and ``attention_mask``.
        test_dataloader: Yields batches with ``pixel_values`` and ``input_ids``.
        optimizer: Optimizer stepping the model's parameters.
        epochs: Number of training epochs.

    Returns:
        Dict with ``"bst_model_path"`` (``None`` if no checkpoint was saved),
        ``"train_avg_loss/e"`` and ``"val_avg_cos/e"`` (one entry per epoch).
    """
    # Cosine similarity can be negative, so start below every reachable score;
    # starting at 0 would skip saving (and leave model_path unbound) in that case.
    best_val_cos = float("-inf")
    model_path = None
    train_loss = []
    val_cos = []
    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        print(f"Epoch {epoch+1}")

        for batch in tqdm(train_dataloader,total=len(train_dataloader),desc="Train Progress"):
            input_ids = batch.pop("input_ids").to(device)
            pixel_values = batch.pop("pixel_values").to(device, torch.float16)
            attention_mask = batch.pop("attention_mask").to(device)

            # Padding positions must not contribute to the language-modeling
            # loss; label -100 is the value the loss ignores.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            labels=labels,
                            attention_mask=attention_mask)

            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        train_loss.append(avg_train_loss)
        print(f"Epoch {epoch+1} Train Loss: {avg_train_loss}")

        model.eval()

        with torch.no_grad():
            total_genval_captions = []
            total_ref_captions = []
            for batch in tqdm(test_dataloader,total=len(test_dataloader),desc="Evaluation Progress"):

                captions = generate_captions_batch(batch, model, processor)
                # The reference text is recovered by decoding the tokenized captions.
                refs = processor.batch_decode(batch['input_ids'], skip_special_tokens=True)
                total_genval_captions.extend(captions)
                total_ref_captions.extend(refs)

            cosscore_val = cal_cossim(reference=total_ref_captions,generated=total_genval_captions,judge_model=judge_model)

        val_cos.append(cosscore_val['mean_score'])
        print(f"Validation Cosine-similarity: {cosscore_val['mean_score']}")

        if cosscore_val['mean_score'] >= best_val_cos:
            best_val_cos = cosscore_val['mean_score']
            print(f"Saving new best model with val_cos: {best_val_cos}")
            model_path = f"blip2_ft_{dataset_name}/{now}/blip2_{len(train_dataset)}_{split_size}_{epochs}/best"
            model.save_pretrained(model_path)
            # `with` guarantees the file is flushed and closed; mode 'w' means
            # the file only ever holds the current best epoch.
            with open(model_path+'/cosine_similarity.txt', 'w') as similarity_file:
                similarity_file.write('Epoch, Cosine Similarity\n')  # write the header row
                similarity_file.write(f'{epoch+1}, {best_val_cos}\n')

    return {"bst_model_path":model_path,"train_avg_loss/e":train_loss,"val_avg_cos/e":val_cos}



In [18]:
# Number of passes over the training data.
num_epoches = 10
# Adam over the model's parameters with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Run the fine-tuning loop; the result holds the best checkpoint path and per-epoch metrics.
best_model_output = train_and_evaluate(
    model=model,
    processor=processor,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    epochs=num_epoches,
)

In [19]:
# To continue from the best checkpoint instead of the last epoch, load it first:
# model = Blip2ForConditionalGeneration.from_pretrained(best_model_output['bst_model_path'], device_map="auto", load_in_8bit=True)
# processor = AutoProcessor.from_pretrained(best_model_output['bst_model_path'])

# Save the model as it stands after the last epoch, with the processor next to it.
final_model_dir = f'blip2_di_{dataset_name}/final_model/blip2_{len(train_dataset)}_{split_size}_{num_epoches}'
model.save_pretrained(final_model_dir, safe_serialization=False)
processor.save_pretrained(final_model_dir)

# Generate initial results with newly trained model

In [20]:
import requests
from PIL import Image
from tqdm import tqdm
import io

def generate_cap(df,u_model,u_processor)->list:
    """Generate one caption per row of ``df``.

    Args:
        df: DataFrame with an ``image_path`` column of image files to caption.
        u_model: Model to caption with; switched to eval mode before use.
        u_processor: Processor used both to preprocess each image and to decode
            the generated ids.

    Returns:
        List of stripped caption strings, in the row order of ``df``.
    """
    u_model.eval()
    generation = []
    for image_path in tqdm(df['image_path'], total=len(df)):

        # convert() copies the pixel data, so the file can be closed right away.
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        inputs = u_processor(images=image, return_tensors="pt").to(device)

        generated_ids = u_model.generate(pixel_values=inputs.pixel_values)
        # Decode with the processor that was passed in, not the notebook-level one.
        generated_text = u_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        generation.append(generated_text)

    return generation

In [21]:
# `test_df` is not created by any earlier cell, so on a fresh kernel this cell
# stopped with a NameError. Bind it to the held-out pandas split.
# NOTE(review): assumes the full test split was intended here; confirm.
test_df = test_split

test_generated_caption = generate_cap(test_df,model,processor)
if dataset_name == 'sd':
    test_reference_prompt = test_df['Prompt'].tolist()
elif dataset_name == 'coco':
    test_reference_prompt = test_df['caption'].tolist()

In [22]:
# Side-by-side table of each test image, its generated caption and its reference prompt.
test_compare_df = pd.DataFrame({'image_path':test_df['image_path'],'generated_caption':test_generated_caption,'reference_prompt':test_reference_prompt})

# to_parquet does not create missing folders, so make sure the target directory exists first.
# NOTE(review): this path uses `blip2_DI_` while the final model is saved under `blip2_di_`; confirm which is intended.
result_path = f'blip2_DI_{dataset_name}/result/blip2_{split_size}_{num_epoches}_test.parquet'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
test_compare_df.to_parquet(result_path)

In [23]:
# Display the generated-vs-reference comparison table.
test_compare_df