In [1]:
import os
import pandas as pd
from PIL import Image
from datasets import Dataset, load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
import torch

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
W0606 03:02:23.365138 22476 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(


In [6]:
model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)



In [7]:
# 3. Freeze Vision Encoder
for param in model.vision_model.parameters():
    param.requires_grad = False

In [8]:
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd

class ImageCaptionDataset(Dataset):
    def __init__(self, csv_path, processor, max_length=64):
        self.data = pd.read_csv(csv_path)
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = self.data.iloc[idx]["img_path"]
        caption = self.data.iloc[idx]["caption"]

        image = Image.open(image_path).convert("RGB")
        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )

        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": inputs["input_ids"].squeeze(0)
        }


train_dataset = ImageCaptionDataset("veriseti.csv", processor)


In [9]:
train_dataset

<__main__.ImageCaptionDataset at 0x20fd185edc0>

In [10]:
sample = train_dataset[0]
print(sample.keys())  # hangi alanlar var
print(sample["pixel_values"].shape)  # görüntü tensor'ü
print(sample["input_ids"][:10])  # caption token'ları


dict_keys(['pixel_values', 'input_ids', 'attention_mask', 'labels'])
torch.Size([3, 384, 384])
tensor([  101,  1996,  3746,  2838,  1037,  5888, 27983,  5997, 10775,  1037])


In [11]:
# 5. Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./blip_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    remove_unused_columns=False,
    report_to="none"
)

def custom_collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    input_ids = torch.stack([item["input_ids"] for item in batch])
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor,
    data_collator=custom_collate_fn
)

In [12]:
trainer.train()



Step,Training Loss
10,8.2799
20,6.8494
30,6.0984
40,5.3162
50,4.5565
60,3.8214
70,3.1818
80,2.6087
90,2.2896
100,2.2047




TrainOutput(global_step=8013, training_loss=1.8704246762955397, metrics={'train_runtime': 47553.9092, 'train_samples_per_second': 1.348, 'train_steps_per_second': 0.169, 'total_flos': 3.803904953127626e+19, 'train_loss': 1.8704246762955397, 'epoch': 3.0})

In [13]:
from PIL import Image
import torch

def generate_caption_from_model(image_path, model, processor, max_length=64, device=None):
    """
    Eğitilmiş (RAM'deki) model ile görselden açıklama üret.

    Args:
        image_path (str): Test edilecek görselin dosya yolu
        model: Fine-tune edilmiş BLIP PyTorch modeli (RAM'de yüklü)
        processor: BLIPProcessor nesnesi (aynı tokenizer+preprocess)
        max_length (int): Üretilecek caption'ın maksimum uzunluğu
        device (str): Kullanılacak cihaz ("cuda" / "cpu"). Otomatik seçilir.

    Returns:
        str: Üretilen caption
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.to(device)
    model.eval()

    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(
            pixel_values=inputs["pixel_values"],
            max_length=max_length
        )

    caption = processor.decode(output_ids[0], skip_special_tokens=True)
    return caption


In [16]:
caption = generate_caption_from_model("C:/Users/omerf/Desktop/stajveri/train/7.jpg", model, processor)
print("Üretilen Caption:", caption)


Üretilen Caption: a man in a suit speaks at a podium while a woman and a man are seated nearby


In [25]:
import pandas as pd
from datasets import Dataset

data = pd.read_csv("veriseti.csv")  # içinde img_path ve caption olan CSV
raw_dataset = Dataset.from_pandas(data)


In [37]:
# Kayıt klasörü (değiştirebilirsin)
save_dir = "./blip_finetuned"

# Modeli kaydet
model.save_pretrained(save_dir)

# Processor'ı (hem tokenizer hem image processor) kaydet
processor.save_pretrained(save_dir)

print(f"Model ve processor '{save_dir}' klasörüne kaydedildi.")


Model ve processor './blip_finetuned' klasörüne kaydedildi.


In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import Dataset
from PIL import Image
import evaluate
import torch
import pandas as pd


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
W0606 16:48:26.860500 23016 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(


In [2]:
def evaluate_bleu_from_saved_model(csv_path,model_dir,n=100,n_gram=4,img_col="img_path",caption_col="caption",device=None):
    """
    Kaydedilmiş BLIP modelini kullanarak bir CSV dataset üzerinden BLEU skorunu hesaplar.

    Args:
        csv_path (str): Caption dataset'in CSV yolu (img_path, caption içermeli)
        model_dir (str): save_pretrained ile kaydedilen klasör
        n (int): İlk kaç örnekte test yapılacak
        n_gram (int): BLEU n-gram seviyesi (varsayılan 4)
        img_col (str): CSV'deki görsel yolu sütun adı
        caption_col (str): CSV'deki caption sütun adı
        device (str): "cuda" veya "cpu" (varsayılan: otomatik seçim)

    Returns:
        float: BLEU skoru (0-1 arası)
    """
    # Cihaz ayarı
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model ve processor yükle
    model = BlipForConditionalGeneration.from_pretrained(model_dir).to(device)
    processor = BlipProcessor.from_pretrained(model_dir)
    model.eval()

    # Dataset yükle
    df = pd.read_csv(csv_path)
    dataset = Dataset.from_pandas(df)

    # BLEU metrik
    bleu_metric = evaluate.load("bleu")
    references = []
    predictions = []

    for i in range(min(n, len(dataset))):
        row = dataset[i]
        image = Image.open(row[img_col]).convert("RGB")
        ref_caption = row[caption_col]

        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            output_ids = model.generate(pixel_values=inputs["pixel_values"], max_length=64)

        gen_caption = processor.decode(output_ids[0], skip_special_tokens=True)

        references.append([ref_caption])  # dikkat! liste içinde referans
        predictions.append(gen_caption)

    result = bleu_metric.compute(predictions=predictions, references=references, max_order=n_gram)
    return result["bleu"]


In [3]:
score = evaluate_bleu_from_saved_model(csv_path="veriseti.csv",model_dir="./blip_finetuned",n=100)

print(f"BLEU-4 skoru (ilk 100 örnek): {score:.4f}")


BLEU-4 skoru (ilk 100 örnek): 0.2752


In [4]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

def generate_caption_from_saved_model(image_path, model_dir="./blip_finetuned", max_length=64, device=None):
    """
    Kayıtlı (save_pretrained) BLIP modelinden görsel için caption üretir.

    Args:
        image_path (str): Test etmek istediğin görselin yolu.
        model_dir (str): Modelin ve processor'ın kayıtlı olduğu klasör.
        max_length (int): Üretilecek caption uzunluğu (varsayılan 64).
        device (str): "cuda" veya "cpu" (varsayılan: otomatik seçilir).

    Returns:
        str: Üretilen caption.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Kayıtlı model ve processor'ı yükle
    model = BlipForConditionalGeneration.from_pretrained(model_dir).to(device)
    processor = BlipProcessor.from_pretrained(model_dir)
    model.eval()

    # Görseli yükle
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Caption üret
    with torch.no_grad():
        output_ids = model.generate(pixel_values=inputs["pixel_values"], max_length=max_length)

    caption = processor.decode(output_ids[0], skip_special_tokens=True)
    return caption


In [8]:
caption = generate_caption_from_saved_model("C:/Users/omerf/Desktop/yatak.jpg", model_dir="./blip_finetuned")
print("Üretilen Caption:", caption)


Üretilen Caption: a cozy bedroom scene featuring a bed with a patterned blanket a wooden table and a window with curtains


In [None]:
import os
os._exit(00)  # Kernel'i yeniden başlatır