# Multimodal Review Summarizer

#### Part II: Image Captioning

In [3]:
# Import necessary libraries
import json
import os
import logging

import pandas as pd
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# Route log records to two sinks: a log file on disk and the console stream.
log_handlers = [
    logging.FileHandler("image_captioning_pipeline.log"),
    logging.StreamHandler(),
]
# Records at INFO and above are emitted as "<time> - <level> - <message>".
logging.basicConfig(
    handlers=log_handlers,
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Define a class for the image captioning pipeline
class ImageCaptioningPipeline:
    """Caption review images with a BLIP model and attach captions to reviews.

    The methods are meant to be called in this order: ``load_reviews``,
    ``extract_image_names``, ``load_images``, ``initialize_model``,
    ``generate_image_captions``, ``map_captions_to_reviews``, ``save_results``.
    """

    def __init__(self, reviews_path, image_size, image_folder, model_name):
        """Store the configuration; nothing is read or loaded here.

        Args:
            reviews_path: Path of the reviews JSON file.
            image_size: Prefix of the URL key read from each image entry,
                i.e. the key used is ``f"{image_size}_image_url"``.
            image_folder: Directory that holds the image files.
            model_name: Identifier handed to ``from_pretrained`` for both the
                processor and the model.
        """
        self.reviews_path = reviews_path
        self.image_size = image_size
        self.image_folder = image_folder
        self.model_name = model_name
        self.reviews_data = None
        self.df_reviews = None
        self.loaded_images = {}
        self.blip_processor = None
        self.blip_model = None

    def _image_filename(self, img):
        """Return the last ``/``-separated segment of the image's URL.

        Raises:
            KeyError: If ``img`` has no ``f"{image_size}_image_url"`` key.
        """
        return img[f"{self.image_size}_image_url"].split("/")[-1]

    @staticmethod
    def _strip_prompt(caption, prompt):
        """Remove ``prompt`` from the start of ``caption`` (case-insensitive).

        Only a leading occurrence is removed: slicing ``len(prompt)``
        characters off is only meaningful when the caption actually begins
        with the prompt. Captions that do not start with the prompt (or an
        empty/``None`` prompt) are returned unchanged.
        """
        if prompt and caption.lower().startswith(prompt.lower()):
            return caption[len(prompt):].strip()
        return caption

    def load_reviews(self):
        """Loads the reviews JSON file and converts it to a DataFrame.

        Populates ``self.reviews_data`` (parsed JSON) and ``self.df_reviews``.

        Raises:
            Exception: Any error from reading/parsing the file or building the
                DataFrame is logged and re-raised.
        """
        try:
            with open(self.reviews_path, "r", encoding="utf-8") as f:
                self.reviews_data = json.load(f)
            self.df_reviews = pd.DataFrame(self.reviews_data)
            logging.info("Reviews data successfully loaded.")
        except Exception as e:
            logging.error(f"Failed to load reviews data: {e}")
            raise

    def extract_image_names(self):
        """Extracts image filenames from the reviews data.

        Reviews whose ``"images"`` entry is missing, ``None`` or empty
        contribute no filenames.

        Returns:
            list: One filename per image entry, in review order (duplicates
            are kept).

        Raises:
            Exception: Any failure (e.g. reviews not loaded, missing URL key)
                is logged and re-raised.
        """
        try:
            image_files = [
                self._image_filename(img)
                for review in self.reviews_data
                # `or []` also covers an explicit null, not just a missing key.
                for img in (review.get("images") or [])
            ]
            logging.info(f"Extracted {len(image_files)} image filenames.")
            return image_files
        except Exception as e:
            logging.error(f"Failed to extract image filenames: {e}")
            raise

    def load_images(self, image_names):
        """Loads images from the specified folder.

        Each image is converted to RGB and stored in ``self.loaded_images``
        keyed by filename. Images that cannot be loaded are skipped with a
        warning (best effort).

        Args:
            image_names: Iterable of filenames relative to ``image_folder``.
        """
        # dict.fromkeys de-duplicates while keeping order, so a file that is
        # referenced several times is only read once.
        for img_name in dict.fromkeys(image_names):
            img_path = os.path.join(self.image_folder, img_name)
            try:
                # The context manager closes the underlying file; the RGB
                # conversion result is an independent in-memory image.
                with Image.open(img_path) as source_img:
                    self.loaded_images[img_name] = source_img.convert("RGB")
            except Exception as e:
                logging.warning(f"Error loading image {img_name}: {e}")
        logging.info(f"Loaded {len(self.loaded_images)} images successfully.")

    def initialize_model(self):
        """Loads the BLIP model and processor.

        Raises:
            Exception: Any loading failure is logged and re-raised.
        """
        try:
            self.blip_processor = BlipProcessor.from_pretrained(self.model_name)
            self.blip_model = BlipForConditionalGeneration.from_pretrained(
                self.model_name
            )
            logging.info("BLIP model and processor successfully loaded.")
        except Exception as e:
            logging.error(f"Failed to initialize BLIP model and processor: {e}")
            raise

    def generate_image_captions(self, prompt):
        """Generates captions for the loaded images.

        Args:
            prompt: Text passed to the processor alongside each image; a
                leading copy of it is removed from the decoded caption.

        Returns:
            dict: Filename -> caption. An empty caption is stored as ``" "``.
            Images whose captioning fails are skipped with a warning.

        Raises:
            RuntimeError: If ``initialize_model`` has not been called.
        """
        # Fail once, loudly, instead of logging one warning per image and
        # returning an empty result.
        if self.blip_processor is None or self.blip_model is None:
            raise RuntimeError(
                "BLIP model is not initialized; call initialize_model() first."
            )

        captions = {}
        for img_name, img in self.loaded_images.items():
            try:
                inputs = self.blip_processor(
                    images=img, text=prompt, return_tensors="pt"
                )
                # Inference only: no gradients are needed.
                with torch.no_grad():
                    caption_ids = self.blip_model.generate(
                        **inputs,
                        max_length=128,
                        num_beams=5,
                        early_stopping=True,
                        no_repeat_ngram_size=2,
                        length_penalty=2.0,
                    )
                decoded_caption = self.blip_processor.decode(
                    caption_ids[0], skip_special_tokens=True
                )
                decoded_caption = self._strip_prompt(decoded_caption, prompt)
                captions[img_name] = decoded_caption or " "
                logging.info(f"Generated caption for {img_name}: {decoded_caption}")
            except Exception as e:
                logging.warning(f"Failed to generate caption for {img_name}: {e}")
        logging.info(f"Generated captions for {len(captions)} images.")
        return captions

    def map_captions_to_reviews(self, captions_map):
        """Maps generated captions to the reviews data.

        Adds an ``"image_caption"`` column to ``self.df_reviews`` holding the
        captions of each review's images joined with ``", "``. Images without
        a caption contribute an empty string; reviews without an image list
        get ``""``.

        Args:
            captions_map: Mapping of filename -> caption.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:

            def map_image_captions(images):
                # Rows with no image list (NaN from a missing key, or null in
                # the JSON) are not iterable; treat them as "no images".
                if not isinstance(images, (list, tuple)):
                    return ""
                return ", ".join(
                    captions_map.get(self._image_filename(img), "")
                    for img in images
                )

            self.df_reviews["image_caption"] = self.df_reviews["images"].apply(
                map_image_captions
            )
            logging.info("Mapped captions to reviews successfully.")
        except Exception as e:
            logging.error(f"Failed to map captions to reviews: {e}")
            raise

    def save_results(self, output_path):
        """Saves the DataFrame with captions to a JSON file.

        Args:
            output_path: Destination file; written as a list of record dicts.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            df_reviews_list = self.df_reviews.to_dict(orient="records")
            # Explicit encoding, matching how load_reviews reads its input.
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(df_reviews_list, f, indent=4)
            logging.info(f"Results successfully saved to {output_path}.")
        except Exception as e:
            logging.error(f"Failed to save results: {e}")
            raise

In [5]:
# Run settings: which image size to caption and where inputs/outputs live.
REVIEW_IMG = "medium"
DATA_DIR = "../data"
REVIEW_IMG_DIR = "/".join((DATA_DIR, "review_images", REVIEW_IMG))
REVIEWS = DATA_DIR + "/filtered_reviews_cleaned.json"
REVIEWS_CAPTION = DATA_DIR + "/filtered_reviews_with_img_captions.json"
MODEL_NAME = "Salesforce/blip-image-captioning-large"

# Text given to the model together with every image.
prompt = "The image contains a humidifier or its components which is or has"

pipeline = ImageCaptioningPipeline(
    reviews_path=REVIEWS,
    image_size=REVIEW_IMG,
    image_folder=REVIEW_IMG_DIR,
    model_name=MODEL_NAME,
)

# Stage 1: read the reviews and load the images they reference.
pipeline.load_reviews()
image_names = pipeline.extract_image_names()
pipeline.load_images(image_names)

# Stage 2: load the model and caption every loaded image.
pipeline.initialize_model()
captions = pipeline.generate_image_captions(prompt)

# Stage 3: attach the captions to the reviews and write the result to disk.
pipeline.map_captions_to_reviews(captions)
pipeline.save_results(REVIEWS_CAPTION)

2025-09-17 01:33:49,754 - INFO - Reviews data successfully loaded.
2025-09-17 01:33:49,759 - INFO - Extracted 917 image filenames.
2025-09-17 01:33:58,648 - INFO - Loaded 906 images successfully.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
2025-09-17 01:34:03,253 - INFO - BLIP model and processor successfully loaded.
2025-09-17 01:34:17,292 - INFO - Generated caption for 41AiZulKABL._SL800_.jpg: a blue light on it
2025-09-17 01:34:20,697 - INFO - Generated caption for 410tQ5qADNL._SL800_.jpg: been placed on the floor
2025-09-17 01:34:23,249 - INFO - Generated caption for 51MTMKxaCpL._SL800_.jpg: a high level of heat
2025-09-17 01:34:25,772 - INFO - Generat