In [1]:
import os
import glob
import pandas as pd

In [2]:
from frame_extraction import extract_frames
from image_captioning import load_captioning_model, generate_caption

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Identifier of the captioning model; passed to load_captioning_model as `model_name`.
MODEL_ID = "fancyfeast/llama-joycaption-alpha-two-hf-llava"
# Sample still image. In the visible part of this notebook it is referenced only
# by the commented-out single-image caption test.
IMAGE_PATH_1 = "test_image/image.png"

In [4]:
# Load the captioning model once; the processor, model and device returned
# here are reused by every generate_caption call further down.
processor, model, device = load_captioning_model(
    model_name=MODEL_ID,
    device_map="auto",
)

Loading model 'fancyfeast/llama-joycaption-alpha-two-hf-llava'...
Using bfloat16.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00,  5.40s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


Model 'fancyfeast/llama-joycaption-alpha-two-hf-llava' loaded successfully on device: cuda:0


In [5]:
# caption1 = generate_caption(
#                 image_path=IMAGE_PATH_1,
#                 processor=processor,
#                 model=model,
#                 device=device
#             )

# print(caption1)

# Step 1: Split video into frames

In [6]:
# Settings for the frame-extraction step.
# Video to sample; its basename is later used as the video id.
input_video_file = 'test_video/_MXxJT8Mk4k.mp4'
# Folder handed to extract_frames and later globbed for images to caption.
output_frames_folder = 'video_frames_output'
interval = 30 # Keep one frame out of every `interval` frames read from the video
# Image extension without the leading dot; also used to build the "*.<ext>" glob pattern.
img_format = 'png'

In [7]:
# Sample the video: save every `interval`-th frame into `output_frames_folder`
# using the `img_format` image format.
extract_frames(
    input_video_file,
    output_frames_folder,
    frame_interval=interval,
    image_format=img_format,
)

Output directory 'video_frames_output' already exists. Files may be overwritten.
------------------------------
Video Info:
  Path: test_video/_MXxJT8Mk4k.mp4
  FPS: 25.00
  Total Frames: 328
  Duration: 13.12 seconds
  Output Folder: video_frames_output
  Frame Interval: 30 (saving every 30 frame(s))
  Image Format: .png
------------------------------
------------------------------
Finished processing.
Total frames read from video: 328
Total frames saved to 'video_frames_output': 11
------------------------------


# Step 2: Caption all frames

In [8]:
# Caption every extracted frame and collect the results in `df_captions`
# (one row per frame, columns `image_name` and `caption`). A frame whose
# caption could not be produced keeps `caption=None`, so it still appears in
# the table and is counted as a failure in the summary printed at the end.
if os.path.isdir(output_frames_folder):
    # Sorting the glob result processes the frames in filename order.
    search_pattern = os.path.join(output_frames_folder, f"*.{img_format}")
    image_files = sorted(glob.glob(search_pattern))

    if image_files:
        print(f"Found {len(image_files)} image(s) to caption.")

        caption_data_list = []

        for i, image_path in enumerate(image_files):
            image_filename = os.path.basename(image_path)
            print(f"\nProcessing image {i+1}/{len(image_files)}: {image_filename}")

            # Start from None so that a raised exception is recorded as a
            # missing caption instead of aborting the whole run.
            caption_result = None
            try:
                caption_result = generate_caption(
                    image_path=image_path,
                    processor=processor,
                    model=model,
                    device=device
                )

                if caption_result is None:
                    print(f"Skipping caption for {image_filename} due to generation error.")
                else:
                    print(f"Caption: {caption_result}")

            except Exception as e:
                # Best effort: report the problem and move on to the next frame.
                print(f"An unexpected error occurred while processing {image_filename}: {e}")

            # Recorded for every frame, successful or not.
            caption_data_list.append(dict(image_name=image_filename, caption=caption_result))

        print("\n--- Caption Generation Complete ---")
        print(f"Processed {len(image_files)} images.")

        print("Creating DataFrame...")
        df_captions = pd.DataFrame(caption_data_list)
        print("DataFrame created successfully.")

        # Non-null captions are the successful ones; the remainder failed.
        successful_captions = df_captions['caption'].notna().sum()
        failed_captions = len(df_captions) - successful_captions
        print(f"Successfully generated {successful_captions} captions.")
        print(f"Failed to generate captions for {failed_captions} images.")

        print("\nDataFrame Head:")
        print(df_captions.head())
        print("\nDataFrame Info:")
        df_captions.info()
    else:
        # Folder exists but holds no image with the expected extension.
        print(f"No images with format '.{img_format}' found in '{output_frames_folder}'.")
        df_captions = pd.DataFrame(columns=['image_name', 'caption'])
else:
    # Frame extraction has not produced its output folder yet.
    print(f"Error: Output folder '{output_frames_folder}' not found. Please run Step 1 first.")
    df_captions = pd.DataFrame(columns=['image_name', 'caption'])

Found 11 image(s) to caption.

Processing image 1/11: frame_000000.png
Caption: This is a photograph of a pair of hands using scissors to cut open a clear plastic glove against a bright blue background. The word "glove" is prominently displayed in white text on a red square at the top of the image. The glove has a shiny, reflective texture, and the scissors have black handles. The scene is brightly lit, emphasizing the crispness of the glove and the sharpness of the scissors.

Processing image 2/11: frame_000001.png
Caption: This is a photograph featuring a close-up of a hand pouring white, crystalline vinegar from a transparent glass bottle into a clear glass measuring cup against a bright blue background. The hand, with light skin, holds the bottle with a firm grip. The word "vinegar" is prominently displayed in bold, white text within a red rectangular box at the top of the image. The overall scene is simple and clean, emphasizing the clear liquids and the vibrant blue background.



In [10]:
# Preview the first rows of the per-frame caption table.
df_captions.head()

Unnamed: 0,image_name,caption
0,frame_000000.png,This is a photograph of a pair of hands using ...
1,frame_000001.png,This is a photograph featuring a close-up of a...
2,frame_000002.png,This photograph features a close-up of a hand ...
3,frame_000003.png,The image is a photograph featuring a close-up...
4,frame_000004.png,This is a photograph featuring a close-up of t...


# Step 3: Map captions to video ID

In [11]:
# Collapse the per-frame captions into a single summary row for this video.
# Result: `df_video_summary` with the columns `video_id` and `captions_array`
# (the list of all captions that were generated successfully).
# The video id is the file name of the input video, extension included.
video_id = os.path.basename(input_video_file)

# --- Processing ---
# Existence must be checked BEFORE `df_captions` is touched: evaluating
# `df_captions.empty` on a fresh kernel raises NameError, which would make
# the "not found" hint below unreachable.
if 'df_captions' not in globals():
    print("Error: 'df_captions' DataFrame not found. Please run the caption generation and initial DataFrame creation cells first.")
    # Define an empty DataFrame to avoid errors if referenced later
    df_video_summary = pd.DataFrame(columns=['video_id', 'captions_array'])

elif df_captions.empty:
    print("The 'df_captions' DataFrame is empty. Cannot create video summary DataFrame.")
    # Define an empty DataFrame to avoid errors if referenced later
    df_video_summary = pd.DataFrame(columns=['video_id', 'captions_array'])

else:
    # Extract the 'caption' column
    captions_series = df_captions['caption']

    # Filter out any None or NaN values (captions that failed generation)
    successful_captions_list = captions_series.dropna().tolist()

    if not successful_captions_list:
        print(f"Warning: No successful captions were generated for video '{video_id}'. The captions array will be empty.")
    else:
        print(f"Found {len(successful_captions_list)} successful captions to consolidate for video '{video_id}'.")

    # One row for this video; the whole caption list is stored in a single cell.
    video_summary_data = [
        {
            'video_id': video_id,
            'captions_array': successful_captions_list
        }
    ]

    # Create the final summary DataFrame
    df_video_summary = pd.DataFrame(video_summary_data)

    print("\nVideo Summary DataFrame:")
    # Pandas might truncate the display of the list, which is normal
    print(df_video_summary)

    # Show the untruncated list stored in the first (and only) row.
    print("\nContent of 'captions_array' for the first video:")
    print(df_video_summary.iloc[0]['captions_array'])

Found 11 successful captions to consolidate for video '_MXxJT8Mk4k.mp4'.

Video Summary DataFrame:
          video_id                                     captions_array
0  _MXxJT8Mk4k.mp4  [This is a photograph of a pair of hands using...

Content of 'captions_array' for the first video:
['This is a photograph of a pair of hands using scissors to cut open a clear plastic glove against a bright blue background. The word "glove" is prominently displayed in white text on a red square at the top of the image. The glove has a shiny, reflective texture, and the scissors have black handles. The scene is brightly lit, emphasizing the crispness of the glove and the sharpness of the scissors.', 'This is a photograph featuring a close-up of a hand pouring white, crystalline vinegar from a transparent glass bottle into a clear glass measuring cup against a bright blue background. The hand, with light skin, holds the bottle with a firm grip. The word "vinegar" is prominently displayed in bold, whit

In [12]:
# Preview the per-video summary table (one row per video).
df_video_summary.head()

Unnamed: 0,video_id,captions_array
0,_MXxJT8Mk4k.mp4,[This is a photograph of a pair of hands using...


# Step 4: Send captions array and questions to LLM