# Notes from example
- telling names for functions
- use type hinting for arguments and function returns
- 1 feature per function

# TODOs

- [ ] dataset import as recommended by Cagatay
- [ ] detailed explanation and annotation in english
- [ ] Discuss whether post-processing/formatting of the images, e.g. color map, inversion of the grayscale, further processing of the real depth values should be carried out (apply_model)
- [ ] Adapt the postfix in job_agent() or the string value of the prediction_models.value if desired

Optionals:
- OPTIONAL: If desired, integrate a appropriate return value or any feedback messages
- OPTIONAL: Alterantive way to work with a file list instead of directory

# Produktiv-Code

In [None]:
import cv2
import numpy as np
from accelerate.test_utils.testing import get_backend
from transformers import pipeline, Pipeline
from typing import Optional
from PIL import Image
from tqdm import tqdm
from enum import Enum
import glob
import os
from pathlib import Path


In [65]:
# Video Handling
#TODO: Data-Import like in Example

def open_video(path: str) -> cv2.VideoCapture:
    """
    Opens a video file using OpenCV.

    Args:
        path (str): Path to the video file.

    Returns:
        cv2.VideoCapture: The opened video capture object.

    Raises:
        RuntimeError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        exit()
    return cap

def next_image_from_video(cap: cv2.VideoCapture) -> Optional[Image.Image]:
    ret, frame = cap.read()                                 #frame as array with bgr values
    if not ret:                                             #if read was not successfull 
        return None

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)      #Image as Array with RGB Values
    image = Image.fromarray(frame_rgb)                      #PIL Image
    return image

def image_to_video(image: Image.Image, video_writer: cv2.VideoWriter):
    #convert the video to the required format
    image_np_rgb = np.array(image)
    depth_bgr = cv2.cvtColor(image_np_rgb, cv2.COLOR_RGB2BGR)

    # write frame to output video file
    video_writer.write(depth_bgr)

class prediction_models(Enum):
    """
    Enumeration of available prediction model types in our pipeline.

    Attributes:
        GRAYSCALE: converts the video from rgb to grayscale - for for test purposes, a low-computing alternative
        DEPTH_ANYTHING_V2: Represents the 'Depth Anything V2' model with Hugging Face ID 'Depth-Anything-V2-base-hf'.
    """
    GRAYSCALE = "gray"
    DEPTH_ANYTHING_V2 = "Depth-Anything-V2-base-hf"

def apply_model(frame: Image.Image ,model_selection: prediction_models) -> Image.Image:
    device, _, _ = get_backend()
    if model_selection == prediction_models.DEPTH_ANYTHING_V2:
        checkpoint = "depth-anything/Depth-Anything-V2-base-hf"
        pipe = pipeline("depth-estimation", model=checkpoint, device=device)
        predictions = pipe(frame)
        image_w_pred = predictions['depth']


    if model_selection == prediction_models.GRAYSCALE:
        image_rgb = np.array(frame)
        image_w_pred = Image.fromarray(cv2.cvtColor(image_rgb,cv2.COLOR_RGB2GRAY))
        

    # If all models are called in the same way via the transformers library/pipeline, we can remove the general part and only put the parameterization in the if clause
    
    return image_w_pred

def get_videowriter(cap: cv2.VideoCapture, target_path: str) -> cv2.VideoWriter:
    """
    Creates a cv2.VideoWriter object based on the properties of an existing VideoCapture.

    Args:
        cap (cv2.VideoCapture): OpenCV video capture object from which to read properties (width, height, fps).
        target_path (str): Path to the output video file (e.g., "output.mp4").

    Returns:
        cv2.VideoWriter: OpenCV video writer object configured with H.264 codec.
    """
    # get the video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    

    # set the output stream (MP4 mit H.264 - should provide a good video player compatibility) 
    fourcc = cv2.VideoWriter_fourcc(*'H264')  
    video_writer = cv2.VideoWriter(target_path, fourcc, fps, (width, height), isColor=True)
    return video_writer

def convert_video(src_path: str, target_path: str,selected_model:prediction_models,test_mode = False):
    # run through the video frame by frame, look for errors in the image
    cap = open_video(src_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    #initialize the video output 
    video_writer = get_videowriter(cap,target_path)

    #test_mode: if activated only the first 5 seconds will be converted in order to need less computing time
    if test_mode==True:
        frame_count = int(5*fps)
    fps

    for i in tqdm(range(frame_count),desc="Run through video frame per frame"):
        frame = next_image_from_video(cap)   #PIL Image
        if frame == None:
            tqdm.write("WARNUNG: Kein weiteres Bild gelesen - Video zu Ende oder Fehler beim Zugriff.")
            break
        new_image = apply_model(frame, model_selection=selected_model)
        image_to_video(new_image,video_writer)
    
    cap.release()
    video_writer.release() #After release of video_writer the video will be stored

def create_target_path(path_file: str,target_dir: str, selected_model: prediction_models) -> str:
    path_valid = True
    counter = 0
    postfix_raw = selected_model.value
    postfix = postfix_raw
    while(path_valid):
        path_valid = False
        basename = Path(path_file).stem
        target_name = basename+"_"+postfix+".mp4"
        #target_dir = os.path.split(file)[0]
        target_dir = target_dir
        target_path = target_dir + r"\\" + target_name
        if os.path.exists(target_path):
            counter+=1
            path_valid = True
            postfix = postfix_raw + "_"+str(counter)
    
    return target_path

def job_agent(src_directory: str,prediction_models_list: list[prediction_models],target_directory: str):
    #OPTIONAL: If desired, integrate a appropriate return value or any feedback messages
    #OPTIONAL: Alterantive way to work with a file list instead of directory
    #TODO adapt postfix if desired
    #not tested yet
    mp4_files = glob.glob(os.path.join(src_directory,"*.mp4"))
    for file in mp4_files:
        for model in prediction_models_list:
            target_path = create_target_path(file,target_directory,model)
            convert_video(file,target_path,model)


In [66]:
#example
src_directory = r"C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img"
prediction_models_list = [prediction_models.DEPTH_ANYTHING_V2]
target_directory = r"C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\output"
job_agent(src_directory,prediction_models_list,target_directory)

Run through video frame per frame:   0%|          | 0/554 [00:00<?, ?it/s]Device set to use cpu
Run through video frame per frame:   0%|          | 1/554 [00:03<35:00,  3.80s/it]Device set to use cpu
Run through video frame per frame:   0%|          | 2/554 [00:07<32:30,  3.53s/it]Device set to use cpu
Run through video frame per frame:   1%|          | 3/554 [00:10<30:44,  3.35s/it]Device set to use cpu
Run through video frame per frame:   1%|          | 4/554 [00:13<30:19,  3.31s/it]Device set to use cpu
Run through video frame per frame:   1%|          | 5/554 [00:16<29:20,  3.21s/it]Device set to use cpu
Run through video frame per frame:   1%|          | 6/554 [00:19<29:17,  3.21s/it]Device set to use cpu
Run through video frame per frame:   1%|▏         | 7/554 [00:23<29:32,  3.24s/it]Device set to use cpu
Run through video frame per frame:   1%|▏         | 8/554 [00:26<29:06,  3.20s/it]Device set to use cpu
Run through video frame per frame:   2%|▏         | 9/554 [00:29<29:05, 

KeyboardInterrupt: 

# Test Snippets

### Extract single example image

In [5]:
#extract single image from video in variable image_example [Image.Image]
import pandas as pd

path_example = r"C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img\episode_000002.mp4"
cap = cv2.VideoCapture(path_example)
if not cap.isOpened():
    print("Error: Could not open video.")
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

df_video_data = pd.DataFrame({
        'Breite (px)': [width],
        'Höhe (px)': [height],
        'FPS:':[fps],
        'frames_':[frame_count]
    })

display(df_video_data)


frame_number = 315

if frame_number<frame_count:
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
    print("cap - set")
    ret, frame = cap.read()
    print("cap - read ")
    if ret:             #read was sucesssful
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)      #Image as Array with RGB Values
        image_example = Image.fromarray(frame_rgb)  
        image_example.show()
        #image_example.save(r"C:\Users\lehrm\Downloads\images\cam1.jpg")
    else:
        print("Warning: Frame could not be read!")


cap.release()


Unnamed: 0,Breite (px),Höhe (px),FPS:,frames_
0,320,240,10.0,528


cap - set
cap - read 


### Test the model application on single image

In [6]:
new_image_example = apply_model(image_example,model_selection=prediction_models.DEPTH_ANYTHING_V2)
new_image_example.show()
#new_image_example.save(r"C:\Users\lehrm\Downloads\images\cam1_depth.jpg")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu


### Simple run on a video - using the test_mode with editing only 5 sec of the given video

In [51]:
path_output = r"C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\output\grayscale2.mp4"
convert_video(path_example, path_output,prediction_models.DEPTH_ANYTHING_V2,test_mode=True)

Run through video frame per frame:   0%|          | 0/20 [00:00<?, ?it/s]Device set to use cpu
Run through video frame per frame:   5%|▌         | 1/20 [00:03<01:09,  3.64s/it]Device set to use cpu
Run through video frame per frame:  10%|█         | 2/20 [00:06<00:59,  3.32s/it]Device set to use cpu
Run through video frame per frame:  15%|█▌        | 3/20 [00:10<00:59,  3.52s/it]Device set to use cpu
Run through video frame per frame:  20%|██        | 4/20 [00:14<00:57,  3.57s/it]Device set to use cpu
Run through video frame per frame:  25%|██▌       | 5/20 [00:18<00:55,  3.73s/it]Device set to use cpu
Run through video frame per frame:  30%|███       | 6/20 [00:21<00:52,  3.75s/it]Device set to use cpu
Run through video frame per frame:  35%|███▌      | 7/20 [00:25<00:48,  3.72s/it]Device set to use cpu
Run through video frame per frame:  40%|████      | 8/20 [00:29<00:44,  3.67s/it]Device set to use cpu
Run through video frame per frame:  45%|████▌     | 9/20 [00:32<00:39,  3.59s/it]

### Test Datei-Filter

In [None]:
import glob
import os

path_file_dir = r"C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img"

mp4_files = glob.glob(os.path.join(path_file_dir,"*.mp4"))

for datei in mp4_files:
    print(datei)

C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img\episode_000000.mp4
C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img\episode_000001.mp4
C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img\episode_000002.mp4
C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img\episode_000003.mp4
C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_lokal\repos\2025_p03_policy_learning\dataset\studytable_open_drawer\videos\chunk-000\observation.image.camera1_img\episode_000004.mp4
C:\Users\lehrm\Daten\Arbeit_u_Studium\Studium\5_Master_