In [1]:
import os
import cv2
from SegTracker import SegTracker
from model_args import aot_args,sam_args,segtracker_args
from PIL import Image
from aot_tracker import _palette
import numpy as np
import torch
import imageio
import matplotlib.pyplot as plt
from scipy.ndimage import binary_dilation
import gc



def save_prediction(pred_mask,output_dir,file_name):
    save_mask = Image.fromarray(pred_mask.astype(np.uint8))
    save_mask = save_mask.convert(mode='P')
    save_mask.putpalette(_palette)
    save_mask.save(os.path.join(output_dir,file_name))
def colorize_mask(pred_mask):
    save_mask = Image.fromarray(pred_mask.astype(np.uint8))
    save_mask = save_mask.convert(mode='P')
    save_mask.putpalette(_palette)
    save_mask = save_mask.convert(mode='RGB')
    return np.array(save_mask)
def draw_mask(img, mask, alpha=0.7, id_countour=False):
    img_mask = np.zeros_like(img)
    img_mask = img
    if id_countour:
        # very slow ~ 1s per image
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids!=0]

        for id in obj_ids:
            # Overlay color on  binary mask
            if id <= 255:
                color = _palette[id*3:id*3+3]
            else:
                color = [0,0,0]
            foreground = img * (1-alpha) + np.ones_like(img) * alpha * np.array(color)
            binary_mask = (mask == id)

            # Compose image
            img_mask[binary_mask] = foreground[binary_mask]

            countours = binary_dilation(binary_mask,iterations=1) ^ binary_mask
            img_mask[countours, :] = 0
    else:
        binary_mask = (mask!=0)
        countours = binary_dilation(binary_mask,iterations=1) ^ binary_mask
        foreground = img*(1-alpha)+colorize_mask(mask)*alpha
        img_mask[binary_mask] = foreground[binary_mask]
        img_mask[countours,:] = 0
        
    return img_mask.astype(img.dtype)

def create_directories(path):    
    dir_path = os.path.dirname(path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        print(f"Directory created: {dir_path}")
    else:
        print(f"Directory already exists: {dir_path}")

    return path

def video_detection(io_args, segtracker_args, sam_args, aot_args, grounding_caption, box_threshold, text_threshold, box_size_threshold, reset_image):
    # source video to segment
    import shutil
    shutil.copy(io_args['input_video'], io_args['original_video'])
    
    cap = cv2.VideoCapture(io_args['input_video'])
    fps = cap.get(cv2.CAP_PROP_FPS)
    # output masks
    output_dir = io_args['output_mask_dir']
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pred_list = []
    masked_pred_list = []
    det_count_one = []
    det_count_frames = []
    
    torch.cuda.empty_cache()
    gc.collect()
    sam_gap = segtracker_args['sam_gap']
    frame_idx = 0
    segtracker = SegTracker(segtracker_args, sam_args, aot_args)
    segtracker.restart_tracker()
    
    with torch.cuda.amp.autocast():
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
            if frame_idx == 0:
                pred_mask, _ = segtracker.detect_and_seg(frame, grounding_caption, box_threshold, text_threshold, box_size_threshold, reset_image)
                torch.cuda.empty_cache()
                gc.collect()
                segtracker.add_reference(frame, pred_mask)
            elif (frame_idx % sam_gap) == 0:
                seg_mask, _ = segtracker.detect_and_seg(frame, grounding_caption, box_threshold, text_threshold, box_size_threshold, reset_image)
                save_prediction(seg_mask, './debug/seg_result', str(frame_idx)+'.png')
                torch.cuda.empty_cache()
                gc.collect()
                track_mask = segtracker.track(frame)
                save_prediction(track_mask, './debug/aot_result', str(frame_idx)+'.png')
                new_obj_mask = segtracker.find_new_objs(track_mask, seg_mask)
                if np.sum(new_obj_mask > 0) >  frame.shape[0] * frame.shape[1] * 0.4:
                    new_obj_mask = np.zeros_like(new_obj_mask)
                save_prediction(new_obj_mask,output_dir,str(frame_idx)+'_new.png')
                pred_mask = track_mask + new_obj_mask
                segtracker.add_reference(frame, pred_mask)
            else:
                pred_mask = segtracker.track(frame,update_memory=True)
            torch.cuda.empty_cache()
            gc.collect()
            
            save_prediction(pred_mask,output_dir,str(frame_idx)+'.png')
            pred_list.append(pred_mask)

            obj_ids = np.unique(pred_mask)
            obj_ids = obj_ids[obj_ids!=0]
            det_count_frames.append(len(obj_ids))

            print("processed frame {}, obj_num {}".format(frame_idx,segtracker.get_obj_num()),end='\r')
               
            frame_idx += 1
        # # det_count_one = np.sum(det_count_frames)/frame_idx
        # det_count_avg = np.sum(det_count_frames) / frame_idx
        # det_count_one = np.sum(np.abs(det_count_frames - det_count_avg))/frame_idx
        # cap.release()
        
        # draw pred mask on frame and save as a video
        cap = cv2.VideoCapture(io_args['input_video'])
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
        if io_args['input_video'][-3:]=='mp4':
            fourcc =  cv2.VideoWriter_fourcc(*"mp4v")
        elif io_args['input_video'][-3:] == 'avi':
            fourcc =  cv2.VideoWriter_fourcc(*"MJPG")
            # fourcc = cv2.VideoWriter_fourcc(*"XVID")
        else:
            fourcc = int(cap.get(cv2.CAP_PROP_FOURCC))
        out = cv2.VideoWriter(io_args['output_video'], fourcc, fps, (width, height))
        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
            pred_mask = pred_list[frame_idx]
            masked_frame = draw_mask(frame,pred_mask)
            # masked_frame = masked_pred_list[frame_idx]
            masked_frame = cv2.cvtColor(masked_frame,cv2.COLOR_RGB2BGR)
            out.write(masked_frame)
            print('frame {} writed'.format(frame_idx),end='\r')
            frame_idx += 1
        out.release()
        cap.release()
        print("\n{} saved".format(io_args['output_video']))
        # save colorized masks as a gif
        duration = 1000 / fps
        imageio.mimsave(io_args['output_gif'],pred_list,duration=duration)
        print("{} saved".format(io_args['output_gif']))
        print('\nfinished')
        
    return pred_list, det_count_frames 


In [2]:
# print(video_paths[1])
# print(prompt_paths[1])

### Conduct evaluation using COCO classes

In [None]:
import cv2
import os
import re
import logging
import time
import wandb

#### Detection on videos ####
# task = 'fulljourney_videos'
task = 'pikavideos'
# Create the directory if it doesn't exist
timestamp = time.strftime("%Y%m%d-%H%M%S")

os.makedirs(f"./{task}/{timestamp}", exist_ok=True)
wandb.init(project="Vid Model Eval",name=str(timestamp)+"_"+task)

# Set up logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# File handler for writing logs to a file
file_handler = logging.FileHandler(filename=f"./{task}/{timestamp}/object_det_record.txt")
file_handler.setFormatter(logging.Formatter("%(asctime)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
logger.addHandler(file_handler)

# Stream handler for displaying logs in the terminal
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter("%(asctime)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
logger.addHandler(stream_handler)

# settings for detection function
# choose good parameters in sam_args based on the first frame segmentation result
# other arguments can be modified in model_args.py
# note the object number limit is 255 by default, which requires < 10GB GPU memory with amp
sam_args['generator_args'] = {
        'points_per_side': 30,
        'pred_iou_thresh': 0.8,
        'stability_score_thresh': 0.9,
        'crop_n_layers': 1,
        'crop_n_points_downscale_factor': 2,
        'min_mask_region_area': 200,
    }

# For every sam_gap frames, we use SAM to find new objects and add them for tracking
# larger sam_gap is faster but may not spot new objects in time
segtracker_args = {
    'sam_gap': 49, # the interval to run sam to segment new objects
    'min_area': 200, # minimal mask area to add a new mask as a new object
    'max_obj_num': 255, # maximal object number to track in a video
    'min_new_obj_iou': 0.8, # the area of a new object in the background should > 80% 
}

# Set Text args
'''
parameter:
    grounding_caption: Text prompt to detect objects in key-frames
    box_threshold: threshold for box 
    text_threshold: threshold for label(text)
    box_size_threshold: If the size ratio between the box and the frame is larger than the box_size_threshold, the box will be ignored. This is used to filter out large boxes.
    reset_image: reset the image embeddings for SAM
'''
# grounding_caption = "car.suv"
grounding_caption = "" #must have this class in the image, otherwise go wrong
box_threshold, text_threshold, box_size_threshold, reset_image = 0.6, 0.5, 0.5, True 

# COCO dataset
keywords = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
                   'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
                   'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
                   'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
                   'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
                   'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
                   'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
                   'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
                   'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
                   'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
                   'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
                   'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
                   'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
                   'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'] #coco classes, https://github.com/matlab-deep-learning/Object-Detection-Using-Pretrained-YOLO-v2/blob/main/+helper/coco-classes.txt
    
    
dir_videos = f'/apdcephfs_cq3/share_1290939/shadowcun/evals_dev/{task}/samples/'
# dir_prompts = f'/apdcephfs_cq3/share_1290939/shadowcun/evals_dev/{task}/prompts/'
# prompt_paths = [os.path.join(dir_prompts, x) for x in os.listdir('/apdcephfs_cq3/share_1290939/shadowcun/evals_dev/'+task+'/prompts')]
dir_prompts = f'/apdcephfs_cq3/share_1290939/raphaelliu/evals_dev/{task}/prompts_updated/'
prompt_paths = [os.path.join(dir_prompts, x) for x in os.listdir('/apdcephfs_cq3/share_1290939/raphaelliu/evals_dev/'+task+'/prompts_updated')]

video_paths = [os.path.join(dir_videos, os.path.splitext(os.path.basename(x))[0]+'.mp4') for x in prompt_paths]
prompt_paths_object = []
video_paths_object = []
det_count = []
keyword_object = []
prompt_object = []
det_count_mse_all = []
det_num = 0
object_count = 0
pos_num = 0
vid_num = 0
last_video_name = ' '
start_time = time.time()  # Start the timer
logger.info(f"Total video num {len(prompt_paths)}")
wandb.log({"Total video num": len(prompt_paths)
            })

for i in range(len(prompt_paths)):
    with open(prompt_paths[i], "r") as f:
        data = f.read() 
    for keyword in keywords:
        num = len(re.findall(r'\b' + re.escape(keyword) + r'(s|es)?\b', data, re.IGNORECASE))      
        if num > 0:
            video_name = os.path.splitext(os.path.basename(prompt_paths[i]))[0]
            if video_name != last_video_name:
                vid_num +=1
            last_video_name = video_name
            io_args = {
                'input_video': f'/apdcephfs_cq3/share_1290939/shadowcun/evals_dev/{task}/samples/{video_name}.mp4',
                'output_mask_dir': f'./{task}/{timestamp}/{keyword}/masks/{video_name}', # save pred masks
                'original_video': f'./{task}/{timestamp}/{keyword}/original_{video_name}.mp4', 
                'output_video': f'./{task}/{timestamp}/{keyword}/mask_{video_name}.mp4', # mask+frame vizualization, mp4 or avi, else the same as input video
                'output_gif': f'./{task}/{timestamp}/{keyword}/gif_{video_name}.gif', # mask visualization
            }
            path = create_directories(io_args['output_mask_dir'])
            
            ## detection 
            grounding_caption = keyword
            pred_list, det_count_frames = video_detection(io_args, segtracker_args, sam_args, aot_args, grounding_caption, box_threshold, text_threshold, box_size_threshold, reset_image)
            det_num += 1 
            det_count_avg = np.sum(det_count_frames) / len(det_count_frames)
            det_count_mse = np.sum(np.abs(det_count_frames - det_count_avg) / max(np.max(det_count_frames - det_count_avg), 0.001)) / len(det_count_frames)
            if det_count_avg:
                pos_num += 1
            det_count_mse_all.append(det_count_mse)
            prompt_paths_object.append(prompt_paths[i])
            keyword_object.append(keyword)
            prompt_object.append(data)
            acc  = pos_num / det_num
            object_metric = 1 - sum(det_count_mse_all) / det_num
            logging.info(f"Det num: {det_num}, Positive num: {pos_num}, Acc: {acc}, Step obj-gen score: {det_count_mse}, Whole obj-gen score: {object_metric}, Vid num: {vid_num}, Prompt: {data}, Path index: {i}, Video name: {video_name}")
            wandb.log({\
                "Positive rate": acc,
                "Obj-gen score":object_metric,
                "Det num": det_num,
                "Positive num": pos_num,
            })
            
object_metric = pos_num/det_num
# for i in range(len(obj_num)):
#     object_count += abs(det_count[i] - obj_num[i])/(et_count[i] + obj_num[i]) 
# object_count_metric = 1 - (object_count/len(obj_num))  
end_time = time.time()  # Stop the timer
elapsed_time = end_time - start_time
logger.info(f"Object generation metric: {object_metric:.2f} ")
# logger.info(f"N objects generation metric: {object_count_metric:.2f} ")
logger.info(f"Total processing time: {elapsed_time:.2f} seconds")

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666861826670356, max=1.0)…

2023-08-25 18:09:07 Total video num 53622
2023-08-25 18:09:07 Total video num 53622


Directory created: ./pikavideos/20230825-180853/bird/masks


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized




frame 23 writed 23, obj_num 0
./pikavideos/20230825-180853/bird/mask_Imagine_stepping_into_a_vast_and_enchanting_forest._Towering_trees_stretch_upwards_their_branches_f_seed6872407215330904296.mp4 saved


2023-08-25 18:09:20 Det num: 1, Positive num: 0, Acc: 0.0, Step obj-gen score: 0.0, Whole obj-gen score: 1.0, Vid num: 1, Prompt:  Imagine stepping into a vast and enchanting forest. Towering trees stretch upwards, their branches forming a lush canopy that filters sunlight and creates dancing patterns on the forest floor. The air is filled with the earthy aroma of damp moss, pine needles, and the subtle scent of wildflowers.  As you walk deeper into the woods, a sense of serenity envelops you. The ground is blanketed with a rich layer of fallen leaves and soft undergrowth, muffling your footsteps. The rustling of leaves and the distant call of birds create a soothing symphony of nature.  Shafts of sunlight pierce through the gaps in the canopy, illuminating patches of vibrant green ferns and delicate wildflowers. Mushrooms of various shapes and colors decorate the base of ancient trees, adding a touch of whimsy to the scene.  A small stream trickles nearby, its clear water bubbling ove

./pikavideos/20230825-180853/bird/gif_Imagine_stepping_into_a_vast_and_enchanting_forest._Towering_trees_stretch_upwards_their_branches_f_seed6872407215330904296.gif saved

finished
Directory created: ./pikavideos/20230825-180853/elephant/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 1
./pikavideos/20230825-180853/elephant/mask_Naruto_chasing_an_Elephant_in_Africa_anime_Naruto-punk_cute_cartoon_fast_action_seed12455689736558127025.mp4 saved


2023-08-25 18:09:31 Det num: 2, Positive num: 1, Acc: 0.5, Step obj-gen score: 0.0, Whole obj-gen score: 0.5, Vid num: 2, Prompt:  Naruto chasing an Elephant in Africa, anime Naruto-punk cute cartoon fast action  Author: @RubenTainoAI, Path index: 12, Video name: Naruto_chasing_an_Elephant_in_Africa_anime_Naruto-punk_cute_cartoon_fast_action_seed12455689736558127025
2023-08-25 18:09:31 Det num: 2, Positive num: 1, Acc: 0.5, Step obj-gen score: 0.0, Whole obj-gen score: 0.5, Vid num: 2, Prompt:  Naruto chasing an Elephant in Africa, anime Naruto-punk cute cartoon fast action  Author: @RubenTainoAI, Path index: 12, Video name: Naruto_chasing_an_Elephant_in_Africa_anime_Naruto-punk_cute_cartoon_fast_action_seed12455689736558127025


./pikavideos/20230825-180853/elephant/gif_Naruto_chasing_an_Elephant_in_Africa_anime_Naruto-punk_cute_cartoon_fast_action_seed12455689736558127025.gif saved

finished
Directory created: ./pikavideos/20230825-180853/dog/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 1
./pikavideos/20230825-180853/dog/mask_attractive_angry_female_zombie_wearing_dog_collar_and_chain_running_trying_to_attack_camera_fierce_seed14017085236677006407.mp4 saved


2023-08-25 18:09:41 Det num: 3, Positive num: 2, Acc: 0.6666666666666666, Step obj-gen score: 0.0, Whole obj-gen score: 0.3333333333333333, Vid num: 3, Prompt:  attractive angry female zombie wearing dog collar and chain, running, trying to attack camera fiercely and with determination. Ultra photo realistic 8k resolution --ar 16:9  Author: @IzzyAi, Path index: 14, Video name: attractive_angry_female_zombie_wearing_dog_collar_and_chain_running_trying_to_attack_camera_fierce_seed14017085236677006407
2023-08-25 18:09:41 Det num: 3, Positive num: 2, Acc: 0.6666666666666666, Step obj-gen score: 0.0, Whole obj-gen score: 0.3333333333333333, Vid num: 3, Prompt:  attractive angry female zombie wearing dog collar and chain, running, trying to attack camera fiercely and with determination. Ultra photo realistic 8k resolution --ar 16:9  Author: @IzzyAi, Path index: 14, Video name: attractive_angry_female_zombie_wearing_dog_collar_and_chain_running_trying_to_attack_camera_fierce_seed1401708523667

./pikavideos/20230825-180853/dog/gif_attractive_angry_female_zombie_wearing_dog_collar_and_chain_running_trying_to_attack_camera_fierce_seed14017085236677006407.gif saved

finished
Directory created: ./pikavideos/20230825-180853/horse/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 1
./pikavideos/20230825-180853/horse/mask_a_red_horse_with_robotic_legs_and_eyes_running_alongside_the_beach_-_ar_9_16_seed18364374028985219773.mp4 saved


2023-08-25 18:09:51 Det num: 4, Positive num: 3, Acc: 0.75, Step obj-gen score: 0.0, Whole obj-gen score: 0.25, Vid num: 4, Prompt:  a red horse with robotic legs and eyes, running alongside the beach - ar 9:16  Author: @imgcatz, Path index: 38, Video name: a_red_horse_with_robotic_legs_and_eyes_running_alongside_the_beach_-_ar_9_16_seed18364374028985219773
2023-08-25 18:09:51 Det num: 4, Positive num: 3, Acc: 0.75, Step obj-gen score: 0.0, Whole obj-gen score: 0.25, Vid num: 4, Prompt:  a red horse with robotic legs and eyes, running alongside the beach - ar 9:16  Author: @imgcatz, Path index: 38, Video name: a_red_horse_with_robotic_legs_and_eyes_running_alongside_the_beach_-_ar_9_16_seed18364374028985219773


./pikavideos/20230825-180853/horse/gif_a_red_horse_with_robotic_legs_and_eyes_running_alongside_the_beach_-_ar_9_16_seed18364374028985219773.gif saved

finished
Directory created: ./pikavideos/20230825-180853/bear/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 0
./pikavideos/20230825-180853/bear/mask_MY_TESTAMENT_When_I_am_dead_bury_me_In_my_beloved_Ukraine_My_tomb_upon_a_grave_mound_high_Amid_the_seed3133089823467916576.mp4 saved


2023-08-25 18:10:01 Det num: 5, Positive num: 3, Acc: 0.6, Step obj-gen score: 0.0, Whole obj-gen score: 0.2, Vid num: 5, Prompt:  MY TESTAMENT When I am dead, bury me In my beloved Ukraine, My tomb upon a grave mound high Amid the spreading plain, So that the fields, the boundless steppes, The Dnipro's plunging shore My eyes could see, my ears could hear The mighty river roar. When from Ukraine the Dnipro bears Into the deep blue sea The blood of foes ... then will I leave These hills and fertile fields -- I'll leave them all and fly away To the abode of God, And then I'll pray ....  But till that dayI nothing know of God. bury me, then rise ye up And break your heavy chains And water with the tyrants' blood The freedom you have gained. And in the great new family, The family of the free, With softly spoken, kindly word Remember also me.  Author: @Ashtray25, Path index: 42, Video name: MY_TESTAMENT_When_I_am_dead_bury_me_In_my_beloved_Ukraine_My_tomb_upon_a_grave_mound_high_Amid_the_s

./pikavideos/20230825-180853/bear/gif_MY_TESTAMENT_When_I_am_dead_bury_me_In_my_beloved_Ukraine_My_tomb_upon_a_grave_mound_high_Amid_the_seed3133089823467916576.gif saved

finished
Directory created: ./pikavideos/20230825-180853/cat/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 21 writed 23, obj_num 1

2023-08-25 18:10:11 Det num: 6, Positive num: 4, Acc: 0.6666666666666666, Step obj-gen score: 0.0, Whole obj-gen score: 0.16666666666666666, Vid num: 6, Prompt:  Prompt: Cinematic Composition of a cat walking in a space  Author: @mksparks, Path index: 49, Video name: Prompt__Cinematic_Composition_of_a_cat_walking_in_a_space_seed7257964282371214865
2023-08-25 18:10:11 Det num: 6, Positive num: 4, Acc: 0.6666666666666666, Step obj-gen score: 0.0, Whole obj-gen score: 0.16666666666666666, Vid num: 6, Prompt:  Prompt: Cinematic Composition of a cat walking in a space  Author: @mksparks, Path index: 49, Video name: Prompt__Cinematic_Composition_of_a_cat_walking_in_a_space_seed7257964282371214865


frame 23 writed
./pikavideos/20230825-180853/cat/mask_Prompt__Cinematic_Composition_of_a_cat_walking_in_a_space_seed7257964282371214865.mp4 saved
./pikavideos/20230825-180853/cat/gif_Prompt__Cinematic_Composition_of_a_cat_walking_in_a_space_seed7257964282371214865.gif saved

finished
Directory created: ./pikavideos/20230825-180853/toilet/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 0
./pikavideos/20230825-180853/toilet/mask_war_scene_with_toilets_and_mutatant_camera_head_people_with_explosions_and_broken_toilets_on_fire_wi_seed9022266552524930572.mp4 saved


2023-08-25 18:10:21 Det num: 7, Positive num: 4, Acc: 0.5714285714285714, Step obj-gen score: 0.0, Whole obj-gen score: 0.14285714285714285, Vid num: 7, Prompt:  war scene with toilets and mutatant camera head people with explosions and broken toilets on fire with explosions in the background and giant flying cameras  Author: @astralmoonveil, Path index: 57, Video name: war_scene_with_toilets_and_mutatant_camera_head_people_with_explosions_and_broken_toilets_on_fire_wi_seed9022266552524930572
2023-08-25 18:10:21 Det num: 7, Positive num: 4, Acc: 0.5714285714285714, Step obj-gen score: 0.0, Whole obj-gen score: 0.14285714285714285, Vid num: 7, Prompt:  war scene with toilets and mutatant camera head people with explosions and broken toilets on fire with explosions in the background and giant flying cameras  Author: @astralmoonveil, Path index: 57, Video name: war_scene_with_toilets_and_mutatant_camera_head_people_with_explosions_and_broken_toilets_on_fire_wi_seed9022266552524930572


./pikavideos/20230825-180853/toilet/gif_war_scene_with_toilets_and_mutatant_camera_head_people_with_explosions_and_broken_toilets_on_fire_wi_seed9022266552524930572.gif saved

finished
Directory already exists: ./pikavideos/20230825-180853/dog/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 21 writed 23, obj_num 2

2023-08-25 18:10:31 Det num: 8, Positive num: 5, Acc: 0.625, Step obj-gen score: 0.0, Whole obj-gen score: 0.125, Vid num: 8, Prompt: @Robi A Swiss Mountain Dog sits gracefully in front of the Grand Canyon. The sun shines brightly, and a gentle breeze 4K -ar 16:9  Author: @Robi@Kali, Path index: 64, Video name: A_Swiss_Mountain_Dog_sits_gracefully_in_front_of_the_Grand_Canyon._The_sun_shines_brightly_and_a_ge_seed8816516681728766960
2023-08-25 18:10:31 Det num: 8, Positive num: 5, Acc: 0.625, Step obj-gen score: 0.0, Whole obj-gen score: 0.125, Vid num: 8, Prompt: @Robi A Swiss Mountain Dog sits gracefully in front of the Grand Canyon. The sun shines brightly, and a gentle breeze 4K -ar 16:9  Author: @Robi@Kali, Path index: 64, Video name: A_Swiss_Mountain_Dog_sits_gracefully_in_front_of_the_Grand_Canyon._The_sun_shines_brightly_and_a_ge_seed8816516681728766960


frame 23 writed
./pikavideos/20230825-180853/dog/mask_A_Swiss_Mountain_Dog_sits_gracefully_in_front_of_the_Grand_Canyon._The_sun_shines_brightly_and_a_ge_seed8816516681728766960.mp4 saved
./pikavideos/20230825-180853/dog/gif_A_Swiss_Mountain_Dog_sits_gracefully_in_front_of_the_Grand_Canyon._The_sun_shines_brightly_and_a_ge_seed8816516681728766960.gif saved

finished
Directory created: ./pikavideos/20230825-180853/cow/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 20 writed 23, obj_num 0

2023-08-25 18:10:41 Det num: 9, Positive num: 5, Acc: 0.5555555555555556, Step obj-gen score: 0.0, Whole obj-gen score: 0.1111111111111111, Vid num: 9, Prompt:  a video of President Bolsoaro in the midst of several cows, super realistic --ar 16:9  Author: @INTERNACIONA, Path index: 79, Video name: a_video_of_President_Bolsoaro_in_the_midst_of_several_cows_super_realistic_--ar_16_9
2023-08-25 18:10:41 Det num: 9, Positive num: 5, Acc: 0.5555555555555556, Step obj-gen score: 0.0, Whole obj-gen score: 0.1111111111111111, Vid num: 9, Prompt:  a video of President Bolsoaro in the midst of several cows, super realistic --ar 16:9  Author: @INTERNACIONA, Path index: 79, Video name: a_video_of_President_Bolsoaro_in_the_midst_of_several_cows_super_realistic_--ar_16_9


frame 23 writed
./pikavideos/20230825-180853/cow/mask_a_video_of_President_Bolsoaro_in_the_midst_of_several_cows_super_realistic_--ar_16_9.mp4 saved
./pikavideos/20230825-180853/cow/gif_a_video_of_President_Bolsoaro_in_the_midst_of_several_cows_super_realistic_--ar_16_9.gif saved

finished
Directory already exists: ./pikavideos/20230825-180853/dog/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 1
./pikavideos/20230825-180853/dog/mask_a_white_Canaan_dog_flying_with_a_red_cape_seed4449926014351470217.mp4 saved


2023-08-25 18:10:51 Det num: 10, Positive num: 6, Acc: 0.6, Step obj-gen score: 0.0, Whole obj-gen score: 0.1, Vid num: 10, Prompt:  a white Canaan dog flying with a red cape  Author: @poeticoncept, Path index: 86, Video name: a_white_Canaan_dog_flying_with_a_red_cape_seed4449926014351470217
2023-08-25 18:10:51 Det num: 10, Positive num: 6, Acc: 0.6, Step obj-gen score: 0.0, Whole obj-gen score: 0.1, Vid num: 10, Prompt:  a white Canaan dog flying with a red cape  Author: @poeticoncept, Path index: 86, Video name: a_white_Canaan_dog_flying_with_a_red_cape_seed4449926014351470217


./pikavideos/20230825-180853/dog/gif_a_white_Canaan_dog_flying_with_a_red_cape_seed4449926014351470217.gif saved

finished
Directory created: ./pikavideos/20230825-180853/cake/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 1
./pikavideos/20230825-180853/cake/mask_cinematic_black_and_white_noir_close_up_film_of_frog_chefs_throwing_cake_at_each_other_exploding_ca_seed17018909320981396447.mp4 saved


2023-08-25 18:11:01 Det num: 11, Positive num: 7, Acc: 0.6363636363636364, Step obj-gen score: 0.0, Whole obj-gen score: 0.09090909090909091, Vid num: 11, Prompt:  cinematic black and white noir close up film of frog chefs throwing cake at each other, exploding cakes, surreal, weird, twisted, retro, 8k  Author: @plannetina, Path index: 87, Video name: cinematic_black_and_white_noir_close_up_film_of_frog_chefs_throwing_cake_at_each_other_exploding_ca_seed17018909320981396447
2023-08-25 18:11:01 Det num: 11, Positive num: 7, Acc: 0.6363636363636364, Step obj-gen score: 0.0, Whole obj-gen score: 0.09090909090909091, Vid num: 11, Prompt:  cinematic black and white noir close up film of frog chefs throwing cake at each other, exploding cakes, surreal, weird, twisted, retro, 8k  Author: @plannetina, Path index: 87, Video name: cinematic_black_and_white_noir_close_up_film_of_frog_chefs_throwing_cake_at_each_other_exploding_ca_seed17018909320981396447


./pikavideos/20230825-180853/cake/gif_cinematic_black_and_white_noir_close_up_film_of_frog_chefs_throwing_cake_at_each_other_exploding_ca_seed17018909320981396447.gif saved

finished
Directory already exists: ./pikavideos/20230825-180853/bear/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 0
./pikavideos/20230825-180853/bear/mask_bear_breakdancing_in_the_trading_room_in_front_of_the_screens_depicting_candlestick_charts_red_going.mp4 saved


2023-08-25 18:11:11 Det num: 12, Positive num: 7, Acc: 0.5833333333333334, Step obj-gen score: 0.0, Whole obj-gen score: 0.08333333333333333, Vid num: 12, Prompt:  bear breakdancing in the trading room in front of the screens depicting candlestick charts red going down -neg cartoon, green -sg 99  Author: @MickeyMoloch, Path index: 103, Video name: bear_breakdancing_in_the_trading_room_in_front_of_the_screens_depicting_candlestick_charts_red_going
2023-08-25 18:11:11 Det num: 12, Positive num: 7, Acc: 0.5833333333333334, Step obj-gen score: 0.0, Whole obj-gen score: 0.08333333333333333, Vid num: 12, Prompt:  bear breakdancing in the trading room in front of the screens depicting candlestick charts red going down -neg cartoon, green -sg 99  Author: @MickeyMoloch, Path index: 103, Video name: bear_breakdancing_in_the_trading_room_in_front_of_the_screens_depicting_candlestick_charts_red_going


./pikavideos/20230825-180853/bear/gif_bear_breakdancing_in_the_trading_room_in_front_of_the_screens_depicting_candlestick_charts_red_going.gif saved

finished
Directory created: ./pikavideos/20230825-180853/book/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 0
./pikavideos/20230825-180853/book/mask_A_student_sitting_at_a_desk_with_a_book_showing_a_determined_expression_while_studying.hype_realist_seed10023452760357934235.mp4 saved


2023-08-25 18:11:21 Det num: 13, Positive num: 7, Acc: 0.5384615384615384, Step obj-gen score: 0.0, Whole obj-gen score: 0.07692307692307693, Vid num: 13, Prompt:  A student sitting at a desk with a book, showing a determined expression while studying.hype realistic, 8k  Author: @Raihan, Path index: 104, Video name: A_student_sitting_at_a_desk_with_a_book_showing_a_determined_expression_while_studying.hype_realist_seed10023452760357934235
2023-08-25 18:11:21 Det num: 13, Positive num: 7, Acc: 0.5384615384615384, Step obj-gen score: 0.0, Whole obj-gen score: 0.07692307692307693, Vid num: 13, Prompt:  A student sitting at a desk with a book, showing a determined expression while studying.hype realistic, 8k  Author: @Raihan, Path index: 104, Video name: A_student_sitting_at_a_desk_with_a_book_showing_a_determined_expression_while_studying.hype_realist_seed10023452760357934235


./pikavideos/20230825-180853/book/gif_A_student_sitting_at_a_desk_with_a_book_showing_a_determined_expression_while_studying.hype_realist_seed10023452760357934235.gif saved

finished
Directory created: ./pikavideos/20230825-180853/boat/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 21 writed 23, obj_num 1

2023-08-25 18:11:31 Det num: 14, Positive num: 8, Acc: 0.5714285714285714, Step obj-gen score: 0.0, Whole obj-gen score: 0.07142857142857142, Vid num: 14, Prompt:  huge waves crash the boat, 4k, storm, epic, cinematic  Author: @Mike, Path index: 113, Video name: huge_waves_crash_the_boat_4k_storm_epic_cinematic_seed1568166036430126724
2023-08-25 18:11:31 Det num: 14, Positive num: 8, Acc: 0.5714285714285714, Step obj-gen score: 0.0, Whole obj-gen score: 0.07142857142857142, Vid num: 14, Prompt:  huge waves crash the boat, 4k, storm, epic, cinematic  Author: @Mike, Path index: 113, Video name: huge_waves_crash_the_boat_4k_storm_epic_cinematic_seed1568166036430126724


frame 23 writed
./pikavideos/20230825-180853/boat/mask_huge_waves_crash_the_boat_4k_storm_epic_cinematic_seed1568166036430126724.mp4 saved
./pikavideos/20230825-180853/boat/gif_huge_waves_crash_the_boat_4k_storm_epic_cinematic_seed1568166036430126724.gif saved

finished
Directory created: ./pikavideos/20230825-180853/car/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 2
./pikavideos/20230825-180853/car/mask_super_cars_and_superbikes_racing_in_konkan_slow_motion_hyper_realistic_4k_resolution_high_octane_seed4192569648458836432.mp4 saved


2023-08-25 18:11:42 Det num: 15, Positive num: 9, Acc: 0.6, Step obj-gen score: 0.9166666666666664, Whole obj-gen score: 0.005555555555555573, Vid num: 15, Prompt:  super cars and superbikes racing in konkan, slow motion, hyper realistic, 4k resolution, high octane render, golden hour lighting, intelligent render, beyond human perspective, dynamic shot, blade runner 2049 style  Author: @rohitmokashi90, Path index: 121, Video name: super_cars_and_superbikes_racing_in_konkan_slow_motion_hyper_realistic_4k_resolution_high_octane_seed4192569648458836432
2023-08-25 18:11:42 Det num: 15, Positive num: 9, Acc: 0.6, Step obj-gen score: 0.9166666666666664, Whole obj-gen score: 0.005555555555555573, Vid num: 15, Prompt:  super cars and superbikes racing in konkan, slow motion, hyper realistic, 4k resolution, high octane render, golden hour lighting, intelligent render, beyond human perspective, dynamic shot, blade runner 2049 style  Author: @rohitmokashi90, Path index: 121, Video name: super_car

./pikavideos/20230825-180853/car/gif_super_cars_and_superbikes_racing_in_konkan_slow_motion_hyper_realistic_4k_resolution_high_octane_seed4192569648458836432.gif saved

finished
Directory already exists: ./pikavideos/20230825-180853/elephant/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 2
./pikavideos/20230825-180853/elephant/mask_African_Elephant_nature_photography_expansive_awe-inspiring_breathtaking_vivid_colors_dramatic__seed6798599620705850248.mp4 saved


2023-08-25 18:11:52 Det num: 16, Positive num: 10, Acc: 0.625, Step obj-gen score: 0.0, Whole obj-gen score: 0.0052083333333333495, Vid num: 16, Prompt:  African Elephant nature photography, expansive, awe-inspiring, breathtaking, vivid colors, dramatic lighting, wide-angle, sharp focus, good exposure, golden hour -ar 16:9  Author: @efadiman, Path index: 127, Video name: African_Elephant_nature_photography_expansive_awe-inspiring_breathtaking_vivid_colors_dramatic__seed6798599620705850248
2023-08-25 18:11:52 Det num: 16, Positive num: 10, Acc: 0.625, Step obj-gen score: 0.0, Whole obj-gen score: 0.0052083333333333495, Vid num: 16, Prompt:  African Elephant nature photography, expansive, awe-inspiring, breathtaking, vivid colors, dramatic lighting, wide-angle, sharp focus, good exposure, golden hour -ar 16:9  Author: @efadiman, Path index: 127, Video name: African_Elephant_nature_photography_expansive_awe-inspiring_breathtaking_vivid_colors_dramatic__seed6798599620705850248


./pikavideos/20230825-180853/elephant/gif_African_Elephant_nature_photography_expansive_awe-inspiring_breathtaking_vivid_colors_dramatic__seed6798599620705850248.gif saved

finished
Directory already exists: ./pikavideos/20230825-180853/horse/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 21 writed 23, obj_num 1

2023-08-25 18:12:02 Det num: 17, Positive num: 11, Acc: 0.6470588235294118, Step obj-gen score: 0.0, Whole obj-gen score: 0.004901960784313741, Vid num: 17, Prompt:  two cowboys riding horses near a farm during American Civil War  Author: @fahadqureashi, Path index: 136, Video name: two_cowboys_riding_horses_near_a_farm_during_American_Civil_War_seed5697136767728898480
2023-08-25 18:12:02 Det num: 17, Positive num: 11, Acc: 0.6470588235294118, Step obj-gen score: 0.0, Whole obj-gen score: 0.004901960784313741, Vid num: 17, Prompt:  two cowboys riding horses near a farm during American Civil War  Author: @fahadqureashi, Path index: 136, Video name: two_cowboys_riding_horses_near_a_farm_during_American_Civil_War_seed5697136767728898480


frame 23 writed
./pikavideos/20230825-180853/horse/mask_two_cowboys_riding_horses_near_a_farm_during_American_Civil_War_seed5697136767728898480.mp4 saved
./pikavideos/20230825-180853/horse/gif_two_cowboys_riding_horses_near_a_farm_during_American_Civil_War_seed5697136767728898480.gif saved

finished
Directory already exists: ./pikavideos/20230825-180853/toilet/masks
final text_encoder_type: bert-base-uncased
Model loaded from ./ckpt/groundingdino_swint_ogc.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])
SegTracker has been initialized
frame 23 writed 23, obj_num 0
./pikavideos/20230825-180853/toilet/mask_skibidi_toilet_dancing_seed16561181095301632481.mp4 saved
./pikavideos/20230825-180853/toilet/gif_skibidi_toilet_dancing_seed16561181095301632481.gif saved

finished


2023-08-25 18:12:12 Det num: 18, Positive num: 11, Acc: 0.6111111111111112, Step obj-gen score: 0.0, Whole obj-gen score: 0.004629629629629644, Vid num: 18, Prompt:  skibidi toilet dancing  Author: @crew, Path index: 137, Video name: skibidi_toilet_dancing_seed16561181095301632481
2023-08-25 18:12:12 Det num: 18, Positive num: 11, Acc: 0.6111111111111112, Step obj-gen score: 0.0, Whole obj-gen score: 0.004629629629629644, Vid num: 18, Prompt:  skibidi toilet dancing  Author: @crew, Path index: 137, Video name: skibidi_toilet_dancing_seed16561181095301632481


Directory already exists: ./pikavideos/20230825-180853/bird/masks
final text_encoder_type: bert-base-uncased


In [None]:
# object_metric = pos_num/det_num
# # for i in range(len(obj_num)):
# #     object_count += abs(det_count[i] - obj_num[i])/(et_count[i] + obj_num[i]) 
# # object_count_metric = 1 - (object_count/len(obj_num))  
# # end_time = time.time()  # Stop the timer
# # elapsed_time = end_time - start_time
# object_metric

np.sum(np.abs(det_count_frames - det_count_avg) / max(np.max(det_count_frames - det_count_avg), 0.001)) / len(det_count_frames)


### Set parameters for input and output

In [None]:
task = 'fulljourney_videos'
# task = 'pikavideos'
video_name = "generated_video_ec706b3b-84c5-4169-b65d-e72efadf754f"
io_args = {
    'input_video': f'/apdcephfs_cq3/share_1290939/shadowcun/evals_dev/{task}/samples/{video_name}.mp4',
    'output_mask_dir': f'./{task}/masks/{video_name}', # save pred masks
    'output_video': f'./{task}/{video_name}.mp4', # mask+frame vizualization, mp4 or avi, else the same as input video
    'output_gif': f'./{task}/{video_name}.gif', # mask visualization
}
path = create_directories(io_args['output_mask_dir'])

### Tuning Grounding-DINO and SAM on the First Frame for Good Initialization

In [None]:
# choose good parameters in sam_args based on the first frame segmentation result
# other arguments can be modified in model_args.py
# note the object number limit is 255 by default, which requires < 10GB GPU memory with amp
sam_args['generator_args'] = {
        'points_per_side': 30,
        'pred_iou_thresh': 0.8,
        'stability_score_thresh': 0.9,
        'crop_n_layers': 1,
        'crop_n_points_downscale_factor': 2,
        'min_mask_region_area': 200,
    }

# Set Text args
'''
parameter:
    grounding_caption: Text prompt to detect objects in key-frames
    box_threshold: threshold for box 
    text_threshold: threshold for label(text)
    box_size_threshold: If the size ratio between the box and the frame is larger than the box_size_threshold, the box will be ignored. This is used to filter out large boxes.
    reset_image: reset the image embeddings for SAM
'''
# grounding_caption = "car.suv"
grounding_caption = "tree" #must have this class in the image, otherwise go wrong
# grounding_caption = "Obama" 
box_threshold, text_threshold, box_size_threshold, reset_image = 0.6, 0.5, 0.5, True 

cap = cv2.VideoCapture(io_args['input_video'])
frame_idx = 0
segtracker = SegTracker(segtracker_args,sam_args,aot_args)
segtracker.restart_tracker()
with torch.cuda.amp.autocast():
    while cap.isOpened():
        ret, frame = cap.read()
        frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
        count, pred_mask, annotated_frame = segtracker.detect_and_seg(frame, grounding_caption, box_threshold, 
                                                               text_threshold, box_size_threshold) # return a flag, whether detected or not
        torch.cuda.empty_cache()
        obj_ids = np.unique(pred_mask)
        obj_ids = obj_ids[obj_ids!=0]
        print("processed frame {}, obj_num {}".format(frame_idx,len(obj_ids)),end='\n')
        break
    cap.release()
    
    init_res = draw_mask(annotated_frame, pred_mask,id_countour=False)
    plt.figure(figsize=(10,10))
    plt.axis('off')
    plt.imshow(init_res)
    plt.show()
    plt.figure(figsize=(10,10))
    plt.axis('off')
    plt.imshow(colorize_mask(pred_mask))
    plt.show()

    del segtracker
    torch.cuda.empty_cache()
    gc.collect()

### Generate Results for the Whole Video

In [None]:
# For every sam_gap frames, we use SAM to find new objects and add them for tracking
# larger sam_gap is faster but may not spot new objects in time
segtracker_args = {
    'sam_gap': 49, # the interval to run sam to segment new objects
    'min_area': 200, # minimal mask area to add a new mask as a new object
    'max_obj_num': 255, # maximal object number to track in a video
    'min_new_obj_iou': 0.8, # the area of a new object in the background should > 80% 
}

# source video to segment
cap = cv2.VideoCapture(io_args['input_video'])
fps = cap.get(cv2.CAP_PROP_FPS)
# output masks
output_dir = io_args['output_mask_dir']
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
pred_list = []
masked_pred_list = []

torch.cuda.empty_cache()
gc.collect()
sam_gap = segtracker_args['sam_gap']
frame_idx = 0
segtracker = SegTracker(segtracker_args, sam_args, aot_args)
segtracker.restart_tracker()

with torch.cuda.amp.autocast():
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
        if frame_idx == 0:
            pred_mask, _ = segtracker.detect_and_seg(frame, grounding_caption, box_threshold, text_threshold, box_size_threshold, reset_image)
            # pred_mask = cv2.imread('./debug/first_frame_mask.png', 0)
            torch. cuda.empty_cache()
            gc.collect()
            segtracker.add_reference(frame, pred_mask)
        elif (frame_idx % sam_gap) == 0:
            seg_mask, _ = segtracker.detect_and_seg(frame, grounding_caption, box_threshold, text_threshold, box_size_threshold, reset_image)
            save_prediction(seg_mask, './debug/seg_result', str(frame_idx)+'.png')
            torch.cuda.empty_cache()
            gc.collect()
            track_mask = segtracker.track(frame)
            save_prediction(track_mask, './debug/aot_result', str(frame_idx)+'.png')
            # find new objects, and update tracker with new objects
            new_obj_mask = segtracker.find_new_objs(track_mask, seg_mask)
            if np.sum(new_obj_mask > 0) >  frame.shape[0] * frame.shape[1] * 0.4:
                new_obj_mask = np.zeros_like(new_obj_mask)
            save_prediction(new_obj_mask,output_dir,str(frame_idx)+'_new.png')
            pred_mask = track_mask + new_obj_mask
            # segtracker.restart_tracker()
            segtracker.add_reference(frame, pred_mask)
        else:
            pred_mask = segtracker.track(frame,update_memory=True)
        torch.cuda.empty_cache()
        gc.collect()
        
        save_prediction(pred_mask,output_dir,str(frame_idx)+'.png')
        # masked_frame = draw_mask(frame,pred_mask)
        # masked_pred_list.append(masked_frame)
        # plt.imshow(masked_frame)
        # plt.show() 
        
        pred_list.append(pred_mask)
        print("processed frame {}, obj_num {}".format(frame_idx,segtracker.get_obj_num()),end='\r')
        frame_idx += 1
    cap.release()
    print('\nfinished')

### Save results for visualization

In [None]:
# draw pred mask on frame and save as a video
cap = cv2.VideoCapture(io_args['input_video'])
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

if io_args['input_video'][-3:]=='mp4':
    fourcc =  cv2.VideoWriter_fourcc(*"mp4v")
elif io_args['input_video'][-3:] == 'avi':
    fourcc =  cv2.VideoWriter_fourcc(*"MJPG")
    # fourcc = cv2.VideoWriter_fourcc(*"XVID")
else:
    fourcc = int(cap.get(cv2.CAP_PROP_FOURCC))
out = cv2.VideoWriter(io_args['output_video'], fourcc, fps, (width, height))

frame_idx = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
    pred_mask = pred_list[frame_idx]
    masked_frame = draw_mask(frame,pred_mask)
    # masked_frame = masked_pred_list[frame_idx]
    masked_frame = cv2.cvtColor(masked_frame,cv2.COLOR_RGB2BGR)
    out.write(masked_frame)
    print('frame {} writed'.format(frame_idx),end='\r')
    frame_idx += 1
out.release()
cap.release()
print("\n{} saved".format(io_args['output_video']))
print('\nfinished')

In [None]:
# save colorized masks as a gif
duration = 1000 / fps
imageio.mimsave(io_args['output_gif'],pred_list,duration=duration)
print("{} saved".format(io_args['output_gif']))

In [None]:
# manually release memory (after cuda out of memory)
del segtracker
torch.cuda.empty_cache()
gc.collect()