In [None]:
!pip install opencv_python

In [None]:
from transformers import CLIPProcessor, CLIPModel

# Hugging Face repo to load the CLIP weights and preprocessor from.
# Alternative checkpoints are kept here commented out; enable exactly one.
#repo_id = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
#repo_id = "laion/CLIP-ViT-g-14-laion2B-s12B-b42K" # more modern CLIP model
repo_id = "openai/clip-vit-large-patch14-336" # the CLIP model used for SD up to v1.5
# Target device for the model; hard-coded, so this cell requires a CUDA-capable GPU.
device = 'cuda'

# `model` and `processor` are module-level names used by later cells.
print("loading model...")
model = CLIPModel.from_pretrained(repo_id)
print("loading preprocessor...")
processor = CLIPProcessor.from_pretrained(repo_id)

print(f"sending to {device}...")
# Convert to half precision and move to `device`. The return value is
# discarded, so this relies on both calls updating `model` itself.
model.half().to(device)
print("done")


In [None]:
from typing import Optional

import PIL
import PIL.Image
import torch
import torchvision.transforms as transforms

def resize_image(fullsize_image: PIL.Image.Image, min_edge_length: int, max_edge_length: Optional[int]=None) -> PIL.Image.Image:
    """Resize an image with torchvision's `Resize` transform.

    Args:
        fullsize_image: The image to resize.
        min_edge_length: Passed to `Resize` as `size`. With an integer size,
            torchvision scales the shorter edge to this length and keeps the
            aspect ratio.
        max_edge_length: Passed to `Resize` as `max_size`, which bounds the
            longer edge of the result. `None` (the default) applies no bound.

    Returns:
        The resized image produced by the transform.
    """
    transform = transforms.Resize(size=min_edge_length, max_size=max_edge_length)
    return transform(fullsize_image)



def get_clip_image_features(image: PIL.Image.Image) -> torch.Tensor:
    """Compute CLIP image features for one image.

    Uses the notebook-level `processor` to preprocess the image and the
    notebook-level `model` to embed it.

    Args:
        image: The image to embed; handed to the processor as `images=`.

    Returns:
        The result of `model.get_image_features` for the preprocessed image.
        It is computed under `torch.no_grad()`, so it carries no autograd
        history.
    """
    preprocess_results = processor(text=None,
                                   images=image,
                                   return_tensors="pt",
                                   padding=True,
                                   device=device
                                  )
    # Follow the model's own device and dtype instead of hard-coding half
    # precision, so the input always matches the weights it is multiplied with.
    pixel_values = preprocess_results.pixel_values.to(device=model.device, dtype=model.dtype)
    # Inference only: skip autograd bookkeeping so no graph is built (and kept
    # alive in GPU memory) for every processed frame.
    with torch.no_grad():
        image_features = model.get_image_features(pixel_values=pixel_values)
    return image_features



In [None]:

import cv2
import numpy as np
from async_video_processor import AsyncVideoProcessor


# Per-frame results for the video currently being processed, keyed by frame
# index. Filled by `results_func`; `write_clip_features` rebinds this name to a
# fresh dict before each video.
accumulated_results = {}
# NOTE(review): not referenced by any other code visible in this notebook —
# confirm it is still needed.
out_data = {}

def process_func(frame_cv_bgr):
    """Turn one video frame into a CPU tensor of CLIP image features.

    The frame is converted from BGR to RGB channel order, wrapped in a PIL
    image, resized with `resize_image` (min edge 512) and embedded with
    `get_clip_image_features`.

    Args:
        frame_cv_bgr: The frame to embed; presumably a BGR image array as
            produced by OpenCV — confirm against the caller.

    Returns:
        The feature tensor, detached from autograd and moved to the CPU.
    """
    rgb_pixels = cv2.cvtColor(np.array(frame_cv_bgr), cv2.COLOR_BGR2RGB)
    resized_frame = resize_image(PIL.Image.fromarray(rgb_pixels), min_edge_length=512)
    clip_embedding = get_clip_image_features(resized_frame)
    return clip_embedding.detach().cpu()

def results_func(frame_index, data):
    """Record the result for one frame in the module-level `accumulated_results`.

    An existing entry for the same `frame_index` is overwritten.

    Args:
        frame_index: Key under which the result is stored.
        data: The result to store for that frame.
    """
    accumulated_results.update({frame_index: data})



In [None]:
import time

def test_tqdm_manual(steps: int = 1000, delay: float = 0.01):
    """Smoke-test a manually advanced tqdm progress bar.

    Creates a bar with `steps` total ticks and advances it one tick at a time,
    sleeping `delay` seconds after each tick.

    Args:
        steps: Number of ticks to advance the bar by.
        delay: Seconds to sleep after each tick.
    """
    # Imported locally: the notebook-wide tqdm import lives in a later cell, so
    # on a fresh kernel the name is not defined yet when this cell runs.
    from tqdm.notebook import tqdm

    # `total=` is the right way to size a manually updated bar, and the context
    # manager closes the bar when the loop finishes.
    with tqdm(total=steps) as pbar:
        for _ in range(steps):
            pbar.update(1)
            time.sleep(delay)

test_tqdm_manual()

In [None]:
import pickle
import os
from tqdm.notebook import tqdm

async def write_clip_features(root_path):
    """Walk `root_path` and write a CLIP-features pickle next to every video.

    For each video file found, an `AsyncVideoProcessor` is run with
    `process_func` / `results_func`, and the accumulated per-frame results are
    pickled to `<video path>.clip-features.pickle`. Videos that already have
    such a pickle are skipped.

    Rebinds the module-level `accumulated_results` to a fresh dict before each
    video.

    Args:
        root_path: Directory to walk recursively with `os.walk`.
    """
    global accumulated_results
    # Loop-invariant, so defined once. Matching below is case-insensitive so
    # that e.g. ".MP4" files are picked up too.
    video_extensions = (".mp4",)
    print("walking", root_path)
    for directory, _, filenames in os.walk(root_path):
        print('directory:', directory)
        video_filenames = [f for f in filenames
                           if os.path.splitext(f)[1].lower() in video_extensions]
        if not video_filenames:
            continue
        for filename in tqdm(video_filenames, desc=directory):
            video_path = os.path.join(directory, filename)
            pickle_path = video_path + ".clip-features.pickle"
            if os.path.exists(pickle_path):
                print("not overwriting existing", pickle_path)
                continue

            accumulated_results = {}
            process_fps = 0.5
            first_frame_to_process = 0

            def write_results_func(video, partial:bool):
                """Pickle the accumulated results; calls with `partial` set are ignored."""
                if partial:
                    return
                # `video` presumably is a cv2.VideoCapture (it is queried with
                # a cv2 property id) — confirm against AsyncVideoProcessor.
                # `frame_increment` is assigned below, after the processor is
                # constructed; it is only read when this callback is invoked.
                outData = {
                    'type': 'clip features ' + repo_id,
                    'fps': video.get(cv2.CAP_PROP_FPS),
                    'features': accumulated_results,
                    'frameIncrement': frame_increment
                }

                # Write to a temporary file and rename it into place, so an
                # interrupted write cannot leave a truncated pickle that later
                # runs would skip as "existing".
                temp_path = pickle_path + ".tmp"
                with open(temp_path, 'wb') as handle:
                    pickle.dump(outData, handle, protocol=pickle.HIGHEST_PROTOCOL)
                os.replace(temp_path, pickle_path)
                print("cumulative detection count:",str(len(accumulated_results)))

            async_video_processor = AsyncVideoProcessor(video_path, process_func, results_func, write_results_func, first_frame_to_process, process_fps)
            frame_increment = async_video_processor.frameIncrement
            await async_video_processor.run()


In [None]:
# Process every video under ./videos (resolved relative to the kernel's working
# directory). Top-level `await` works here because the notebook kernel supports it.
await write_clip_features("./videos")