In [1]:
import os
import sys
import torch
import torch.backends.cudnn as cudnn
from os import path, mkdir
import logging
from torch.utils.tensorboard import SummaryWriter

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from feature_extractor import FeaturesWriter, get_features_loader
from utils.utils import register_logger
from utils.load_model import load_feature_extractor
from features_loader import FeaturesLoader
from network.TorchUtils import TorchModel
from network.anomaly_detector_model import AnomalyDetector, custom_objective, RegularizedLoss
from utils.callbacks import DefaultModelCallback, TensorBoardCallback

# Definitions

## Global definitions

In [2]:
log_every = 50  # log the writing of clips every n steps
log_file = None  # set logging file
num_workers = 4  # define the number of workers used for loading the videos

cudnn.benchmark = True
register_logger(log_file=log_file)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Definitions of features extraction

In [3]:
dataset_path = '/home/ubuntu/repos/llm-rag/data/Anomaly-Videos-Part-1/test'  # path to the video dataset
clip_length = 16  # define the length of each input sample
frame_interval = 1 # define the sampling interval between framesq
batch_size = 4


## Create model and dataset

In [4]:
data_loader, data_iter = get_features_loader(dataset_path,
                                            clip_length,
                                            frame_interval,
                                            batch_size,
                                            num_workers,
                                            "clip"
                                            )



2024-03-26 14:16:42,545 Found 2 video files in /home/ubuntu/repos/llm-rag/data/Anomaly-Videos-Part-1/test


  0%|          | 0/1 [00:00<?, ?it/s]

In [43]:
import clip 
from lavis.models import load_model_and_preprocess
from torchvision.transforms import ToPILImage
import chromadb

class ClipEncoder:
    def __init__(self, dataset_path, clip_length, caption_model_type, frame_interval, batch_size, num_workers):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.dataset_path = dataset_path
        self.clip_length = clip_length
        self.frame_interval = frame_interval
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.chroma_client = chromadb.HttpClient(host='localhost', port=8000)
        self.data_loader, self.data_iter = get_features_loader(dataset_path,
                                                                    clip_length,
                                                                    frame_interval,
                                                                    batch_size,
                                                                    num_workers,
                                                                    "clip"
                                                                    )
        self.model, self.preprocess = clip.load("ViT-B/32", device=device)
        self.caption_model, self.vis_processors, _ = load_model_and_preprocess(name="blip_caption",
                                                                               model_type=caption_model_type,
                                                                               is_eval=True,
                                                                               device=device)
    def encode_image(self, idx):
        frame_tensor = self.data_loader[idx][0].permute(1, 0, 2, 3)
        with torch.no_grad():
            frame_embeddings = self.model.encode_image(frame_tensor.cuda())
        return frame_embeddings

    def get_all_image_embeddings(self):
        embeddings = []
        for idx in range(len(self.data_loader)):
            emb = self.encode_image(idx)
            embeddings.append(emb)
        return embeddings
    
    def get_captions(self):
        captions_list = []  
        for idx in range(len(self.data_loader)):  
            frame_tensor = self.data_loader[idx][0].permute(1, 0, 2, 3)
            pil_image = ToPILImage()(frame_tensor[0]) 
            image = self.vis_processors["eval"](pil_image).unsqueeze(0).to(self.device)
            generated_captions = self.caption_model.generate({"image": image})  
            captions_list.append(generated_captions)
        return captions_list

    def get_all_caption_embeddings(self, captions_list):
        # Future improvements: Maybe multiple captions per image; Think about a way how to add anomalous captions / features
        caption_embeddings = []
        if captions_list:
            for caption_set in captions_list:
                if caption_set:
                    for caption in caption_set:
                        if caption and len(caption) > 0:
                            with torch.no_grad():
                                try:
                                    caption_features = clip.tokenize(caption).to(self.device)
                                    caption_embedding = self.model.encode_text(caption_features)
                                    caption_embeddings.append(caption_embedding)
                                except Exception as e:
                                    print(f"Error encoding text for caption: {caption}")
                                    print(f"Error details: {e}")
        return caption_embeddings
    
    def get_or_create_chroma_collection(self, collection_name):
        try:
            collection = self.chroma_client.get_or_create_collection(collection_name)
            return collection
        except Exception as e:
            print(f"Error creating collection: {collection_name}")
            print(f"Error details: {e}")

    def upload_embeddings_to_chroma(self, collection_name, embeddings, ids, documents=None, metadata=None):
        if not len(embeddings) == len(ids):
            raise ValueError("embeddings and ids must have the same length")

        collection = self.get_or_create_chroma_collection(collection_name)

        for emb, id_ in zip(embeddings, ids):
            #print(emb.shape)
            print(id_)
            try:
                collection.add(documents=documents, embeddings=emb, metadatas=metadata, ids=id_)
                break
            except Exception as e:
                print(f"Failed to add document with ID {id_}: {str(e)}")                
        

In [44]:
normal_videos = "/home/ubuntu/repos/llm-rag/data/normal-videos"
encoder = ClipEncoder(normal_videos, clip_length, 'base_coco', frame_interval, batch_size, num_workers)
#captions_list = encoder.get_captions()
image_embeddings = encoder.get_all_image_embeddings()
#captions = encoder.get_captions()
#caption_embeddings = encoder.get_all_caption_embeddings(captions_list)


document_ids = []
for i in range(len(encoder.data_loader)):
    item = encoder.data_loader.getitem_from_raw_video(idx=i)  
    for j in range(clip_length):
        document_ids.append(str(item[3] + '_' + str(item[1]) + '-' + str(j)))  

2024-03-26 12:34:23,991 Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-03-26 12:34:23,993 Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-03-26 12:34:24,004 Found 1 video files in /home/ubuntu/repos/llm-rag/data/normal-videos


100%|██████████| 1/1 [00:00<00:00,  5.82it/s]


2024-03-26 12:34:33,729 Missing keys []
2024-03-26 12:34:33,730 load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth


In [45]:
import numpy as np

document_ids = np.array(document_ids)
batched_ids = np.array_split(document_ids, len(document_ids) // clip_length)

emb_list = [emb.tolist() for emb in image_embeddings]
encoder.upload_embeddings_to_chroma("normal_videos", emb_list, batched_ids)

Normal_Videos001_x264_0
Failed to add document with ID Normal_Videos001_x264_0: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_1
Failed to add document with ID Normal_Videos001_x264_1: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_2
Failed to add document with ID Normal_Videos001_x264_2: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_3
Failed to add document with ID Normal_Videos001_x264_3: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_4
Failed to add document with ID Normal_Videos001_x264_4: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_5
Failed to add document with ID Normal_Videos001_x264_5: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_6
Failed to add document with ID Normal_Videos001_x264_6: Number of embeddings 16 must match number of ids 1
Normal_Videos001_x264_7
Failed to add document with ID Normal_Videos001_x264_7: Num

In [28]:
print(len(emb_list[0][0]))

512
