In [2]:
import os
import sys
import torch
import torch.backends.cudnn as cudnn
from os import path, mkdir
import logging
from torch.utils.tensorboard import SummaryWriter

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from feature_extractor import FeaturesWriter, get_features_loader
from utils.utils import register_logger
from utils.load_model import load_feature_extractor
from features_loader import FeaturesLoader
from network.TorchUtils import TorchModel
from network.anomaly_detector_model import AnomalyDetector, custom_objective, RegularizedLoss
from utils.callbacks import DefaultModelCallback, TensorBoardCallback

In [3]:
log_every = 50  # log the writing of clips every n steps
log_file = None  # set logging file
num_workers = 4  # define the number of workers used for loading the videos

cudnn.benchmark = True
register_logger(log_file=log_file)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
dataset_path = '/home/ubuntu/repos/llm-rag/data/Anomaly-Videos-Part-1/test'  # path to the video dataset
clip_length = 16  # define the length of each input sample
frame_interval = 1 # define the sampling interval between framesq
batch_size = 4


In [6]:
import clip
import numpy as np
from lavis.models import load_model_and_preprocess
from torchvision.transforms import ToPILImage
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
import base64
from io import BytesIO

class ClipEncoder:
    def __init__(self, dataset_path, clip_length, caption_model_type, frame_interval, batch_size, num_workers):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.dataset_path = dataset_path
        self.clip_length = clip_length
        self.frame_interval = frame_interval
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.chroma_client = chromadb.HttpClient(host='localhost', port=8000)
        self.data_loader, self.data_iter = get_features_loader(dataset_path,
                                                                    clip_length,
                                                                    frame_interval,
                                                                    batch_size,
                                                                    num_workers,
                                                                    "clip"
                                                                    )
        self.model, self.preprocess = clip.load("ViT-B/32", device=device)
        self.caption_model, self.vis_processors, _ = load_model_and_preprocess(name="blip_caption",
                                                                               model_type=caption_model_type,
                                                                               is_eval=True,
                                                                               device=device)
    def encode_image(self, idx):
        frame_tensor = self.data_loader[idx][0].permute(1, 0, 2, 3)
        with torch.no_grad():
            frame_embeddings = self.model.encode_image(frame_tensor.cuda())
        return frame_embeddings

    def get_all_image_embeddings(self):
        embeddings = []
        for idx in range(len(self.data_loader)):
            emb = self.encode_image(idx)
            embeddings.append(emb)
        return embeddings
    
    def export_tensor_to_np(self):
        arr = []
        for idx in range(len(self.data_loader)):
            frame_tensor = self.data_loader[idx][0].permute(1, 0, 2, 3)
            pil_image = ToPILImage()(frame_tensor[0]) 
            arr.append(np.array(pil_image))
        return arr
    
    def export_tensor_to_base64(self):
        arr = []
        for idx in range(len(self.data_loader)):
            frame_tensor = self.data_loader[idx][0].permute(1, 0, 2, 3)
            pil_image = ToPILImage()(frame_tensor[0])
            buffered = BytesIO()
            pil_image.save(buffered, format="JPEG")
            img_str = base64.b64encode(buffered.getvalue()).decode()
            arr.append(img_str)
        return arr
    
    def get_captions(self):
        captions_list = []  
        for idx in range(len(self.data_loader)):  
            frame_tensor = self.data_loader[idx][0].permute(1, 0, 2, 3)
            pil_image = ToPILImage()(frame_tensor[0]) 
            image = self.vis_processors["eval"](pil_image).unsqueeze(0).to(self.device)
            generated_captions = self.caption_model.generate({"image": image})  
            captions_list.append(generated_captions)
        return captions_list

    def get_all_caption_embeddings(self, captions_list):
        # Future improvements: Maybe multiple captions per image; Think about a way how to add anomalous captions / features
        caption_embeddings = []
        if captions_list:
            for caption_set in captions_list:
                if caption_set:
                    for caption in caption_set:
                        if caption and len(caption) > 0:
                            with torch.no_grad():
                                try:
                                    caption_features = clip.tokenize(caption).to(self.device)
                                    caption_embedding = self.model.encode_text(caption_features)
                                    caption_embeddings.append(caption_embedding)
                                except Exception as e:
                                    print(f"Error encoding text for caption: {caption}")
                                    print(f"Error details: {e}")
        return caption_embeddings
    
    def generate_document_ids(self):
        document_ids = []
        for i in range(len(self.data_loader)):
            item = self.data_loader.getitem_from_raw_video(idx=i)  
            for j in range(self.clip_length):
                document_ids.append(str(item[3] + '_' + str(item[1]) + '-' + str(j)))

        batched_ids = [document_ids[i:i+clip_length] for i in range(0, len(document_ids), clip_length)]
        
        return document_ids, batched_ids
    
    def get_or_create_chroma_collection(self, collection_name, embedding_function=None, data_loader=None):
        if embedding_function:
            try:
                collection = self.chroma_client.get_or_create_collection(name=collection_name, embedding_function=embedding_function, data_loader=data_loader)
                return collection
            except Exception as e:
                print(f"Error creating collection: {collection_name}")
                print(f"Error details: {e}")
        else:    
            try:
                collection = self.chroma_client.get_or_create_collection(collection_name)
                return collection
            except Exception as e:
                print(f"Error creating collection: {collection_name}")
                print(f"Error details: {e}")


    def upload_embeddings_to_chroma(self, collection_name, img_data, ids, multi_modal= False, captions=None, documents=None, metadata=None):
        if multi_modal:
            if not len(img_data) == len(ids):
                raise ValueError("data and ids must have the same length")
            
            embedding_function = OpenCLIPEmbeddingFunction("ViT-H-14","laion2b_s32b_b79k" )
            data_loader = ImageLoader()
            
            collection = self.get_or_create_chroma_collection(collection_name, embedding_function, data_loader)
            print("Multi Modal Collection created")

            for frame, id_ in zip(img_data, ids):
                try:
                    collection.add(images=frame[0], metadatas=metadata, ids=id_[0])
                    
                except Exception as e:
                    print(f"Failed to add document with ID {id_}: {str(e)}")         
        
        else:
            if not len(img_data) == len(ids):
                raise ValueError("data and ids must have the same length")

            collection = self.get_or_create_chroma_collection(collection_name)

            for emb, id_ in zip(img_data, ids):
                try:
                    collection.add(documents=documents, embeddings=emb, metadatas=metadata, ids=id_)
                except Exception as e:
                    print(f"Failed to add document with ID {id_}: {str(e)}")                
            

In [17]:
normal_videos = "/home/ubuntu/repos/llm-rag/data/normal-videos"
encoder = ClipEncoder(normal_videos, clip_length, 'base_coco', frame_interval, batch_size, num_workers)
np_arr = encoder.export_tensor_to_np()

document_ids = []
for i in range(len(encoder.data_loader)):
    item = encoder.data_loader.getitem_from_raw_video(idx=i)  
    for j in range(clip_length):
        document_ids.append(str(item[3] + '_' + str(item[1]) + '-' + str(j)))  

2024-03-28 16:46:59,543 Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-03-28 16:46:59,544 Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-03-28 16:46:59,552 Found 41 video files in /home/ubuntu/repos/llm-rag/data/normal-videos


100%|██████████| 3/3 [00:42<00:00, 14.26s/it]


2024-03-28 16:47:50,018 Missing keys []
2024-03-28 16:47:50,018 load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth




In [None]:
encoder = ClipEncoder(dataset_path, clip_length, 'base_coco', frame_interval, batch_size, num_workers)
base64_arr = encoder.export_tensor_to_base64()

document_ids = []
for i in range(len(encoder.data_loader)):
    item = encoder.data_loader.getitem_from_raw_video(idx=i)  
    for j in range(clip_length):
        document_ids.append(str(item[3] + '_' + str(item[1]) + '-' + str(j)))  





In [18]:
batched_ids = [document_ids[i:i+clip_length] for i in range(0, len(document_ids), clip_length)]


In [19]:
encoder.upload_embeddings_to_chroma("multi-modal-norm", np_arr, batched_ids, multi_modal=True)

2024-03-28 17:17:58,041 Loaded ViT-H-14 model config.
2024-03-28 17:18:06,277 Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
Multi Modal Collection created


In [25]:
# from matplotlib import pyplot as plt
collection = encoder.chroma_client.get_collection("multi-modal-norm", embedding_function=OpenCLIPEmbeddingFunction("ViT-H-14","laion2b_s32b_b79k"), data_loader=ImageLoader())

retrieved = collection.query(query_texts="Lockers at a highschool", include=['data', 'distances'], n_results=3)
print(retrieved)
# # TODO: Add a way to visualize the retrieved images

    
    
    

2024-03-28 18:49:35,547 Loaded ViT-H-14 model config.
2024-03-28 18:49:43,466 Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
{'ids': [['Normal_Videos058_x264_53-0', 'Normal_Videos058_x264_60-0', 'Normal_Videos058_x264_59-0']], 'distances': [[1.59778634273598, 1.59778634273598, 1.59778634273598]], 'embeddings': None, 'metadatas': None, 'documents': None, 'uris': [[None, None, None]], 'data': [[None, None, None]]}
