In [None]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pathlib
import os

In [None]:
from tqdm import tqdm
import supervision as sv
from autodistill_grounding_dino import GroundingDINO
from autodistill.detection import CaptionOntology
from autodistill_clip import CLIP

In [None]:
# Record a fixed-length clip from the camera at index 1 into ./video.mp4.
video_capture = cv2.VideoCapture(1)
if not video_capture.isOpened():
    raise RuntimeError("Could not open the camera at index 1")

# VideoCapture.get() returns floats, while VideoWriter needs an integer
# (width, height) tuple, so cast explicitly.
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_size = (frame_width, frame_height)
frame_rate = 15
total_frames = 900
frame_counter = 0
# "mp4v" is the FourCC that matches the .mp4 container; "XVID" belongs to .avi.
video_writer = cv2.VideoWriter(filename="./video.mp4", fourcc=cv2.VideoWriter_fourcc(*"mp4v"), fps=frame_rate,
                               frameSize=frame_size)

try:
    # Strict "<" so exactly `total_frames` frames are written.
    while frame_counter < total_frames:

        grabbed, frame = video_capture.read()
        if not grabbed:
            # The camera stopped delivering frames; never write a None frame.
            break
        video_writer.write(frame)
        frame_counter += 1
finally:
    # Releasing finalises the video file and frees the camera device,
    # even if the loop is interrupted.
    video_writer.release()
    video_capture.release()

In [None]:
# Where the recorded videos live, and where their extracted frames are written.
VIDEOS_DIR_PATH = "/home/aiml_task/auto-annotate"
FRAMES_DIR_PATH = "/home/aiml_task/auto-annotate/frames"

# Collect every .mov / .mp4 file found in the videos directory.
videos_path = sv.list_files_with_extensions(
    directory=VIDEOS_DIR_PATH,
    extensions=["mov", "mp4"],
)

In [None]:
# Dump every frame of each discovered video into FRAMES_DIR_PATH as PNG files
# named "<video file name>-<zero-padded counter>.png".
for video_path in tqdm(videos_path):

    # Path.name yields the final path component on any OS, unlike splitting on "/".
    video_name = pathlib.Path(video_path).name
    # The pattern is filled with a frame counter via "{:05d}", so literal braces
    # in the file name are escaped to keep them from being read as format fields.
    img_file_name_pattern = video_name.replace("{", "{{").replace("}", "}}") + "-{:05d}.png"

    with sv.ImageSink(target_dir_path=FRAMES_DIR_PATH, image_name_pattern=img_file_name_pattern) as sink:

        # stride=1 keeps every frame of the video.
        for image in sv.get_video_frames_generator(source_path=str(video_path), stride=1):
            sink.save_image(image=image)

In [None]:
# Labels this notebook is interested in.
object_classes_to_detect = ["hand"]

# Draws bounding boxes onto images for visual inspection.
bbox_annotator = sv.BoxAnnotator()

# Caption -> label mapping handed to the detector: the text on the left is the
# prompt, "hand" is the label attached to whatever it finds.
ontology = CaptionOntology(
    {"gestures made by fingers of hand ": "hand"}
)
base_model = GroundingDINO(ontology=ontology)

In [None]:
# Sanity-check the detector on one extracted frame and display the annotated result.
sample_frame_path = "./frames/video.mp4-00031.png"
model_detections = base_model.predict(sample_frame_path)

if len(model_detections) == 0:
    # Fail with an explicit message instead of an opaque IndexError below.
    raise ValueError(f"No detections found in {sample_frame_path}")

# Read the first box straight from `.xyxy` (x_min, y_min, x_max, y_max) rather than
# unpacking the per-detection tuple, whose length differs between supervision
# versions. Clip before the unsigned cast so slightly negative coordinates do not
# wrap around to huge values.
bbox_coords = np.clip(model_detections.xyxy[0], 0, np.iinfo(np.uint16).max).astype(np.uint16)

# cv2.imread returns a uint8 BGR image, which is what the annotator draws on and
# what sv.plot_image expects; plt.imread would return a float RGB image, giving
# swapped colours and wrongly scaled box colours.
annotated_img = bbox_annotator.annotate(scene=cv2.imread(sample_frame_path),
                                        detections=model_detections)

sv.plot_image(annotated_img)

In [None]:
# Show only the region of the sample frame inside the detected box.
# bbox_coords is indexed as [x_start, y_start, x_end, y_end]: elements 1 and 3
# bound the rows, elements 0 and 2 bound the columns.
sample_frame = plt.imread("./frames/video.mp4-00031.png")
row_span = slice(bbox_coords[1], bbox_coords[3])
col_span = slice(bbox_coords[0], bbox_coords[2])
plt.imshow(sample_frame[row_span, col_span])

In [None]:
# Run the detector on every extracted frame and save the crop of the first
# detected box to ./detections/<frame name>.png. Frames without detections are skipped.
detections_path = "./detections"
# exist_ok=True lets this cell be re-run without failing on the existing folder.
os.makedirs(detections_path, exist_ok=True)

for frame_path in pathlib.Path("./frames").glob("*.png"):

    model_detections = base_model.predict(str(frame_path))
    if len(model_detections) == 0:
        continue

    # Take the first box from `.xyxy` (x_min, y_min, x_max, y_max); clip at zero so a
    # slightly negative coordinate cannot turn into a wrong slice bound.
    bbox_coords = np.clip(model_detections.xyxy[0], 0, None).astype(int)
    x_min, y_min, x_max, y_max = bbox_coords
    if x_max <= x_min or y_max <= y_min:
        # Degenerate box: there is nothing to crop or save.
        continue

    # Read the frame once. The annotated image the original built here was never
    # used, so it is no longer computed.
    frame_img = plt.imread(str(frame_path))
    # Path.stem drops only the final ".png", e.g. "video.mp4-00031.png" -> "video.mp4-00031".
    plt.imsave(os.path.join(detections_path, frame_path.stem + ".png"),
               frame_img[y_min:y_max, x_min:x_max])

In [None]:
# Characters covered: the 26 lowercase letters followed by the 10 digits.
asl_characters = "abcdefghijklmnopqrstuvwxyz0123456789"

# Caption -> label table: one descriptive caption per character, mapped to the
# character itself.
ontology_dict = {
    f"alphabet {symbol} in american sign language": symbol
    for symbol in asl_characters
}

# Numeric class id (position in `asl_characters`) -> character.
class_id2char = dict(enumerate(asl_characters))