WORKFLOW PROCESS:

.Import the libraries

.Pre-process image path and video path and remove duplicates

.Detect vehicles in the video frame and images using yolov5

.Visualize and crop  detected images and videos

.Embeddings detection with resnet, post-processing and similarity computation



In [1]:
import os
import cv2
import numpy as np
import torch
from yolov5 import YOLOv5
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from tqdm import tqdm
import hashlib
import torchvision.transforms as transforms
import logging


PRE-PROCESS IMAGE PATH AND VIDEO PATH

The Images are the reference to be searched across the cctv videos

While the video is the cctv video to detect where the images(reference) are found across multiple video clips

In [2]:
# Utility function for hashing (to remove duplicates)
def get_hash(image_array):
    return hashlib.md5(image_array.tobytes()).hexdigest()

# Preprocess Images
def preprocess_images(folder):
    processed_images = {}
    for file in tqdm(os.listdir(folder), desc="Preprocessing images"):
        file_path = os.path.join(folder, file)
        if os.path.isfile(file_path) and file.lower().endswith(('.jpg', '.png', '.jpeg')):
            img = cv2.imread(file_path)
            img = cv2.resize(img, (640, 640))  # Resize to YOLOv5 input size
            processed_images[get_hash(img)] = img  # Remove duplicates
    return processed_images

# Process Videos to Extract Frames
def process_videos(folder, frame_interval=30):
    video_frames = {}
    for file in tqdm(os.listdir(folder), desc="Processing videos"):
        file_path = os.path.join(folder, file)
        if os.path.isfile(file_path) and file.lower().endswith(('.mp4', '.avi', '.mkv')):
            cap = cv2.VideoCapture(file_path)
            count = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if count % frame_interval == 0:  # Extract frame at intervals
                    frame = cv2.resize(frame, (640, 640))  # Resize
                    video_frames[get_hash(frame)] = frame
                count += 1
            cap.release()
    return video_frames


OBJECT DETECTION WITH YOLOv5

.Detect objects in the visualized amd croped images and videos

.Visualize and crop detections



In [None]:
def visualize_and_crop_detections(image, detections, save_cropped_folder=None, save_annotated_path=None): # Annotates images and videos with bounding box in the detection, then crop them out to focus on the object and eliminates noise/unwanted elements
    """
    Annotate an image with bounding boxes and labels, and optionally crop detected objects.

    Args:
        image: Input image to process.
        detections: Detection results from the YOLO model.
        save_cropped_folder (str): Path to save cropped images. If None, crops are not saved.
        save_annotated_path (str): Path to save annotated image. If None, the annotated image is not saved.

    Returns:
        annotated_image: Image with bounding boxes and labels drawn.
        cropped_images: List of cropped images corresponding to each detection.
    """
    cropped_images = []
    for idx, det in enumerate(detections.xyxy[0]):
        x1, y1, x2, y2, conf, cls = map(int, det[:6])
        label = f"{detections.names[cls]} {conf:.2f}"
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        if save_cropped_folder:
            cropped = image[y1:y2, x1:x2]
            cropped_image_path = os.path.join(save_cropped_folder, f"crop_{idx}.jpg")
            cv2.imwrite(cropped_image_path, cropped)
            cropped_images.append(cropped_image_path)

    if save_annotated_path:
        cv2.imwrite(save_annotated_path, image)

    return image, cropped_images


def detect_objects(folder, model, is_video=False, frame_interval=30, output_folder=None):
    """
    Detect objects in images or videos, save annotated outputs, and optionally save cropped objects.

    Args:
        folder (str): Path to the folder containing input files.
        model: YOLO model instance for predictions.
        is_video (bool): True to process videos, False for images.
        frame_interval (int): Process every nth frame in a video.
        output_folder (str): Path to the folder for saving output files.

    Returns:
        dict: Results containing raw detections and file paths for annotated images/crops.
    """
    results = {}
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if is_video:
        for file in tqdm(os.listdir(folder), desc="Processing Videos"):
            file_path = os.path.join(folder, file)
            if os.path.isfile(file_path) and file.lower().endswith(('.mp4', '.avi', '.mkv')):
                cap = cv2.VideoCapture(file_path)
                frame_count = 0
                output_video_path = os.path.join(output_folder, os.path.basename(file))
                cropped_folder = os.path.join(output_folder, f"{os.path.splitext(file)[0]}_crops")
                os.makedirs(cropped_folder, exist_ok=True)
                out = None

                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break
                    if frame_count % frame_interval == 0:
                        results_frame = model.predict(frame)

                        # Annotate the frame and crop detected objects
                        annotated_frame, cropped_frames = visualize_and_crop_detections(
                            frame, results_frame, save_cropped_folder=cropped_folder
                        )

                        # Initialize VideoWriter once
                        if out is None:
                            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                            out = cv2.VideoWriter(output_video_path, fourcc, 30, 
                                                  (annotated_frame.shape[1], annotated_frame.shape[0]))

                        out.write(annotated_frame)

                    frame_count += 1

                cap.release()
                if out is not None:
                    out.release()

                results[file] = {"raw": results_frame, "annotated_video": output_video_path, "cropped_objects": cropped_folder}
    else:
        for file in tqdm(os.listdir(folder), desc="Processing Images"):
            file_path = os.path.join(folder, file)
            if os.path.isfile(file_path) and file.lower().endswith(('.jpg', '.jpeg', '.png')):
                img = cv2.imread(file_path)
                cropped_folder = os.path.join(output_folder, f"{os.path.splitext(file)[0]}_crops")
                os.makedirs(cropped_folder, exist_ok=True)

                results_img = model.predict(img) # 'results_img'  is useful for extracting numerical data e.g bounding box overlap

                # Annotate and crop the image
                annotated_img, cropped_imgs = visualize_and_crop_detections(
                    img, results_img, save_cropped_folder=cropped_folder, 
                    save_annotated_path=os.path.join(output_folder, file)
                ) # 'annotated_img' is useful for visually inspecting annotated outputs, to see if the objects in the image and video frames match

                # Store detection results
                results[file] = {"raw": results_img, "annotated_image": os.path.join(output_folder, file), "cropped_objects": cropped_imgs}

    return results


# Every Frame: Set frame_interval to 1 to extract every single frame. This provides the highest level of detail but also generates the most data.
# High Frequency: Set frame_interval to 5 or 10 to extract frames every 5 or 10 frames of the video. This still provides a high level of detail while reducing the total number of frames compared to extracting every frame.

LOAD RESNET FOR  EXTRACTING EMBEDDINGS IN VIDEO AND IMAGE


In [4]:
# Load ResNet18 with Custom Weights
def load_resnet18(weights_path):
    print("Loading ResNet18 with custom weights...")
    resnet18 = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)
    resnet18.load_state_dict(torch.load(weights_path))
    resnet18 = torch.nn.Sequential(*list(resnet18.children())[:-1])  # Remove final layer for feature extraction
    resnet18.eval()  # Set to evaluation mode
    return resnet18

# Preprocessing for ResNet18
def preprocess_for_resnet(img):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform(img)

# Extract Embeddings (Feature Extraction) # get the numerical arrays in video and image
def extract_embeddings(detections, feature_extractor):
    embeddings = {}
    for key, detection in tqdm(detections.items(), desc="Extracting embeddings"):
        if hasattr(detection, 'xyxy') and len(detection.xyxy) > 0:
            image_embeddings = []
            for obj in detection.xyxy:
                if len(obj) >= 4:
                    x1, y1, x2, y2 = map(int, obj[:4])
                    cropped_img = detection.orig_img[y1:y2, x1:x2]
                    preprocessed_img = preprocess_for_resnet(cropped_img).unsqueeze(0)

                    with torch.no_grad():
                        embedding = feature_extractor(preprocessed_img).squeeze().cpu().numpy()
                        image_embeddings.append(embedding)
            if image_embeddings:
                embeddings[key] = np.mean(image_embeddings, axis=0)
    return embeddings

MATCH SIMILARITIES AND DISPLAY PLOTTINGS

In [5]:
# Match Similarities
def match_similarities(reference_embedding, video_embeddings):
    similarities = {}
    for key, embedding in video_embeddings.items():
        sim = cosine_similarity([reference_embedding], [embedding])[0][0]
        similarities[key] = sim
    return similarities

def plot_similarities(similarities):
    if not similarities:
        print("No similarities to plot.")
        return

    sorted_sims = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    if sorted_sims:  # Ensure there are items to unpack
        labels, scores = zip(*sorted_sims)
        plt.barh(labels, scores)
        plt.xlabel('Similarity Score')
        plt.ylabel('Video Frame Hash')
        plt.title('Similarity of Frames to Reference Image')
        plt.show()
    else:
        print("No valid similarities to display.")

RUN PIPELINE FOR THE WORKFLOW

In [6]:
# Main Pipeline
def main_pipeline():
    print("Loading ResNet18 model with custom weights...")
    feature_extractor = load_resnet18(RESNET_WEIGHTS_PATH)

    print("Preprocessing Images...")
    processed_images = preprocess_images(image_folder)

    print("Processing Videos...")
    video_frames = process_videos(video_folder, frame_interval=30)

    print("Detecting Objects in Images...")
    image_detections = detect_objects(
        folder=image_folder, model=yolo_model, is_video=False, output_folder=OUTPUT_IMAGES_FOLDER
    )

    print("Detecting Objects in Videos...")
    video_detections = detect_objects(
        folder=video_folder, model=yolo_model, is_video=True, frame_interval=30, output_folder=OUTPUT_VIDEOS_FOLDER
    )

    print("Extracting Embeddings from Images...")
    image_embeddings = extract_embeddings(image_detections, feature_extractor)

    print("Extracting Embeddings from Video Frames...")
    video_embeddings = extract_embeddings(video_detections, feature_extractor)

    print("Matching Similarities...")
    similarities = {}
    for image_hash, image_embedding in image_embeddings.items():
        similarities[image_hash] = match_similarities(image_embedding, video_embeddings)

    print("Plotting Similarities...")
    plot_similarities(similarities)


# Paths to dependencies
YOLO_MODEL_PATH = "c:/Users/pd63899/Downloads/ultralytics/yolov5/yolov5s.pt"
image_folder = "c:/Users/pd63899/Downloads/reference_image"
video_folder = "c:/Users/pd63899/Downloads/cctv_videos"
RESNET_WEIGHTS_PATH = "c:/Users/pd63899/Downloads/resnet18.pth/resnet18.pth"  # Custom ResNet18 weights
OUTPUT_IMAGES_FOLDER = "output_images"
OUTPUT_VIDEOS_FOLDER = "output_videos"
# Initialize YOLOv5 model
yolo_model = YOLOv5(YOLO_MODEL_PATH)


if __name__ == "__main__":
    main_pipeline()

YOLOv5  2024-11-15 Python-3.12.7 torch-2.5.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 270 layers, 7235389 parameters, 0 gradients, 16.6 GFLOPs
Adding AutoShape... 
Using cache found in C:\Users\pd63899/.cache\torch\hub\pytorch_vision_v0.10.0


Loading ResNet18 model with custom weights...
Loading ResNet18 with custom weights...
Preprocessing Images...


Preprocessing images: 100%|██████████| 1/1 [00:00<00:00, 95.39it/s]


Processing Videos...


Processing videos: 100%|██████████| 17/17 [00:48<00:00,  2.86s/it]


Detecting Objects in Images...


  with amp.autocast(autocast):
Processing Images: 100%|██████████| 1/1 [00:00<00:00, 13.19it/s]


Detecting Objects in Videos...


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Extracting Embeddings from Images...


Extracting embeddings: 100%|██████████| 1/1 [00:00<?, ?it/s]


Extracting Embeddings from Video Frames...


Extracting embeddings: 100%|██████████| 17/17 [00:00<?, ?it/s]

Matching Similarities...
Plotting Similarities...
No similarities to plot.



