### Packages


In [1]:
# load the necessary packages
import os
import tqdm

import numpy as np
import cv2
from PIL import Image, ImageDraw, ImageFont
import imageio

import torch
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as standard_transforms
from model.locator import Crowd_locator
from misc.utils import read_pred_and_gt
from collections import OrderedDict

from scipy.interpolate import interp1d

import json

### Configurations


In [None]:
# configurations
# crowd localization pre-trained model
MODEL_PATH = "../PretrainedCrowdLocModel/NWPU-HR-ep_241_F1_0.802_Pre_0.841_Rec_0.766_mae_55.6_mse_330.9.pth"
BASE_DATASET = "NWPU"
BASE_DATA_ROOT = f"../ProcessedData/{BASE_DATASET}"

# image path
TEST_DATA_ID = ""
TEST_DATA_NAME = ""
TEST_DATA_ROOT = f"../ProcessedData/{TEST_DATA_NAME}"
RESULT_PATH = f"./saved_exp_results/{TEST_DATA_ID}"

# crowd localization output
WIDTH = 1280
HEIGHT = 720
FPS = 25
VIDEO_CODEC = "XVID"

# real scale
SCALE = 100
FIXED_HEIGHT = 1.7
BOX_START = (0, 1)
BOX_END = (12, 8.38)

FIXED_HEIGHT *= SCALE
BOX_START = (int(BOX_START[0] * SCALE), int(BOX_START[1] * SCALE))
BOX_END = (int(BOX_END[0] * SCALE), int(BOX_END[1] * SCALE))

# crowd separation
MAX_MISSED_FRAMES = 8
COST_THRESHOLD = 70
EDGE_BUFFER = 50
INITIALIZE_BUFFER = 3
MIN_TRACK_LENGTH = 10

# visualization
SAVE_SCREENSHOT = True
SCREENSHOT_FRAME = 69

# DLT data
world_points = np.array(
    [
        [0, 0, 0],
        [5.28, 0, 0],
        [10.56, 0, 0],
        [0.88, 9.38, 0],
        [5.28, 9.38, 0],
        [10.56, 9.38, 0],
        [1.50, 5.16, 1.7],
        [5.28, 5.16, 1.7],
        [7.04, 5.16, 1.7],
        [7.34, 7.97, 1.7],
        [4.14, 6.57, 1.7],
        [9.68, 7.00, 1.7],
        [10, 6.5, 1.7],
        [4.4, 1.88, 1.7],
        [3.26, 8.44, 1.7],
    ],
    dtype=np.float32,
)
world_points *= SCALE

image_points = np.array(
    [
        [96, 23],
        [579, 17],
        [1054, 15],
        [96, 711],
        [574, 695],
        [1131, 691],
        [204, 291],
        [576, 283],
        [761, 286],
        [794, 462],
        [473, 374],
        [1033, 409],
        [1052, 353],
        [494, 26],
        [366, 531],
    ],
    dtype=np.float32,
)

### Crowd Localization


In [None]:
# functions for crowd localization
def get_boxInfo_from_Binar_map(Binar_numpy, min_area=3):
    Binar_numpy = Binar_numpy.squeeze().astype(np.uint8)
    assert Binar_numpy.ndim == 2
    cnt, labels, stats, centroids = cv2.connectedComponentsWithStats(
        Binar_numpy, connectivity=4
    )

    boxes = stats[1:, :]
    points = centroids[1:, :]
    index = boxes[:, 4] >= min_area
    boxes = boxes[index]
    points = points[index]
    pre_data = {"num": len(points), "points": points}
    return pre_data, boxes


def test(file_list, model_path, result_path, data_root, img_transform):
    device = torch.device("cpu")
    net = Crowd_locator("HR_Net", pretrained=True).to(device)
    state_dict = torch.load(model_path, map_location=device)

    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_key = k.replace("module.", "") if "module." in k else k
        new_state_dict[new_key] = v

    net.load_state_dict(new_state_dict)
    net.eval()

    file_list = tqdm.tqdm(file_list)
    for infos in file_list:
        filename = infos.split()[0]
        imgname = os.path.join(data_root, "images", f"{filename}.jpg")
        img = Image.open(imgname)

        if img.mode == "L":
            img = img.convert("RGB")
        img = img_transform(img)[None, :, :, :]
        slice_h, slice_w = 512, 1024

        with torch.no_grad():
            img = img.to(device)
            b, c, h, w = img.shape
            crop_imgs, crop_masks = [], []

            if h * w < slice_h * 2 * slice_w * 2 and h % 16 == 0 and w % 16 == 0:
                [pred_threshold, pred_map, __] = [
                    i.cpu() for i in net(img, mask_gt=None, mode="val")
                ]
            else:
                if h % 16 != 0:
                    pad_dims = (0, 0, 0, 16 - h % 16)
                    h = (h // 16 + 1) * 16
                    img = F.pad(img, pad_dims, "constant")

                if w % 16 != 0:
                    pad_dims = (0, 16 - w % 16, 0, 0)
                    w = (w // 16 + 1) * 16
                    img = F.pad(img, pad_dims, "constant")

                for i in range(0, h, slice_h):
                    h_start, h_end = max(min(h - slice_h, i), 0), min(h, i + slice_h)
                    for j in range(0, w, slice_w):
                        w_start, w_end = max(min(w - slice_w, j), 0), min(
                            w, j + slice_w
                        )
                        crop_imgs.append(img[:, :, h_start:h_end, w_start:w_end])
                        mask = torch.zeros(1, 1, img.size(2), img.size(3)).cpu()
                        mask[:, :, h_start:h_end, w_start:w_end].fill_(1.0)
                        crop_masks.append(mask)
                crop_imgs, crop_masks = torch.cat(crop_imgs, dim=0), torch.cat(
                    crop_masks, dim=0
                )

                crop_preds, crop_thresholds = [], []
                nz, period = crop_imgs.size(0), 4
                for i in range(0, nz, period):
                    [crop_threshold, crop_pred, __] = [
                        i.cpu()
                        for i in net(
                            crop_imgs[i : min(nz, i + period)], mask_gt=None, mode="val"
                        )
                    ]
                    crop_preds.append(crop_pred)
                    crop_thresholds.append(crop_threshold)

                crop_preds = torch.cat(crop_preds, dim=0)
                crop_thresholds = torch.cat(crop_thresholds, dim=0)

                idx = 0
                pred_map = torch.zeros(b, 1, h, w).cpu()
                pred_threshold = torch.zeros(b, 1, h, w).cpu().float()
                for i in range(0, h, slice_h):
                    h_start, h_end = max(min(h - slice_h, i), 0), min(h, i + slice_h)
                    for j in range(0, w, slice_w):
                        w_start, w_end = max(min(w - slice_w, j), 0), min(
                            w, j + slice_w
                        )
                        pred_map[:, :, h_start:h_end, w_start:w_end] += crop_preds[idx]
                        pred_threshold[
                            :, :, h_start:h_end, w_start:w_end
                        ] += crop_thresholds[idx]
                        idx += 1
                mask = crop_masks.sum(dim=0)
                pred_map = pred_map / mask
                pred_threshold = pred_threshold / mask

            a = torch.ones_like(pred_map)
            b = torch.zeros_like(pred_map)
            binar_map = torch.where(pred_map >= pred_threshold, a, b)

            pred_data, boxes = get_boxInfo_from_Binar_map(binar_map.cpu().numpy())

            with open(result_path + "/result.txt", "a") as f:
                f.write(f'{filename} {pred_data["num"]} ')
                for ind, point in enumerate(pred_data["points"], 1):
                    if ind < pred_data["num"]:
                        f.write(f"{int(point[0])} {int(point[1])} ")
                    else:
                        f.write(f"{int(point[0])} {int(point[1])}")
                f.write("\n")


def CL_visualize_images(result_path, data_root, width, height):
    pred_file = result_path + "/result.txt"
    vis_path = result_path + "/vis"
    img_path = data_root + "/images"

    if not os.path.exists(vis_path):
        os.mkdir(vis_path)

    pred_data, _ = read_pred_and_gt(pred_file)

    with open(os.path.join(data_root, "list.txt"), "r") as f:
        img_filenames = [line.strip() for line in f]

    for filename in img_filenames:
        sample_id = int(filename.split(".")[0])
        pred_p = []

        if pred_data[sample_id]["num"] != 0:
            pred_p = pred_data[sample_id]["points"]

        img = Image.open(os.path.join(img_path, f"{filename}.jpg"))
        img = img.resize((width, height))
        img = np.array(img)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        point_r_value = 10
        if pred_data[sample_id]["num"] != 0:
            for point in pred_p:
                cv2.circle(
                    img, (int(point[0]), int(point[1])), point_r_value, (0, 255, 0), 2
                )

        img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        img.save(os.path.join(vis_path, f"{sample_id}_pred.jpg"))
    print(f"visualization images saved at {vis_path}")


def CL_visualize_video(result_path, width, height, fps, codec="XVID"):
    vis_path = result_path + "/vis"
    video_path = result_path + "/video.avi"

    def sort_key(file_name):
        return int(file_name.split("_")[0])

    image_files = sorted(
        [
            f
            for f in os.listdir(vis_path)
            if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff"))
        ],
        key=sort_key,
    )

    fourcc = cv2.VideoWriter_fourcc(*codec)
    video = cv2.VideoWriter(video_path, fourcc, fps, (width, height))

    for image_file in image_files:
        image_path = os.path.join(vis_path, image_file)

        with Image.open(image_path) as img:
            img_resized = img.resize((width, height))
            img_array = cv2.cvtColor(np.array(img_resized), cv2.COLOR_RGB2BGR)

        if img_array is not None:
            video.write(img_array)

    video.release()
    print(f"visualization video saved at {video_path}")

In [None]:
# crowd localization function calls
# saves result.txt, vis folder, and video.avi
torch.backends.cudnn.benchmark = True

mean_std = (
    [0.446139603853, 0.409515678883, 0.395083993673],
    [0.288205742836, 0.278144598007, 0.283502370119],
)
img_transform = standard_transforms.Compose(
    [standard_transforms.ToTensor(), standard_transforms.Normalize(*mean_std)]
)

txtpath = os.path.join(TEST_DATA_ROOT, "list.txt")
with open(txtpath) as f:
    lines = f.readlines()

if not os.path.exists(RESULT_PATH):
    os.mkdir(RESULT_PATH)

test(lines, MODEL_PATH, RESULT_PATH, TEST_DATA_ROOT, img_transform)
CL_visualize_images(RESULT_PATH, TEST_DATA_ROOT, WIDTH, HEIGHT)
CL_visualize_video(RESULT_PATH, WIDTH, HEIGHT, FPS, VIDEO_CODEC)

### Crowd Separation Functions


In [None]:
# DLT algorithm to calculate homography matrix
def normalize_points_2d(points):
    centroid = np.mean(points, axis=0)
    shifted_points = points - centroid
    scale = np.sqrt(2) / np.mean(np.linalg.norm(shifted_points, axis=1))
    normalization_matrix = np.array(
        [[scale, 0, -scale * centroid[0]], [0, scale, -scale * centroid[1]], [0, 0, 1]]
    )
    normalized_points = np.dot(
        normalization_matrix, np.vstack((points.T, np.ones((1, points.shape[0]))))
    )
    return normalized_points.T, normalization_matrix


def normalize_points_3d(points):
    centroid = np.mean(points, axis=0)
    shifted_points = points - centroid
    scale = np.sqrt(3) / np.mean(np.linalg.norm(shifted_points, axis=1))
    normalization_matrix = np.array(
        [
            [scale, 0, 0, -scale * centroid[0]],
            [0, scale, 0, -scale * centroid[1]],
            [0, 0, scale, -scale * centroid[2]],
            [0, 0, 0, 1],
        ]
    )
    normalized_points = np.dot(
        normalization_matrix, np.vstack((points.T, np.ones((1, points.shape[0]))))
    )
    return normalized_points.T, normalization_matrix


def compute_homography(src_points, dst_points):
    A = []
    for src, dst in zip(src_points, dst_points):
        x, y, z = src[:3]
        u, v = dst[:2]
        A.append([-x, -y, -z, -1, 0, 0, 0, 0, u * x, u * y, u * z, u])
        A.append([0, 0, 0, 0, -x, -y, -z, -1, v * x, v * y, v * z, v])
    A = np.array(A)
    U, S, Vh = np.linalg.svd(A)
    L = Vh[-1, :] / Vh[-1, -1]
    H = L.reshape(3, 4)
    return H


def calculate_homography(world_points, image_points):
    norm_world_points, T_world = normalize_points_3d(world_points)
    norm_image_points, T_image = normalize_points_2d(image_points)
    H_normalized = compute_homography(norm_world_points, norm_image_points)
    H = np.dot(np.linalg.inv(T_image), np.dot(H_normalized, T_world))
    return H


# calculates X, Y coordinates from u, v, Z using homography matrix
def calculate_XY_from_uvZ(H, u, v, Z):
    h11, h12, h13, h14 = H[0]
    h21, h22, h23, h24 = H[1]
    h31, h32, h33, h34 = H[2]
    A = np.array([[h11 - u * h31, h12 - u * h32], [h21 - v * h31, h22 - v * h32]])
    B = np.array(
        [
            [u * (h33 * Z + h34) - (h13 * Z + h14)],
            [v * (h33 * Z + h34) - (h23 * Z + h24)],
        ]
    )
    X, Y = np.linalg.solve(A, B).flatten()
    return X, Y


# calculates cost matrix for given positions
def calculate_cost_matrix(prev_positions, curr_positions):
    num_prev = len(prev_positions)
    num_curr = len(curr_positions)
    cost_matrix = np.zeros((num_prev, num_curr))
    for i, prev_pos in enumerate(prev_positions):
        for j, curr_pos in enumerate(curr_positions):
            cost_matrix[i, j] = np.linalg.norm(np.array(prev_pos) - np.array(curr_pos))
    return cost_matrix


# connects points based on smallest values in cost matrix first
def connect_small_values_first(cost_matrix, cost_threshold):
    associations = []
    num_prev, num_curr = cost_matrix.shape
    used_prev = set()
    used_curr = set()
    while len(used_prev) < num_prev and len(used_curr) < num_curr:
        min_value = np.inf
        min_pos = (-1, -1)
        for i in range(num_prev):
            if i in used_prev:
                continue
            for j in range(num_curr):
                if j in used_curr:
                    continue
                if cost_matrix[i, j] < min_value:
                    min_value = cost_matrix[i, j]
                    min_pos = (i, j)
        if min_value >= cost_threshold:
            break
        associations.append(min_pos)
        used_prev.add(min_pos[0])
        used_curr.add(min_pos[1])
    return associations


# tracks objects over multiple frames based on distance threshold
def track_objects(
    bev_frame_data,
    box_start,
    box_end,
    cost_threshold,
    max_missed_frames,
    edge_buffer,
    initialize_allowance,
    min_track_length,
):
    tracks = {}
    missed_frames = {}
    finished_tracks = {}
    next_person_id = 0
    box_width = box_end[0] - box_start[0]
    box_height = box_end[1] - box_start[1]
    print(f"tracking started with {len(bev_frame_data)} frames")
    for frame_idx, (frame_number, num_heads, positions) in enumerate(bev_frame_data):
        if frame_idx == 0:
            for pos in positions:
                if (
                    box_start[0] < pos[0] < box_end[0]
                    and box_start[1] < pos[1] < box_end[1]
                ):
                    tracks[next_person_id] = [(frame_number, pos)]
                    missed_frames[next_person_id] = 0
                    next_person_id += 1
        else:
            prev_positions = [track[-1][1] for track in tracks.values()]
            prev_ids = list(tracks.keys())
            cost_matrix = calculate_cost_matrix(prev_positions, positions)
            associations = connect_small_values_first(cost_matrix, cost_threshold)
            assigned_ids = set()
            for i, j in associations:
                person_id = prev_ids[i]
                tracks[person_id].append((frame_number, positions[j]))
                missed_frames[person_id] = 0
                assigned_ids.add(person_id)
            for j in range(len(positions)):
                if j not in [assoc[1] for assoc in associations]:
                    is_in_box = (
                        box_start[0] < positions[j][0] < box_end[0]
                        and box_start[1] < positions[j][1] < box_end[1]
                    )
                    is_in_x_edge = (positions[j][0] - box_start[0] < edge_buffer) or (
                        box_width - edge_buffer < positions[j][0] - box_start[0]
                    )
                    is_in_y_edge = (positions[j][1] - box_start[1] < edge_buffer) or (
                        box_height - edge_buffer < positions[j][1] - box_start[1]
                    )
                    if is_in_box and (is_in_x_edge or is_in_y_edge):
                        tracks[next_person_id] = [(frame_number, positions[j])]
                        missed_frames[next_person_id] = 0
                        next_person_id += 1
                    elif is_in_box and frame_idx < initialize_allowance:
                        tracks[next_person_id] = [(frame_number, positions[j])]
                        missed_frames[next_person_id] = 0
                        next_person_id += 1
            for person_id in prev_ids:
                x = tracks[person_id][-1][1][0]
                y = tracks[person_id][-1][1][1]
                if not (
                    box_start[0] < x < box_end[0] and box_start[1] < y < box_end[1]
                ):
                    finished_tracks[person_id] = tracks[person_id]
                    del tracks[person_id]
                    del missed_frames[person_id]
                if person_id not in assigned_ids:
                    missed_frames[person_id] += 1
                    if missed_frames[person_id] > max_missed_frames:
                        if len(tracks[person_id]) < min_track_length:
                            del tracks[person_id]
                            del missed_frames[person_id]
                        else:
                            finished_tracks[person_id] = tracks[person_id]
                            del tracks[person_id]
                            del missed_frames[person_id]
    for person_id, track in tracks.items():
        if len(track) >= min_track_length:
            finished_tracks[person_id] = track
    print(f"tracking finished with {len(finished_tracks)} tracks")

    cleaned_tracks = {}
    for i, (person_id, track) in enumerate(finished_tracks.items()):
        cleaned_tracks[i] = track

    return {**cleaned_tracks}


# interpolates missing data points in tracks
def interpolate_missing_data(tracks):
    interpolated_tracks = {}
    for person_id, track in tracks.items():
        track.sort(key=lambda x: x[0])
        frames = [pos[0] for pos in track]
        positions = [pos[1] for pos in track]
        positions = np.array(positions)
        all_frames = np.arange(frames[0], frames[-1] + 1)
        interpolated_x = np.interp(all_frames, frames, positions[:, 0])
        interpolated_y = np.interp(all_frames, frames, positions[:, 1])
        interpolated_positions = list(zip(interpolated_x, interpolated_y))
        interpolated_tracks[person_id] = [
            (frame, (int(x), int(y)))
            for frame, (x, y) in zip(all_frames, interpolated_positions)
        ]

    return interpolated_tracks


# crowd localization gif visualization
def CL_visualize_gif(
    frame_data,
    result_path,
    data_root,
    width,
    height,
    fps,
    save_screenshot=False,
    screenshot_frame=None,
):
    img_folder = os.path.join(data_root, "images")
    print(f"CL visualization gif started with {len(frame_data)} frames")
    with imageio.get_writer(
        result_path + "/gif.gif", mode="I", fps=fps, loop=0
    ) as writer:
        for frame_number, num_heads, positions in frame_data:
            img_path = os.path.join(img_folder, f"{frame_number}.jpg")
            img = Image.open(img_path)
            img = img.resize((width, height))
            img = np.array(img)
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            for pos in positions:
                cv2.circle(img, pos, 10, (0, 255, 0), 2)
            if save_screenshot and frame_number == screenshot_frame:
                Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).save(
                    result_path + "/screenshot.jpg"
                )
                print(f"screenshot saved at {result_path + '/screenshot.jpg'}")
            writer.append_data(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    print(f"CL visualization gif saved at {result_path + '/gif.gif'}")


# precomputes the outside area mask and box lines for visualization
def precompute_outside_area_and_box_lines(H, width, height, box_start, box_end):
    outside_area_color = (255, 204, 204)
    box_width = box_end[0] - box_start[0]
    box_height = box_end[1] - box_start[1]
    outside_area_mask = np.zeros((box_height, box_width), dtype=np.uint8)
    for y in range(box_start[1], box_end[1]):
        for x in range(box_start[0], box_end[0]):
            U = np.dot(H, np.array([x, y, 0, 1]))
            u, v = U[0] / U[2], U[1] / U[2]
            if not (0 <= u < width and 0 <= v < height):
                outside_area_mask[y - box_start[1], x - box_start[0]] = 1
    box_corners = [
        box_start,
        (box_end[0], box_start[1]),
        box_end,
        (box_start[0], box_end[1]),
    ]
    box_lines = []
    for corner in box_corners:
        X, Y = corner
        U = np.dot(H, np.array([X, Y, 0, 1]))
        u_trans, v_trans = U[0] / U[2], U[1] / U[2]
        box_lines.append((int(u_trans), int(v_trans)))
    return outside_area_mask, outside_area_color, box_lines


# crowd separation gif visualization
def CS_visualize_gif(
    tracks,
    H,
    frame_data,
    result_path,
    data_root,
    width,
    height,
    box_start,
    box_end,
    fixed_height,
    fps,
    save_screenshot=False,
    screenshot_frame=None,
):
    img_folder = os.path.join(data_root, "images")
    vis_original_path = result_path + "/CS_original_vis"
    vis_bev_path = result_path + "/CS_BEV_vis"
    os.makedirs(vis_original_path, exist_ok=True)
    os.makedirs(vis_bev_path, exist_ok=True)
    colors = [
        (255, 128, 128),
        (128, 255, 128),
        (128, 128, 255),
        (255, 255, 128),
        (255, 128, 255),
        (128, 255, 255),
        (192, 192, 128),
        (128, 192, 192),
    ]
    box_width = box_end[0] - box_start[0]
    box_height = box_end[1] - box_start[1]
    outside_area_mask, outside_area_color, box_lines = (
        precompute_outside_area_and_box_lines(H, width, height, box_start, box_end)
    )
    frame_dict = {
        frame_number: positions for frame_number, num_heads, positions in frame_data
    }
    print(f"CS visualization gif started with {len(frame_dict)} frames")
    with imageio.get_writer(
        result_path + "/CS_original_gif.gif", mode="I", fps=fps, loop=0
    ) as writer_original, imageio.get_writer(
        result_path + "/CS_BEV_gif.gif", mode="I", fps=fps, loop=0
    ) as writer_bev:

        for frame_number in sorted(frame_dict.keys()):
            img_path = os.path.join(img_folder, f"{frame_number}.jpg")
            img = Image.open(img_path)
            img = img.resize((width, height))
            img = np.array(img)
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            bev_img = np.ones((box_height, box_width, 3), dtype=np.uint8) * 255
            cv2.line(img, box_lines[0], box_lines[1], (0, 0, 255), 2)
            cv2.line(img, box_lines[1], box_lines[2], (0, 0, 255), 2)
            cv2.line(img, box_lines[2], box_lines[3], (0, 0, 255), 2)
            cv2.line(img, box_lines[3], box_lines[0], (0, 0, 255), 2)
            for person_id, track in tracks.items():
                for track_frame_number, pos in track:
                    if track_frame_number == frame_number:
                        u, v = pos
                        if not (
                            box_start[0] < u < box_end[0]
                            and box_start[1] < v < box_end[1]
                        ):
                            break
                        u, v = u - box_start[0], v - box_start[1]
                        color = colors[int(person_id) % len(colors)]
                        cv2.circle(bev_img, (int(u), int(v)), 10, color, -1)
                        cv2.circle(bev_img, (int(u), int(v)), 10, (0, 0, 0), 2)
                        cv2.putText(
                            bev_img,
                            str(person_id),
                            (int(u) + 15, int(v) + 5),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5,
                            (0, 0, 0),
                            2,
                            cv2.LINE_AA,
                        )
                        X, Y, Z = pos[0], pos[1], fixed_height
                        U = np.dot(H, np.array([X, Y, Z, 1]))
                        u_trans, v_trans = U[0] / U[2], U[1] / U[2]
                        cv2.circle(img, (int(u_trans), int(v_trans)), 10, color, 2)
                        cv2.putText(
                            img,
                            str(person_id),
                            (int(u_trans) + 15, int(v_trans) + 5),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5,
                            (0, 0, 0),
                            2,
                            cv2.LINE_AA,
                        )

            bev_img[outside_area_mask == 1] = outside_area_color[::-1]
            if save_screenshot and frame_number == screenshot_frame:
                Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).save(
                    result_path + "/CS_original_screenshot.jpg"
                )
                Image.fromarray(cv2.cvtColor(bev_img, cv2.COLOR_BGR2RGB)).save(
                    result_path + "/CS_BEV_screenshot.jpg"
                )
                print(
                    f"screenshots saved at {result_path + '/CS_original_screenshot.jpg'} and {result_path + '/CS_BEV_screenshot.jpg'}"
                )
            writer_original.append_data(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            writer_bev.append_data(cv2.cvtColor(bev_img, cv2.COLOR_BGR2RGB))
            cv2.imwrite(os.path.join(vis_original_path, f"{frame_number}.jpg"), img)
            cv2.imwrite(os.path.join(vis_bev_path, f"{frame_number}.jpg"), bev_img)
    print(
        f"CS visualization gif saved at {result_path + '/CS_original_gif.gif'} and {result_path + '/CS_BEV_gif.gif'}"
    )

### Crowd Separation


In [None]:
# BEV transformation and crowd seperation
with open(RESULT_PATH + "/result.txt", "r") as file:
    data = file.readlines()

frame_data = []
bev_frame_data = []
H = calculate_homography(world_points, image_points)

for line in data:
    parts = list(map(int, line.split()))
    frame_number = parts[0]
    num_heads = parts[1]
    positions = [(parts[i], parts[i + 1]) for i in range(2, len(parts), 2)]

    bev_positions = [
        calculate_XY_from_uvZ(H, pos[0], pos[1], FIXED_HEIGHT) for pos in positions
    ]
    frame_data.append((frame_number, num_heads, positions))
    bev_frame_data.append((frame_number, num_heads, bev_positions))

CL_visualize_gif(
    frame_data,
    RESULT_PATH,
    TEST_DATA_ROOT,
    WIDTH,
    HEIGHT,
    FPS,
    SAVE_SCREENSHOT,
    SCREENSHOT_FRAME,
)
tracks = track_objects(
    bev_frame_data,
    BOX_START,
    BOX_END,
    COST_THRESHOLD,
    MAX_MISSED_FRAMES,
    EDGE_BUFFER,
    INITIALIZE_BUFFER,
    MIN_TRACK_LENGTH,
)
interpolated_tracks = interpolate_missing_data(tracks)
with open(RESULT_PATH + "/tracks.txt", "w") as file:
    for person_id, track in interpolated_tracks.items():
        for frame_number, pos in track:
            file.write(f"{person_id} {frame_number} {pos[0]} {pos[1]}\n")
CS_visualize_gif(
    interpolated_tracks,
    H,
    frame_data,
    RESULT_PATH,
    TEST_DATA_ROOT,
    WIDTH,
    HEIGHT,
    BOX_START,
    BOX_END,
    FIXED_HEIGHT,
    FPS,
    SAVE_SCREENSHOT,
    SCREENSHOT_FRAME,
)

In [None]:
# make a video using the visualization original, bev images
def CS_visualize_video(
    result_path, width, height, box_start, box_end, fps, codec="XVID"
):
    box_width = box_end[0] - box_start[0]
    box_height = box_end[1] - box_start[1]
    vis_original_path = result_path + "/CS_original_vis"
    vis_bev_path = result_path + "/CS_BEV_vis"
    video_original_path = result_path + "/CS_original_video.avi"
    video_bev_path = result_path + "/CS_BEV_video.avi"
    print(
        f"CS visualization video started with {len(os.listdir(vis_original_path))} frames"
    )

    def sort_key(file_name):
        return int(file_name.split(".")[0])

    original_image_files = sorted(
        [
            f
            for f in os.listdir(vis_original_path)
            if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff"))
        ],
        key=sort_key,
    )
    bev_image_files = sorted(
        [
            f
            for f in os.listdir(vis_bev_path)
            if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff"))
        ],
        key=sort_key,
    )

    fourcc = cv2.VideoWriter_fourcc(*codec)
    video_original = cv2.VideoWriter(video_original_path, fourcc, fps, (width, height))
    video_bev = cv2.VideoWriter(video_bev_path, fourcc, fps, (box_width, box_height))

    for original_image_file, bev_image_file in zip(
        original_image_files, bev_image_files
    ):
        original_image_path = os.path.join(vis_original_path, original_image_file)
        bev_image_path = os.path.join(vis_bev_path, bev_image_file)

        with Image.open(original_image_path) as original_img, Image.open(
            bev_image_path
        ) as bev_img:
            original_img_resized = original_img.resize((width, height))
            bev_img_resized = bev_img.resize((box_width, box_height))
            original_img_array = cv2.cvtColor(
                np.array(original_img_resized), cv2.COLOR_RGB2BGR
            )
            bev_img_array = cv2.cvtColor(np.array(bev_img_resized), cv2.COLOR_RGB2BGR)

        if original_img_array is not None:
            video_original.write(original_img_array)
        if bev_img_array is not None:
            video_bev.write(bev_img_array)

    video_original.release()
    video_bev.release()
    print(f"visualization video saved at {video_original_path} and {video_bev_path}")


CS_visualize_video(RESULT_PATH, WIDTH, HEIGHT, BOX_START, BOX_END, FPS, VIDEO_CODEC)

### Utils


In [None]:
# (util) DLT data collection
click_positions = []


def mouse_callback(event, x, y, flags, param):
    if event == cv2.EVENT_LBUTTONDOWN:
        click_positions.append([x, y])
        cv2.circle(open_cv_image, (x, y), 5, (0, 0, 255), -1)
        cv2.putText(
            open_cv_image,
            str(len(click_positions)),
            (x + 10, y - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 255),
            1,
            cv2.LINE_AA,
        )
        cv2.imshow("Image", open_cv_image)


image_num = 360
image_path = TEST_DATA_ROOT + "/images/" + str(image_num) + ".jpg"
image = Image.open(image_path)

resized_image = image.resize((WIDTH, HEIGHT))
open_cv_image = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)

cv2.namedWindow("Image")
cv2.setMouseCallback("Image", mouse_callback)

cv2.imshow("Image", open_cv_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

print(click_positions)

[[96, 23], [579, 17], [1054, 15], [96, 711], [574, 695], [1131, 691], [204, 291], [576, 283], [761, 286], [794, 462], [473, 374], [1033, 409], [1052, 353], [494, 26], [366, 531]]


In [None]:
# (util) CL data add
def mouse_callback(event, x, y, flags, param):
    if event == cv2.EVENT_LBUTTONDOWN:
        click_positions.append([x, y])
        cv2.circle(open_cv_image, (x, y), 5, (0, 0, 255), -1)
        cv2.imshow("Image", open_cv_image)


image_num = 0
while True:
    click_positions = []
    image_path = RESULT_PATH + "/vis/" + str(image_num) + "_pred.jpg"
    try:
        image = Image.open(image_path)

        resized_image = image.resize((WIDTH, HEIGHT))
        open_cv_image = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
        cv2.putText(
            open_cv_image,
            str(image_num),
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 0, 255),
            2,
            cv2.LINE_AA,
        )

        cv2.namedWindow("Image")
        cv2.setMouseCallback("Image", mouse_callback)

        cv2.imshow("Image", open_cv_image)
        if cv2.waitKey(0) == 27:
            cv2.destroyAllWindows()
            break
        cv2.destroyAllWindows()
        image_num += 1
        if len(click_positions) == 0:
            continue
        print(image_num, len(click_positions), end=" ")
        for pos in click_positions:
            print(pos[0], pos[1], end=" ", sep=" ")
        print()
    except:
        print("Error")
        break

28 1 59 86 
29 1 64 91 
30 1 71 86 
31 1 73 89 
32 1 80 82 
33 1 86 84 
34 1 90 89 
37 1 104 85 
38 1 108 85 
41 1 121 88 
42 1 122 85 
43 1 129 85 
44 1 134 91 
45 1 138 84 
46 1 146 87 
47 1 148 87 
51 1 168 89 
52 1 171 90 
53 2 172 87 19 554 
54 1 178 90 
55 1 181 89 
56 1 184 87 
57 1 192 90 
58 1 195 87 
59 1 198 87 
60 1 207 88 
61 1 210 92 
62 1 211 90 
63 1 214 85 
64 1 217 83 
65 1 225 84 
66 1 223 85 
67 1 232 83 
68 1 238 87 
69 1 241 89 
74 1 262 78 
75 1 264 78 
77 1 138 499 
95 1 349 68 
96 1 357 67 
97 1 366 67 
98 1 369 65 
99 1 374 66 
100 1 380 67 
101 1 383 62 
102 1 387 65 
103 1 391 67 
106 1 407 65 
109 1 414 67 
110 1 421 65 
113 1 429 65 
114 1 438 62 
115 1 442 61 
117 1 445 65 
118 1 450 64 
119 1 455 62 
120 1 459 66 
122 1 464 63 
123 1 461 67 
124 1 479 64 
125 1 399 419 
127 2 414 424 487 58 
128 1 416 423 
129 2 426 424 491 62 
130 1 434 422 
131 2 501 52 443 422 
132 1 440 425 
133 2 447 421 502 61 
136 1 464 420 
139 1 482 427 
143 1 509 421 
145 1 524

In [None]:
# CL data add
with open(RESULT_PATH + "/result.txt", "r") as file:
    data1 = file.readlines()
with open(RESULT_PATH + "/result_add.txt", "r") as file:
    data2 = file.readlines()


def parse_data(data):
    parsed_data = {}
    for line in data:
        parts = list(map(int, line.split()))
        frame = parts[0]
        crowd_size = parts[1]
        positions = parts[2:]
        parsed_data[frame] = {"crowd_size": crowd_size, "positions": positions}
    return parsed_data


data1_parsed = parse_data(data1)
data2_parsed = parse_data(data2)

merged_data = {}
for frame in data1_parsed:
    if frame in data2_parsed:
        merged_data[frame] = {
            "crowd_size": data1_parsed[frame]["crowd_size"]
            + data2_parsed[frame]["crowd_size"],
            "positions": data1_parsed[frame]["positions"]
            + data2_parsed[frame]["positions"],
        }
    else:
        merged_data[frame] = data1_parsed[frame]

merged_data_lines = []
for frame, details in merged_data.items():
    line = f"{frame} {details['crowd_size']} " + " ".join(
        map(str, details["positions"])
    )
    merged_data_lines.append(line)

merged_data_str = "\n".join(merged_data_lines)
print(merged_data_str)

0 48 138 1 601 3 417 3 38 22 150 23 1034 36 617 33 315 60 384 61 823 62 691 66 45 66 1114 71 134 74 760 99 951 96 693 107 388 113 394 149 246 178 159 178 53 181 160 211 314 247 810 250 225 261 730 302 357 330 1020 349 770 358 10 369 274 374 1033 387 478 391 223 407 840 426 349 436 16 438 696 473 795 479 1210 511 588 524 200 570 132 581 916 646 177 657 1137 658 363 712
1 49 133 1 411 3 605 2 152 24 1038 32 42 23 622 31 388 60 320 60 828 61 695 65 48 67 1118 70 137 74 761 94 955 93 697 106 391 114 399 152 162 177 250 179 57 186 164 209 309 245 817 251 221 259 734 303 361 333 1026 350 776 358 15 369 279 374 1041 386 483 390 227 406 845 428 355 437 22 439 699 472 801 482 1217 507 594 523 195 570 129 579 923 651 1131 656 183 658 198 682 368 711
2 52 131 1 406 2 611 2 1039 21 47 24 157 27 626 30 1048 44 392 59 324 60 832 61 697 65 53 67 1124 72 141 75 762 91 961 95 699 105 397 113 403 152 167 177 254 180 62 182 167 209 305 244 824 253 216 259 1278 271 739 301 365 333 1031 348 781 357 19 367 