CSRNET

In [5]:
import torch
import torch.nn as nn
from torchvision import models
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import cv2

class CSRNet(nn.Module):
    def __init__(self, load_weights=False):
        super(CSRNet, self).__init__()
        self.frontend_feat = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512]
        self.backend_feat  = [512, 512, 512, 256, 128, 64]
        self.frontend = self.make_layers(self.frontend_feat)
        self.backend = self.make_layers(self.backend_feat, in_channels=512, dilation=True)
        self.output_layer = nn.Conv2d(64, 1, kernel_size=1)
        if not load_weights:
            mod = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
            self._initialize_weights()
            frontend_state_dict = self.frontend.state_dict()
            vgg_state_dict = mod.state_dict()
            for k in frontend_state_dict.keys():
                if k in vgg_state_dict:
                    frontend_state_dict[k] = vgg_state_dict[k]
            self.frontend.load_state_dict(frontend_state_dict)

    def forward(self, x):
        x = self.frontend(x)
        x = self.backend(x)
        x = self.output_layer(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def make_layers(self, cfg, in_channels=3, batch_norm=False, dilation=False):
        layers = []
        if dilation:
            d_rate = 2
        else:
            d_rate = 1
        for v in cfg:
            if v == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=d_rate, dilation=d_rate)
                if batch_norm:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        return nn.Sequential(*layers)

# Initialize the CSRNet model
model = CSRNet()

# Load the pretrained CSRNet model weights (assuming you have a checkpoint file named 'PartAmodel_best.pth.tar')
checkpoint = torch.load('checkpoint/PartAmodel_best.pth.tar', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Transformation
transform = transforms.Compose([
    transforms.Resize((512, 512)),  # Resize to reduce computation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Path to the input video file
input_video_path = "inputs/simulation.mp4"

# Open the input video
cap = cv2.VideoCapture(input_video_path)

# Check if video opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

frame_count = 0
frame_skip = 5  # Process every 3rd frame

# Loop to continuously get frames from the video
while True:
    ret, frame = cap.read()
    if not ret:
        print("Finished processing video")
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    # Resize the frame for faster processing
    frame_resized = cv2.resize(frame, (512, 512))

    # Preprocess the frame
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    input_tensor = transform(pil_image).unsqueeze(0).to(device)

    # Run the CSRNet model on the frame
    with torch.no_grad():
        density_map = model(input_tensor)

    # Convert density map to count
    density_map_np = density_map.squeeze().cpu().numpy()
    count = np.sum(density_map_np)

    # Display the count on the frame
    annotated_frame = frame.copy()
    cv2.putText(annotated_frame, f"Count: {int(count)}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Display the annotated frame
    cv2.imshow("Crowd Counting", annotated_frame)

    # Break the loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video capture object and close display windows
cap.release()
cv2.destroyAllWindows()


CSRNET + density map

In [44]:
import torch
import torch.nn as nn
from torchvision import models
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt

class CSRNet(nn.Module):
    def __init__(self, load_weights=False):
        super(CSRNet, self).__init__()
        self.frontend_feat = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512]
        self.backend_feat  = [512, 512, 512, 256, 128, 64]
        self.frontend = self.make_layers(self.frontend_feat)
        self.backend = self.make_layers(self.backend_feat, in_channels=512, dilation=True)
        self.output_layer = nn.Conv2d(64, 1, kernel_size=1)
        if not load_weights:
            mod = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
            self._initialize_weights()
            frontend_state_dict = self.frontend.state_dict()
            vgg_state_dict = mod.state_dict()
            for k in frontend_state_dict.keys():
                if k in vgg_state_dict:
                    frontend_state_dict[k] = vgg_state_dict[k]
            self.frontend.load_state_dict(frontend_state_dict)

    def forward(self, x):
        x = self.frontend(x)
        x = self.backend(x)
        x = self.output_layer(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def make_layers(self, cfg, in_channels=3, batch_norm=False, dilation=False):
        layers = []
        if dilation:
            d_rate = 2
        else:
            d_rate = 1
        for v in cfg:
            if v == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=d_rate, dilation=d_rate)
                if batch_norm:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        return nn.Sequential(*layers)

# Initialize the CSRNet model
model = CSRNet()

# Load the pretrained CSRNet model weights (assuming you have a checkpoint file named 'PartAmodel_best.pth.tar')
checkpoint = torch.load('checkpoint/PartAmodel_best.pth.tar', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Transformation
transform = transforms.Compose([
    transforms.Resize((654, 654)),  # Resize to reduce computation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Path to the input video file
input_video_path = "inputs/simulation.mp4"

# Open the input video
cap = cv2.VideoCapture(input_video_path)

# Check if video opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

frame_count = 0
frame_skip = 5  # Process every 5th frame

# Loop to continuously get frames from the video
while True:
    ret, frame = cap.read()
    if not ret:
        print("Finished processing video")
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    # Resize the frame for faster processing
    frame_resized = cv2.resize(frame, (512, 512))

    # Preprocess the frame
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    input_tensor = transform(pil_image).unsqueeze(0).to(device)

    # Run the CSRNet model on the frame
    with torch.no_grad():
        density_map = model(input_tensor)

    # Convert density map to count
    density_map_np = density_map.squeeze().cpu().numpy()
    count = np.sum(density_map_np)

    # Normalize density map for visualization
    density_map_np = density_map_np / np.max(density_map_np)  # Normalize between 0 and 1
    density_map_np = (density_map_np * 255).astype(np.uint8)  # Scale to 255 for display

    # Apply a color map to the density map
    density_map_colored = cv2.applyColorMap(density_map_np, cv2.COLORMAP_JET)

    # Resize density map to match original frame size
    density_map_resized = cv2.resize(density_map_colored, (frame.shape[1], frame.shape[0]))

    # Display the count on the frame
    annotated_frame = frame.copy()
    cv2.putText(annotated_frame, f"Count: {int(count)}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Concatenate the frame and the density map for side-by-side display
    combined_display = cv2.hconcat([annotated_frame, density_map_resized])

    # Display the annotated frame and the density map
    cv2.imshow("Crowd Counting and Density Map", combined_display)

    # Break the loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video capture object and close display windows
cap.release()
cv2.destroyAllWindows()


CSRNET + density + threshold

In [1]:
import torch
import torch.nn as nn
from torchvision import models
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import cv2

class CSRNet(nn.Module):
    def __init__(self, load_weights=False):
        super(CSRNet, self).__init__()
        self.frontend_feat = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512]
        self.backend_feat  = [512, 512, 512, 256, 128, 64]
        self.frontend = self.make_layers(self.frontend_feat)
        self.backend = self.make_layers(self.backend_feat, in_channels=512, dilation=True)
        self.output_layer = nn.Conv2d(64, 1, kernel_size=1)
        if not load_weights:
            mod = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
            self._initialize_weights()
            frontend_state_dict = self.frontend.state_dict()
            vgg_state_dict = mod.state_dict()
            for k in frontend_state_dict.keys():
                if k in vgg_state_dict:
                    frontend_state_dict[k] = vgg_state_dict[k]
            self.frontend.load_state_dict(frontend_state_dict)

    def forward(self, x):
        x = self.frontend(x)
        x = self.backend(x)
        x = self.output_layer(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def make_layers(self, cfg, in_channels=3, batch_norm=False, dilation=False):
        layers = []
        if dilation:
            d_rate = 2
        else:
            d_rate = 1
        for v in cfg:
            if v == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=d_rate, dilation=d_rate)
                if batch_norm:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        return nn.Sequential(*layers)

# Initialize the CSRNet model
model = CSRNet()

# Load the pretrained CSRNet model weights (assuming you have a checkpoint file named 'PartAmodel_best.pth.tar')
checkpoint = torch.load('checkpoint/PartAmodel_best.pth.tar', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Transformation
transform = transforms.Compose([
    transforms.Resize((654, 654)),  # Resize to reduce computation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Path to the input video file
input_video_path = "inputs/simulation.mp4"

# Open the input video
cap = cv2.VideoCapture(input_video_path)

# Check if video opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

frame_count = 0
frame_skip = 5  # Process every 5th frame

# Loop to continuously get frames from the video
while True:
    ret, frame = cap.read()
    if not ret:
        print("Finished processing video")
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    # Resize the frame for faster processing
    frame_resized = cv2.resize(frame, (512, 512))

    # Preprocess the frame
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    input_tensor = transform(pil_image).unsqueeze(0).to(device)

    # Run the CSRNet model on the frame
    with torch.no_grad():
        density_map = model(input_tensor)

    # Convert density map to count
    density_map_np = density_map.squeeze().cpu().numpy()
    count = np.sum(density_map_np)

    # Normalize density map for visualization
    density_map_np_normalized = density_map_np / np.max(density_map_np)  # Normalize between 0 and 1
    density_map_np_visual = (density_map_np_normalized * 255).astype(np.uint8)  # Scale to 255 for display

    # Apply a color map to the density map
    density_map_colored = cv2.applyColorMap(density_map_np_visual, cv2.COLORMAP_JET)

    # Resize density map to match original frame size
    density_map_resized = cv2.resize(density_map_colored, (frame.shape[1], frame.shape[0]))

    # Highlight areas of high density
    threshold = 0.9  # Adjust this threshold based on what is considered a high density
    high_density_mask = (density_map_np_normalized > threshold).astype(np.uint8) * 255
    high_density_colored = cv2.applyColorMap(high_density_mask, cv2.COLORMAP_HOT)
    high_density_mask_resized = cv2.resize(high_density_mask, (frame.shape[1], frame.shape[0]))

    # Overlay high-density areas on the original frame
    high_density_colored_resized = cv2.resize(high_density_colored, (frame.shape[1], frame.shape[0]))
    frame[high_density_mask_resized > 0] = high_density_colored_resized[high_density_mask_resized > 0]

    # Display the count on the frame
    annotated_frame = frame.copy()
    cv2.putText(annotated_frame, f"Count: {int(count)}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Concatenate the frame and the density map for side-by-side display
    combined_display = cv2.hconcat([annotated_frame, density_map_resized])

    # Display the annotated frame and the density map
    cv2.imshow("Crowd Counting and Density Map", combined_display)

    # Break the loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video capture object and close display windows
cap.release()
cv2.destroyAllWindows()


yolov8

In [4]:
from ultralytics import YOLO
import cv2

# Load the YOLOv8 model
model = YOLO("yolov8n.pt")  # or any other YOLOv8 model

# Open the input video
cap = cv2.VideoCapture("inputs/simulation.mp4")

# Loop through the frames
while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Run the YOLOv8 model on the frame
    results = model(frame, conf = 0.1)

    # Extract detections
    detections = results[0].boxes.data.cpu().numpy()  # Get the bounding box detections

    people_count = 0
    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        if int(cls) == 0:  # Assuming 'person' class is labeled as 0
            people_count += 1
            # Draw bounding box
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

    # Display the count of people on the frame
    cv2.putText(frame, f'People Count: {people_count}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)

    # Show the frame in a window
    cv2.imshow('Frame', frame)

    # Press 'q' to exit the video
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()



0: 480x640 14 persons, 18.4ms
Speed: 4.3ms preprocess, 18.4ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 16.2ms
Speed: 3.2ms preprocess, 16.2ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 17.2ms
Speed: 2.8ms preprocess, 17.2ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 16.1ms
Speed: 3.8ms preprocess, 16.1ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 15.6ms
Speed: 4.0ms preprocess, 15.6ms inference, 1.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 15.7ms
Speed: 3.2ms preprocess, 15.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 15 persons, 15.6ms
Speed: 2.6ms preprocess, 15.6ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 14 persons, 15.6ms
Speed: 2.8ms preprocess, 15.6ms inference, 2.0ms postprocess per image at