In [1]:
import torch
import os
import torch.nn as nn
from torchvision import transforms
from scipy.io import loadmat
import cv2 as cv
from PIL import Image, ImageDraw
import numpy as np
from torch.utils.data import Dataset
import random

## HourGlass Module (HourGlassNet)

Here we create a smaller version of the hourglass network (an encoder/decoder network). Hourglass networks are usually used in a stacked fashion: the CenterNet model (an object detection architecture), for example, uses a stacked hourglass network for feature extraction.

Below you can see how the HourGlass Module is being implemented.

In [2]:
class ResNetBlock(nn.Module):
    """Residual block made of three BatchNorm -> ReLU -> Conv stages.

    The convolutions are 1x1 -> 3x3 -> 1x1, with the two inner stages working
    on ``out_dim // 2`` channels. Spatial size is preserved (stride 1, padded
    3x3), so the block maps ``(N, inp_dim, H, W)`` to ``(N, out_dim, H, W)``.
    """

    def __init__(self, inp_dim, out_dim):
        """Build the layers.

        Args:
            inp_dim: number of input channels.
            out_dim: number of output channels.
        """
        super().__init__()

        # the inner stages run on half of the output channels.
        mid_dim = out_dim // 2
        pointwise = dict(kernel_size=1, stride=1, padding=0, bias=True)

        # one shared (stateless) activation plus three norm/conv pairs.
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(inp_dim)
        self.conv1 = nn.Conv2d(inp_dim, mid_dim, **pointwise)
        self.bn2 = nn.BatchNorm2d(mid_dim)
        self.conv2 = nn.Conv2d(mid_dim, mid_dim, kernel_size=3, stride=1, padding=1, bias=True)
        self.bn3 = nn.BatchNorm2d(mid_dim)
        self.conv3 = nn.Conv2d(mid_dim, out_dim, **pointwise)

        # 1x1 projection for the shortcut; it is always created, but only
        # applied when the channel count changes.
        self.skip_layer = nn.Conv2d(inp_dim, out_dim, **pointwise)
        self.need_skip = inp_dim != out_dim

    def forward(self, x):
        """Return ``stages(x) + shortcut(x)``."""

        # shortcut branch: projected only when channel counts differ.
        if self.need_skip:
            shortcut = self.skip_layer(x)
        else:
            shortcut = x

        # main branch: norm -> activation -> convolution, three times.
        features = x
        stages = ((self.bn1, self.conv1), (self.bn2, self.conv2), (self.bn3, self.conv3))
        for norm, conv in stages:
            features = conv(self.relu(norm(features)))

        features += shortcut
        return features

In [3]:
class Hourglass(nn.Module):
    """Recursive hourglass module.

    Each level has two branches that are summed:

    * a skip branch (``up1``) that processes the input at its own resolution;
    * a low-resolution branch that max-pools by 2, applies a residual block,
      recurses (or applies one more residual block at the innermost level),
      applies another residual block and up-samples back.

    The output has the same shape as the input.
    """

    def __init__(self, n, filters, bn=None):
        """Create an hourglass with ``n`` nested levels.

        Args:
            n: number of down-/up-sampling levels (recursion depth).
            filters: channel count used by every block.
            bn: only forwarded to the nested ``Hourglass``; not otherwise used.
        """

        super(Hourglass, self).__init__()
        self.n = n
        # skip branch at the current resolution.
        self.up1 = ResNetBlock(filters, filters)
        # low-resolution branch: down-sample then extract features.
        self.pool1 = nn.MaxPool2d(2, 2)
        self.low1 = ResNetBlock(filters, filters)
        # recurse while levels remain, otherwise finish with a plain block.
        self.low2 = Hourglass(n - 1, filters, bn=bn) if self.n > 1 else ResNetBlock(filters, filters)
        self.low3 = ResNetBlock(filters, filters)
        # brings the low-resolution branch back to the input resolution.
        self.up2 = nn.Upsample(scale_factor=2, mode='nearest')

    def forward(self, x):
        """Run both branches and return their sum (same shape as ``x``)."""

        up1 = self.up1(x)

        # low-resolution branch.
        low = self.pool1(x)
        low = self.low1(low)
        low = self.low2(low)
        low = self.low3(low)

        if low.shape[-2] * 2 == up1.shape[-2] and low.shape[-1] * 2 == up1.shape[-1]:
            # usual case: spatial size was even, so plain 2x up-sampling fits.
            up2 = self.up2(low)
        else:
            # max-pooling floors odd sizes, so doubling would fall one pixel
            # short of the skip branch; resize to its exact size instead.
            up2 = nn.functional.interpolate(low, size=up1.shape[-2:], mode='nearest')

        return up1 + up2

In [4]:
class HourGlassNetwork(nn.Module):
    """Single-hourglass network that turns an RGB image into heatmaps.

    A stride-2 7x7 convolution followed by one 2x2 max-pool reduces the
    spatial size by 4 before the hourglass; a 1x1 head then produces
    ``num_heatmap`` output channels.
    """

    def __init__(self, input_shape=(256, 256, 3), num_stack=1, num_residual=1, num_heatmap=1):
        """Build the layers.

        Args:
            input_shape: accepted but not used; the stem always expects 3 channels.
            num_stack: accepted but not used; exactly one hourglass is built.
            num_residual: accepted but not used.
            num_heatmap: number of output heatmap channels.
        """
        super().__init__()

        # stem: 7x7 stride-2 convolution, then residual blocks around a pool.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3, bias=True)
        self.bn1 = nn.BatchNorm2d(num_features=64)
        self.relu = nn.ReLU()
        self.res1 = ResNetBlock(64, 128)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.res2 = ResNetBlock(128, 128)
        self.res3 = ResNetBlock(128, 256)

        # four-level hourglass on 256 channels.
        self.hg1 = Hourglass(4, 256)

        # head: 1x1 conv + norm + activation, then the heatmap projection.
        self.linear = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1, padding=0, bias=True)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.y = nn.Conv2d(in_channels=256, out_channels=num_heatmap, kernel_size=1, padding=0, bias=True)

    def forward(self, x):
        """Apply every stage in order and return the raw heatmap tensor."""

        pipeline = (
            self.conv1, self.bn1, self.relu,      # stem
            self.res1, self.pool,                 # down-sample
            self.res2, self.res3,                 # widen to 256 channels
            self.hg1,                             # hourglass
            self.linear, self.bn2, self.relu,     # head features
            self.y,                               # heatmap projection
        )

        out = x
        for stage in pipeline:
            out = stage(out)

        return out

Create a Torch device to manage the data and the model.

In [5]:
# Pick the device for the model and data: Apple-silicon MPS first, then CUDA,
# then CPU. `torch.backends.mps.is_available()` replaces the deprecated
# `torch.has_mps`, which only reports whether PyTorch was built with MPS
# support, not whether an MPS device can actually be used.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
elif torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# create the model and move its parameters to the chosen device.
model = HourGlassNetwork()
model.to(DEVICE)
print('Device Set')

Device Set


Test an output of the Model.

In [6]:
img_path = r'/Users/chiahaohsutai/Documents/GitHub/PRW/images/frames/c1s1_000801.jpg'

# cv.imread returns None (instead of raising) when the file is missing or
# unreadable, so fail early with a clear message.
image = cv.imread(img_path)
if image is None:
    raise FileNotFoundError(f'Could not read image: {img_path}')

# OpenCV loads images as BGR; convert to RGB and resize to the 256x256 network input.
image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
image = cv.resize(image, (256, 256))

# per-channel normalisation statistics (the commonly used ImageNet values).
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

# ndarray -> PIL image -> float tensor in [0, 1] -> normalised tensor.
process = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD)
])

# add the batch dimension and move the tensor to the model's device.
img = process(image).unsqueeze(0).to(DEVICE)
img.shape

torch.Size([1, 3, 256, 256])

Run a prediction

In [7]:
# put the model in evaluation mode (BatchNorm layers use their running statistics).
model.eval()

# no gradients are needed for a forward-only check.
with torch.no_grad():
    pred = model(img)

# get the shape of the output.
pred.shape

torch.Size([1, 1, 64, 64])

Get a look at what the prediction looks like.

In [8]:
# show the raw network output (PyTorch abbreviates large tensors with "...").
pred

tensor([[[[-0.1295, -0.1512, -0.1849,  ...,  0.0416,  0.0409,  0.0465],
          [-0.1538, -0.1727, -0.2024,  ...,  0.0381,  0.0267,  0.0582],
          [-0.1895, -0.2120, -0.2323,  ...,  0.0290,  0.0436,  0.0566],
          ...,
          [ 0.0969,  0.0890,  0.0903,  ...,  0.1402,  0.1566,  0.1640],
          [ 0.0967,  0.0955,  0.0820,  ...,  0.1465,  0.1495,  0.1527],
          [ 0.0977,  0.0975,  0.0862,  ...,  0.1482,  0.1557,  0.1558]]]],
       device='mps:0')

Now look at the annotations and boxes for the image. Get the centers for each pedestrian in the image.

In [9]:
# load in the annotations.
ann_path = r'/Users/chiahaohsutai/Documents/GitHub/PRW/annotations/c1s1_000801.jpg.mat'
annotation = loadmat(ann_path)['box_new']
annotation = [ann[1:] for ann in annotation]

In [10]:
# Box centers in original-image pixels. Each annotation row is treated as
# (x, y, w, h) with (x, y) the box corner the center is offset from.
# A comprehension keeps the per-box temporaries out of the notebook namespace,
# where `w`, `h`, `x`, `y` are later reused for the image size and first center.
centers = [(x + w / 2, y + h / 2) for x, y, w, h in annotation]

Run the cell below as code if you want to see the centers being drawn in the image. The centers are red dots and they are pretty small.

In [11]:
# open the original (un-resized) frame with PIL; its size is read later via `im.size`.
im = Image.open(img_path)

Now we are going to focus on training and learning. We need to create a Gaussian Patch to help the model learn. Otherwise the model might have a hard time predicting the center.

In [12]:
def generate_patch(scale=12, sigma=1):
    """Create a square 2-D Gaussian patch.

    The patch covers three standard deviations on each side of its center, so
    it has ``6 * sigma + 1`` rows and columns (7x7 for the default ``sigma``).
    Values are ``scale * exp(-(dx**2 + dy**2) / (2 * sigma**2))``; the center
    value is therefore ``scale`` (1 only when ``scale=1``).

    Args:
        scale: peak value of the patch.
        sigma: standard deviation in pixels; a positive whole number.

    Returns:
        A float tensor of shape ``(6 * sigma + 1, 6 * sigma + 1)``.

    Raises:
        ValueError: if ``sigma`` is not a positive whole number.
    """
    if sigma != int(sigma) or sigma < 1:
        raise ValueError(f'sigma must be a positive whole number, got {sigma!r}')
    sigma = int(sigma)

    radius = 3 * sigma
    size = 2 * radius + 1

    # integer offsets of every cell from the patch center, along each axis.
    offsets = torch.arange(size) - radius
    x_off, y_off = torch.meshgrid(offsets, offsets, indexing='xy')

    # squared distance to the center; the true division below yields floats.
    sq_dist = torch.square(x_off) + torch.square(y_off)
    gaussian_patch = torch.exp(torch.neg(sq_dist) / (2 * sigma ** 2)) * scale

    return gaussian_patch

Display what the Gaussian Patch looks like.

In [13]:
# image size as reported by PIL (width, height) and the first box center.
w, h = im.size
x, y = centers[0]

# unit-peak patch, shown as a NumPy array for readability.
patch = generate_patch(1)
patch.numpy()

array([[1.2340980e-04, 1.5034392e-03, 6.7379470e-03, 1.1108996e-02,
        6.7379470e-03, 1.5034392e-03, 1.2340980e-04],
       [1.5034392e-03, 1.8315639e-02, 8.2084998e-02, 1.3533528e-01,
        8.2084998e-02, 1.8315639e-02, 1.5034392e-03],
       [6.7379470e-03, 8.2084998e-02, 3.6787945e-01, 6.0653067e-01,
        3.6787945e-01, 8.2084998e-02, 6.7379470e-03],
       [1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00,
        6.0653067e-01, 1.3533528e-01, 1.1108996e-02],
       [6.7379470e-03, 8.2084998e-02, 3.6787945e-01, 6.0653067e-01,
        3.6787945e-01, 8.2084998e-02, 6.7379470e-03],
       [1.5034392e-03, 1.8315639e-02, 8.2084998e-02, 1.3533528e-01,
        8.2084998e-02, 1.8315639e-02, 1.5034392e-03],
       [1.2340980e-04, 1.5034392e-03, 6.7379470e-03, 1.1108996e-02,
        6.7379470e-03, 1.5034392e-03, 1.2340980e-04]], dtype=float32)

Display it as an image.

The Gaussian Patch is being created properly as seen by the output tensor after converting from img to tensor. Now we should try to figure out how to place the patch in the heatmap.

In [14]:
def make_heatmap(width, height, center_x, center_y, gau_patch):
    """Place a Gaussian patch into an otherwise empty heatmap.

    The patch is centered on ``(center_x, center_y)``; any part that falls
    outside the heatmap is clipped. If the patch lies entirely outside, an
    all-zero heatmap is returned.

    Args:
        width: heatmap width in pixels.
        height: heatmap height in pixels.
        center_x: column of the patch center (truncated to an int).
        center_y: row of the patch center (truncated to an int).
        gau_patch: 2-D patch (e.g. from ``generate_patch``); its own shape
            determines how far it extends around the center.

    Returns:
        A float32 tensor of shape ``(height, width)`` on every code path.
    """
    if torch.is_tensor(gau_patch):
        patch = gau_patch.detach().cpu()
    else:
        patch = torch.as_tensor(gau_patch)
    patch = patch.to(torch.float32)

    heatmap = torch.zeros((height, width), dtype=torch.float32)
    center_x, center_y = int(center_x), int(center_y)
    patch_h, patch_w = patch.shape

    # patch extent in heatmap coordinates; xmax / ymax are exclusive so the
    # whole patch (including its last row and column) is covered.
    xmin = center_x - patch_w // 2
    ymin = center_y - patch_h // 2
    xmax = xmin + patch_w
    ymax = ymin + patch_h

    # if outside the image don't include the gaussian patch.
    if xmin >= width or ymin >= height or xmax <= 0 or ymax <= 0:
        return heatmap

    # visible part of the patch, clipped to the heatmap borders.
    dst_x0, dst_y0 = max(0, xmin), max(0, ymin)
    dst_x1, dst_y1 = min(width, xmax), min(height, ymax)

    # matching window inside the patch: heatmap cell (r, c) takes patch cell
    # (r - ymin, c - xmin), which keeps the peak on (center_y, center_x) even
    # when the patch overhangs the top or left border.
    heatmap[dst_y0:dst_y1, dst_x0:dst_x1] = patch[dst_y0 - ymin:dst_y1 - ymin,
                                                  dst_x0 - xmin:dst_x1 - xmin]

    return heatmap

Before generating the final training target (i.e. the heatmap), we preprocess the data: the box centers are rescaled to a 64x64 grid, which matches the output shape of the HourGlass model.

In [15]:
# scale factors from the original image size (w, h) to the 64x64 heatmap grid.
rx, ry = 64 / w, 64 / h

# re-locate the centers for the new image dimensions.
resized_centers = [(cx * rx, cy * ry) for cx, cy in centers]

In [16]:
# build a 64x64 heatmap for the first pedestrian center (truncated to whole pixels).
x_pos, y_pos = resized_centers[0]
heatmap_example = make_heatmap(64, 64, int(x_pos), int(y_pos), patch)

# value at the center cell (row = y, column = x).
heatmap_example[int(y_pos), int(x_pos)]

tensor(1.)

The patch was added since the center of the pedestrian has value 1. Run the code below if you want a visualization of the heatmap.