In [1]:
import glob
import os
import shutil
from tqdm import tqdm_notebook as tqdm
import collections
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
from PIL import Image
from IPython.display import display
import random
from torchvision.models.vgg import vgg16_bn
import math

# Number of worker processes handed to both DataLoaders below.
num_workers = 16
# Mini-batch size handed to both DataLoaders below.
batch_size = 32


# One tab-separated annotation row of PersonData.txt: a left point, a right
# point, and age / gender codes (all parsed as integers by iog_files).
Person = collections.namedtuple("Person", ["leftx", 'lefty', 'rightx', 'righty', 'age', 'gender'])

def iog_files():
    """Collect ``(image path, positions)`` pairs from the IOG annotations.

    Every ``iog_dataset/*/PersonData.txt`` file is read as a sequence of
    records: a line ending in ``.jpg`` names an image, and each following
    line holds six tab-separated integers describing one person (see
    ``Person``).  A person's position is the midpoint between its left and
    right points.

    Returns:
        A list of ``(path, positions)`` tuples.  ``path`` is the image name
        joined onto the directory of its PersonData.txt; ``positions`` is a
        list of ``(x, y)`` float midpoints, possibly empty.

    Raises:
        ValueError: if a person row appears before any image name, or a row
            cannot be parsed as integers.
    """
    files = []
    for file in glob.glob("iog_dataset/*/PersonData.txt"):
        dirname = os.path.dirname(file)
        pic = None
        poses = []
        with open(file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines.
                    continue
                if line.endswith(".jpg"):
                    if pic is not None:
                        files.append((os.path.join(dirname, pic), poses))
                    pic = line
                    poses = []
                else:
                    if pic is None:
                        raise ValueError(
                            "{}: person row found before any image name".format(file))
                    p = Person(*map(int, line.split("\t")))
                    poses.append(((p.leftx + p.rightx)/2, (p.lefty+p.righty)/2))
        # Flush the final record: no later ".jpg" line exists to trigger the
        # append above, so without this the last image of each file is lost.
        if pic is not None:
            files.append((os.path.join(dirname, pic), poses))
    return files
# Fail fast if no IOG annotations were found on disk.
assert len(iog_files())

from scipy.io import loadmat

def qrnf_files():
    """List the UCF-QNRF images together with their point annotations.

    For every ``UCF-QNRF_ECCV18/*/*.jpg`` image the sibling
    ``<image stem>_ann.mat`` file is loaded and its ``annPoints`` entry is
    paired with the image path.

    Returns:
        A list of ``(image_path, annPoints)`` tuples.
    """
    def _with_annotations(image_path):
        stem, _extension = os.path.splitext(image_path)
        annotations = loadmat(stem + "_ann.mat")
        return (image_path, annotations['annPoints'])

    return [_with_annotations(image_path)
            for image_path in glob.glob('UCF-QNRF_ECCV18/*/*.jpg')]

# Fail fast if no UCF-QNRF images were found on disk.
assert len(qrnf_files())

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class IOGDataset(torch.utils.data.dataset.Dataset):
    """Crowd-counting dataset over the IOG and UCF-QNRF annotations.

    Each item is ``(img, density, global_class, local_classes)``:

    * ``img`` -- the (optionally randomly cropped / flipped) image resized to
      ``size x size``, converted to a tensor and normalised.
    * ``density`` -- a ``(1, density_size, density_size)`` map with one unit
      of mass per annotated person that falls inside the crop.
    * ``global_class`` -- index of the ``bins`` percentile bucket that the
      number of people inside the crop falls into.
    * ``local_classes`` -- a long tensor obtained by counting, per cell of the
      strided local-count map, how many ``local_bins`` edges it exceeds.

    Parameters:
        val (bool)        -- if True, use the whole image and never flip.
        transform (list)  -- extra PIL transforms applied before ToTensor /
                             Normalize; ``None`` means no extra transforms.
    """

    def __init__(self, val, transform=None):
        self.files = iog_files() + qrnf_files()
        #self.files = qrnf_files()
        self.val = val
        # Copy the caller's list so a shared default is never mutated/aliased.
        self.transforms = transforms.Compose(list(transform or []) + [
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        self.scale = (0.08, 1.0)
        self.ratio = (3. / 4., 4. / 3.)
        self.flip_chance = 0.5
        self.size = 64
        self.density_size = self.size
        self.interpolation = Image.BILINEAR
        sizes = [len(people) for _, people in self.files]
        # Upper edges of the three global crowd-size buckets.
        self.bins = np.percentile(sizes, (33.3, 66.6, 100))
        # Box filter used to count people in a 31x31 neighbourhood.
        self.local_window = torch.ones((1, 1, 31, 31))
        # The local bins need a pass over every image, so they are computed
        # once and cached on the class for all later instances.
        if getattr(IOGDataset, 'local_bins', None) is None:
            IOGDataset.local_bins = self._get_local_bins()

    def _get_bin(self, people):
        """Return the index of the first ``bins`` edge that is >= ``people``.

        Counts above the last edge are assigned to the last bucket instead of
        falling through and returning ``None``.
        """
        for i, b in enumerate(self.bins):
            if people <= b:
                return i
        return len(self.bins) - 1

    def __len__(self):
        return len(self.files)

    def _get_local_bins(self):
        """Compute percentile edges of the non-zero local counts of all images."""
        sets = []
        for path, people in tqdm(self.files, desc="_get_local_bins"):
            # Only the size is needed; close the file handle straight away.
            with Image.open(path) as im:
                width, height = im.size
            _, _, local = self._get_density(people, 0, 0, height, width)
            counts = local.numpy().ravel()
            sets.append(counts[counts > 0])
        values = np.concatenate(sets) if sets else []
        return np.percentile(values, (33.3, 66.6, 100))

    def _get_local(self, density):
        """Sum a ``(1, H, W)`` density map over the local window with stride 4."""
        return nn.functional.conv2d(
            density.unsqueeze(dim=0),
            self.local_window,
            padding=int(self.local_window.shape[-1]/2),
            stride=4,
        ).squeeze(dim=0)

    def _get_density(self, people, i, j, h, w):
        """Rasterise the people inside a crop into a density map.

        Parameters:
            people  -- iterable of ``(x, y)`` positions in image coordinates.
            i, j    -- top and left offset of the crop.
            h, w    -- height and width of the crop.

        Returns ``(density, in_window, local)``: the ``(1, S, S)`` density map,
        the number of people that fell inside the crop, and the strided local
        count map produced by ``_get_local``.
        """
        density = torch.zeros((self.density_size, self.density_size))
        in_window = 0
        for (x, y) in people:
            # floor(), not int(): int() truncates toward zero, which would map
            # points lying just left of / above the crop onto row/column 0.
            newx = math.floor((x-j)/w*self.density_size)
            newy = math.floor((y-i)/h*self.density_size)
            if not (0 <= newx < self.density_size and 0 <= newy < self.density_size):
                continue
            density[newy, newx] += 1.0
            in_window += 1

        density = density.unsqueeze(dim=0)
        return density, in_window, self._get_local(density)

    def __getitem__(self, index):
        path, people = self.files[index]
        with Image.open(path) as source:
            img = source.convert('RGB')
        if self.val:
            i = 0
            j = 0
            h = img.height
            w = img.width
        else:
            i, j, h, w = transforms.RandomResizedCrop.get_params(img, self.scale, self.ratio)

        img = transforms.functional.resized_crop(img, i, j, h, w, (self.size, self.size), self.interpolation)

        density, in_window, local = self._get_density(people, i, j, h, w)
        global_class = self._get_bin(in_window)

        img = self.transforms(img)
        if not self.val and random.random() < self.flip_chance:
            img = img.flip(dims=(2,))
            density = density.flip(dims=(2,))
            # The local targets must describe the flipped image as well, so
            # they are recomputed from the flipped density map.
            local = self._get_local(density)

        local = local.squeeze(dim=0)
        local_classes = torch.zeros(local.shape, dtype=torch.long)
        for p in IOGDataset.local_bins:
            local_classes += (local > p).long()
        # A cell can exceed the top edge; keep it in the last class so the
        # label never goes past the number of bins.
        local_classes.clamp_(max=len(IOGDataset.local_bins) - 1)

        return img, density, global_class, local_classes

# Training dataset: random crops / flips plus random grayscale augmentation.
train_set = IOGDataset(val=False, transform=[
    transforms.RandomGrayscale()
])

# Smoke test: fetch one sample, print its local classes and shapes, and show
# the image next to its density map.
img, density, _, local_classes = train_set[0]
print(local_classes, img.shape, density.shape)
display(transforms.functional.to_pil_image(img), transforms.functional.to_pil_image(density))

# Validation dataset: whole images, no flipping, no extra transforms.
val_set = IOGDataset(val=True)

# 95% / 5% train / validation split over one shared random permutation.
# NOTE(review): the indices are derived from train_set but also applied to
# val_set; this assumes both datasets list their files in the same order.
indicies = np.random.permutation(len(train_set))
n_training_samples = int(len(train_set) * 0.95)
train_sampler = SubsetRandomSampler(indicies[:n_training_samples])
val_sampler = SubsetRandomSampler(indicies[n_training_samples:])

# drop_last=True on the training loader discards a final incomplete batch.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                           sampler=train_sampler, num_workers=num_workers,
                                           drop_last=True, pin_memory=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size,
                                            sampler=val_sampler, num_workers=num_workers,
                                        pin_memory=True)

# Both loaders must yield at least one batch.
assert len(val_loader)
assert len(train_loader)

HBox(children=(IntProgress(value=0, description='_get_local_bins', max=6604, style=ProgressStyle(description_w…




KeyboardInterrupt: 

In [None]:


class MultiScaleAnalysis(nn.Module):
    """Four parallel convolution branches concatenated along the channel axis.

    Branch 1 is a plain 1x1 convolution; branches 2-4 each reduce the input
    to 32 channels with a 1x1 convolution and then apply a 3x3, 5x5 or 7x7
    convolution padded so the spatial size is kept.  Every branch emits
    ``int(out / 4)`` channels.

    Parameters:
        k (int)   -- number of input channels.
        out (int) -- nominal number of output channels (split over 4 branches).
    """

    def __init__(self, k, out):
        super().__init__()
        branch_channels = int(out / 4)

        def reduced_branch(kernel):
            # 1x1 bottleneck followed by a "same"-padded kernel x kernel conv.
            bottleneck = nn.Conv2d(k, 32, 1)
            spatial = nn.Conv2d(32, branch_channels, kernel, padding=kernel // 2)
            return nn.Sequential(bottleneck, spatial)

        self.stack1 = nn.Conv2d(k, branch_channels, 1)
        self.stack2 = reduced_branch(3)
        self.stack3 = reduced_branch(5)
        self.stack4 = reduced_branch(7)

    def forward(self, x):
        branches = (self.stack1, self.stack2, self.stack3, self.stack4)
        outputs = [branch(x) for branch in branches]
        return torch.cat(outputs, dim=1)

class InceptionNet(nn.Module):
    """Stack of MultiScaleAnalysis blocks ending in a one-channel 3x3 conv.

    Two stages of (block, block, 2x2 average pool) are followed by a fifth
    block and an unpadded 3x3 convolution that maps to a single channel.
    A LeakyReLU(0.1) follows every MultiScaleAnalysis block.
    """

    def __init__(self):
        super().__init__()
        self.pool = nn.AvgPool2d(2, 2)
        self.relu = nn.LeakyReLU(0.1)

        width = 64
        self.conv1 = MultiScaleAnalysis(3, width)
        self.conv2, self.conv3, self.conv4, self.conv5 = (
            MultiScaleAnalysis(width, width) for _ in range(4)
        )

        self.conv6 = nn.Conv2d(width, 1, 3)

    def forward(self, orig):
        features = orig
        # Two identical stages: block -> act -> block -> act -> pool.
        for first, second in ((self.conv1, self.conv2), (self.conv3, self.conv4)):
            features = self.relu(first(features))
            features = self.relu(second(features))
            features = self.pool(features)

        features = self.relu(self.conv5(features))
        return self.conv6(features)
    
# Module-level layer settings read by the network classes below.
# Padding mode handed to ResnetBlock.build_conv_block (reflect | replicate | zero).
padding_type = 'reflect'
# Normalisation layer constructor, called with the channel count.
norm_layer = nn.BatchNorm2d
# Whether ResnetBlock inserts a Dropout(0.5) after its first convolution.
use_dropout = False
# Whether the convolutions that read this flag carry a bias term.
use_bias = False
    
class ResnetBlock(nn.Module):
    """Residual block: two padded 3x3 convolutions plus an identity shortcut."""

    def __init__(self, dim):
        """Build the block for ``dim`` channels.

        The padding mode, normalisation layer, dropout switch and bias switch
        are taken from the module-level ``padding_type``, ``norm_layer``,
        ``use_dropout`` and ``use_bias`` settings.
        Original Resnet paper: https://arxiv.org/pdf/1512.03385.pdf
        """
        super(ResnetBlock, self).__init__()
        self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, use_dropout, use_bias)

    def build_conv_block(self, dim, padding_type, norm_layer, use_dropout, use_bias):
        """Assemble the residual branch.

        Parameters:
            dim (int)           -- the number of channels in the conv layers.
            padding_type (str)  -- the name of padding layer: reflect | replicate | zero
            norm_layer          -- normalization layer constructor
            use_dropout (bool)  -- if a Dropout(0.5) follows the first ReLU.
            use_bias (bool)     -- if the conv layers use bias or not

        Returns an nn.Sequential of
        [pad] conv norm ReLU [dropout] [pad] conv norm.

        Raises NotImplementedError for an unknown ``padding_type``.
        """
        def padded_conv():
            # Either an explicit padding layer with an unpadded conv, or
            # ('zero') no padding layer and a conv that zero-pads itself.
            if padding_type == 'reflect':
                prefix, conv_padding = [nn.ReflectionPad2d(1)], 0
            elif padding_type == 'replicate':
                prefix, conv_padding = [nn.ReplicationPad2d(1)], 0
            elif padding_type == 'zero':
                prefix, conv_padding = [], 1
            else:
                raise NotImplementedError('padding [%s] is not implemented' % padding_type)
            conv = nn.Conv2d(dim, dim, kernel_size=3, padding=conv_padding, bias=use_bias)
            return prefix + [conv, norm_layer(dim)]

        layers = padded_conv()
        layers.append(nn.ReLU(True))
        if use_dropout:
            layers.append(nn.Dropout(0.5))
        layers.extend(padded_conv())

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the input plus the residual branch output."""
        residual = self.conv_block(x)
        return x + residual
    
class LocalAttention(nn.Module):
    """Predict a 3-channel, quarter-resolution local attention map.

    A convolutional trunk (with two 2x2 max-pools and two ResnetBlocks)
    reduces an ``h x w`` RGB input to 128 channels at ``h/4 x w/4``; a fully
    connected head then produces sigmoid scores reshaped to
    ``(batch, 3, h/4, w/4)``.

    Parameters:
        h (int) -- input height the fully connected head is sized for.
        w (int) -- input width the fully connected head is sized for.
    """

    def __init__(self, h, w):
        super().__init__()

        def conv_unit(c_in, c_out, kernel):
            # "same"-padded conv -> norm -> activation
            return [
                nn.Conv2d(c_in, c_out, kernel, padding=kernel // 2, bias=use_bias),
                norm_layer(c_out),
                relu(),
            ]

        trunk = []
        trunk += conv_unit(3, 1, 31)
        trunk += conv_unit(1, 8, 5)
        trunk += conv_unit(8, 64, 3)
        trunk.append(pool())
        trunk += conv_unit(64, 128, 3)
        trunk.append(ResnetBlock(128))
        trunk.append(pool())
        trunk.append(ResnetBlock(128))
        self.conv = nn.Sequential(*trunk)

        # Two 2x2 pools leave a quarter of each spatial dimension.
        self.h = int(h / 4)
        self.w = int(w / 4)
        self.fc_size = 128 * self.h * self.w

        head = [
            nn.Linear(self.fc_size, 1024),
            relu(),
            nn.Linear(1024, 512),
            relu(),
            nn.Linear(512, 3 * self.h * self.w),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        flattened = self.conv(x).view(-1, self.fc_size)
        scores = self.fc(flattened)
        return scores.view(-1, 3, self.h, self.w)
       
    
def relu():
    """Return a new in-place LeakyReLU activation with negative slope 0.1."""
    activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
    return activation

def pool():
    """Return a new 2x2 max-pooling layer with stride 2."""
    downsample = nn.MaxPool2d(kernel_size=2, stride=2)
    return downsample

def global_attention():
    """Build the global scale classifier on top of a pretrained VGG16-BN.

    The convolutional feature extractor is frozen and the stock classifier is
    replaced by a small MLP that ends in a softmax over three scale classes.

    Returns:
        The modified ``vgg16_bn`` module; its output has shape ``(batch, 3)``
        and each row sums to 1.
    """
    vgg = vgg16_bn(pretrained=True)
    for param in vgg.features.parameters():
        # The attribute is `requires_grad`; the previous `require_grad` only
        # created an unused attribute and left the features trainable.
        param.requires_grad = False
    vgg.classifier = nn.Sequential(
        # 25088 is the input width of the replaced classifier head
        # (presumably VGG's flattened 512 x 7 x 7 features -- see torchvision docs).
        nn.Linear(25088, 512),
        relu(),
        nn.Linear(512, 256),
        relu(),
        nn.Linear(256, 3),
        # Explicit class axis: the implicit-dim form is deprecated and warns.
        nn.Softmax(dim=1),
    )
    return vgg

class SPN(nn.Module):
    """Three-column density estimator gated by global and local attention.

    Three convolutional columns with different kernel sizes each produce
    quarter-resolution features (8, 16 and 24 channels).  Column ``k`` is
    scaled by the ``k``-th global attention score and the ``k``-th local
    attention channel; the gated features are concatenated (48 channels) and
    decoded back to full resolution by the fusion network.

    The input size for the local attention branch is taken from the
    module-level ``train_set.size``.
    """

    def __init__(self):
        super().__init__()

        def conv_unit(c_in, c_out, kernel):
            # "same"-padded conv -> norm -> activation
            return [
                nn.Conv2d(c_in, c_out, kernel, padding=kernel // 2, bias=use_bias),
                norm_layer(c_out),
                relu(),
            ]

        def deconv_unit(c_in, c_out):
            # stride-2 transposed conv (doubles the resolution) -> norm -> activation
            return [
                nn.ConvTranspose2d(c_in, c_out, 3, stride=2, padding=1, output_padding=1, bias=use_bias),
                norm_layer(c_out),
                relu(),
            ]

        def column(first, second, third, fourth, fifth):
            # conv, pool, conv, conv, pool, conv, conv
            layers = conv_unit(*first)
            layers.append(pool())
            layers += conv_unit(*second) + conv_unit(*third)
            layers.append(pool())
            layers += conv_unit(*fourth) + conv_unit(*fifth)
            return nn.Sequential(*layers)

        side = train_set.size

        # Each spec is (in_channels, out_channels, kernel_size).
        self.cnn1 = column((3, 16, 11), (16, 24, 9), (24, 16, 7), (16, 16, 7), (16, 8, 5))
        self.cnn2 = column((3, 16, 9), (16, 24, 7), (24, 32, 5), (32, 32, 5), (32, 16, 3))
        self.cnn3 = column((3, 16, 7), (16, 24, 5), (24, 48, 3), (48, 48, 3), (48, 24, 3))

        self.global_attention = global_attention()
        self.local_attention = LocalAttention(side, side)

        self.fusion_network = nn.Sequential(
            *conv_unit(48, 128, 3),
            *conv_unit(128, 128, 3),
            *deconv_unit(128, 64),
            *deconv_unit(64, 16),
            nn.ReflectionPad2d(3),
            nn.Conv2d(16, 1, 7),
        )

    def forward(self, x):
        """Return ``(density, global_scores, local_scores)`` for a batch ``x``."""
        g = self.global_attention(x)
        # Reshape so that g_weights[:, k] broadcasts over channel and space.
        g_weights = g.view(-1, 3, 1, 1, 1)

        columns = (self.cnn1, self.cnn2, self.cnn3)
        features = [branch(x) for branch in columns]

        l = self.local_attention(x)

        gated = [
            g_weights[:, k] * l[:, k].unsqueeze(dim=1) * features[k]
            for k in range(3)
        ]

        fused = self.fusion_network(torch.cat(gated, dim=1))
        return fused, g, l

# Build the network, print its structure and move it to the target device.
net = SPN()
print(net)
net.to(device)

# Smoke test: push one training batch of images through the network.
net(next(train_loader.__iter__())[0].to(device))

# Loss terms: MSE on the density map plus cross-entropy on the global and
# local scale predictions; the two lambdas weight the cross-entropy terms
# in train_epoch.
density_criterion = nn.MSELoss()
global_scale_criterion = nn.CrossEntropyLoss()
local_scale_criterion = nn.CrossEntropyLoss()
gs_lambda = 0.05
ls_lambda = 0.05

# Alternative optimizers that were tried, kept for reference.
#optimizer = optim.Adam(net.parameters(), lr=0.0001, weight_decay=1e-6)
#optimizer = optim.RMSprop(net.parameters(), lr=0.001)
optimizer = optim.SGD(
    net.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

# Multiply the learning rate by 0.1 every 30 scheduler steps
# (train() steps the scheduler once per epoch).
lr_scheduler = optim.lr_scheduler.StepLR(
    optimizer, step_size=30, gamma=0.1)

def compute_accuracy(a, b):
    """Count the samples whose rounded total counts agree.

    Both tensors are flattened to one row per sample using
    ``train_set.density_size``; each row is summed, rounded and cast to int.
    Returns the number of rows where the two results are equal.
    """
    size = train_set.density_size
    cells = size*size
    counts_a = a.view(-1, cells).sum(1).round().int()
    counts_b = b.view(-1, cells).sum(1).round().int()
    matches = counts_a == counts_b
    return matches.sum().item()

def compute_err(a, b):
    """Sum the absolute differences between per-sample total counts.

    Both tensors are flattened to one row per sample using
    ``train_set.density_size`` and each row is summed; the absolute
    differences of those sums are added up over the batch.
    """
    size = train_set.density_size
    cells = size*size
    totals_a = a.view(-1, cells).sum(1)
    totals_b = b.view(-1, cells).sum(1)
    difference = totals_a - totals_b
    return difference.abs().sum().item()

def display_density(tensor, scale=1):
    """Show a ``(C, H, W)`` tensor as an image, resized by ``scale``.

    The tensor gets a temporary batch axis for ``F.interpolate``, is moved to
    the CPU, converted to a PIL image and displayed in the notebook.
    """
    batched = tensor.unsqueeze(dim=0)
    resized = F.interpolate(batched, scale_factor=(scale, scale))
    picture = transforms.functional.to_pil_image(resized.squeeze(dim=0).cpu())
    display(picture)

def display_pair(inputs, outputs, labels):
    """Show the first input image, then its label and prediction together.

    The first label and first output are concatenated along dim 1, halved
    and clamped to [0, 1] before being displayed.
    """
    with torch.no_grad():
        display_density(inputs[0], scale=1)
        stacked = torch.cat((labels[0], outputs[0]), dim=1)
        dimmed = (stacked * 0.5).clamp(0, 1)
        display_density(dimmed, scale=1)
        
def train_epoch(name, epoch, loader, val=False):
    """Run one pass over ``loader`` and print per-sample averaged metrics.

    Uses the module-level ``net``, ``optimizer``, loss criteria and lambdas.

    Parameters:
        name (str)   -- label used in the progress bar and the summary line.
        epoch (int)  -- epoch number, only used for the progress bar.
        loader       -- iterable of (inputs, density labels, global class,
                        local classes) batches.
        val (bool)   -- if True, only evaluate: no backward pass and no
                        optimizer step, and the network runs in eval mode.

    Prints the average loss terms, the fraction of samples whose rounded
    count matches the label (accuracy), and the mean absolute count error,
    then shows the first sample of the last batch.
    """
    # BatchNorm has to use (and update) batch statistics only while training;
    # validation must run with the accumulated running statistics.
    net.train(not val)

    running_loss = 0.0
    running_density_loss = 0.0
    running_gs_loss = 0.0
    running_ls_loss = 0.0
    running_error = 0.0
    accuracy = 0
    total = 0
    for inputs, labels, gs, ls in tqdm(loader, desc="{} - epoch {}".format(name, epoch)):
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        gs = gs.to(device, non_blocking=True)
        ls = ls.to(device, non_blocking=True)
        batch_len = len(labels)

        outputs, global_out, local_out = net(inputs)
        density_loss = density_criterion(outputs, labels)
        gs_loss = gs_lambda * global_scale_criterion(global_out, gs)
        ls_loss = ls_lambda * local_scale_criterion(local_out, ls)
        loss = density_loss + gs_loss + ls_loss
        if not val:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            running_error += compute_err(outputs, labels)
            # The criteria return batch means; weight them by the batch size
            # so that dividing by `total` below yields a per-sample average,
            # consistent with the error and accuracy terms.
            running_loss += loss.item() * batch_len
            running_density_loss += density_loss.item() * batch_len
            running_gs_loss += gs_loss.item() * batch_len
            running_ls_loss += ls_loss.item() * batch_len
            total += batch_len
            accuracy += compute_accuracy(outputs, labels)

    if total == 0:
        # Nothing was processed: avoid dividing by zero and touching the
        # loop variables, which were never bound.
        print('{} - epoch {}: loader produced no batches'.format(name, epoch))
        return

    print('{} - loss: {:.5f} (density: {:.5f}, gs: {:.5f}, ls: {:.5f}), accuracy: {:.3f}, error: {:.3f}'.format(
        name,
        running_loss / total, running_density_loss / total, running_gs_loss / total, 
        running_ls_loss / total, accuracy / total, running_error / total))
    print(gs[0], global_out[0])
    #print(ls[0]) #, local_out[0])
    display_pair(inputs, outputs, labels)

def train(epoch):
    """Run one training pass, step the LR scheduler, then validate.

    The validation pass is wrapped in ``torch.no_grad()`` so no autograd
    graph is built for it.
    """
    train_epoch(name='train', epoch=epoch, loader=train_loader, val=False)
    lr_scheduler.step()
    with torch.no_grad():
        train_epoch(name='val', epoch=epoch, loader=val_loader, val=True)

# Main training run: up to 10000 epochs of train + validation.
for epoch in range(10000): 
    train(epoch)

In [None]:
# Continue training with Adam by rebinding the module-level `optimizer`
# that train_epoch reads.
# NOTE(review): `lr_scheduler` was constructed with the earlier optimizer
# object, so the `lr_scheduler.step()` call inside train() does not appear to
# affect this Adam optimizer's learning rate -- confirm that is intended.
optimizer = optim.Adam(net.parameters(), lr=0.0001)
for epoch in range(200): 
    train(epoch)

In [None]:
# Inspect the global crowd-size percentile edges computed in IOGDataset.__init__.
train_set.bins