**Resources:**

Explanation on the resnet architecture: input_size/output_size/kernel/stride at each layer:
https://medium.com/@14prakash/understanding-and-implementing-architectures-of-resnet-and-resnext-for-state-of-the-art-image-cf51669e1624

Resnet50 Architecture:
https://www.kaggle.com/keras/resnet50

Simple way of unpacking resnetX for stripping out FC layers and such:
https://discuss.pytorch.org/t/resnet-pretrained-model-with-last-fc-layer-stripped-does-not-work/17951

Reason as to why we want to resize each image and their labels to 224 x 224:
https://stackoverflow.com/questions/43922308/what-input-image-size-is-correct-for-the-version-of-resnet-v2-in-tensorflow-slim

How to modify the FC layer of resnet:
https://discuss.pytorch.org/t/how-to-modify-the-final-fc-layer-based-on-the-torch-model/766/3

How to partially freeze resnet34:
https://medium.com/@14prakash/almost-any-image-classification-problem-using-pytorch-i-am-in-love-with-pytorch-26c7aa979ec4

**Data preprocessing requirement:**

In order to define the heatmap loss as torch.nn.functional.cross_entropy(input, target, weight=None, size_average=True, ignore_index=-100, reduce=True):

We need to have the target/label take on the form of (N, J, H, W)

Each j in J represents a joint

**Important:** The image is of size (N, 3, H, W)

## Load Dependencies

In [2]:
%matplotlib notebook

import math
import torch
import torch.nn as nn
import numpy as np
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, sampler
from torchvision import transforms, utils
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from sklearn import metrics


from libs.data_utils import HandDataset, ToTensor, Scale, GestureDataset
from libs.layer_utils import flatten, random_weight, zero_weight
from libs.model_utils import (show_joints, makePosList, makeHeatMapOneHot,
                              makeMaps, generate_blw, ComputeLoss, get_loss,
                              load_model, save_model)
from libs.model import model, modelHeatmap, modelLocmap
from libs.lit_data import data2d, data3d, data3d2
from libs.misc import write_log
                              
plt.ion() # interactive mode

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2


## Define Parameters

In [None]:
# Global configuration for the notebook. Everything the helper functions in
# `libs` need is collected in the `params` dictionary at the end of this cell.

# Use CUDA only when it was requested AND is actually available; otherwise
# fall back to CPU and record that the GPU is not in use.
USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    USE_GPU = False
print('Using device:', device)

batch_size = 4
num_joints = 21
image_size = 224
dtype = torch.float32
g_heatmap_size = 9

# b_idx: every batch index repeated num_joints times -> [0, 0, ..., 1, 1, ...]
b_idx = torch.from_numpy(np.repeat(np.arange(batch_size), num_joints)).long()
# j_idx: the joint indices 0..num_joints-1 tiled batch_size times
#        -> [0, 1, ..., num_joints-1, 0, 1, ...]
j_idx = torch.from_numpy(np.tile(np.arange(num_joints), batch_size)).long()

# Every entry is taken from the variable defined above so the dictionary can
# never drift from the notebook-level values (g_heatmap_size used to be a
# second, independent literal here).
params = {
    'batch_size': batch_size,
    'num_joints': num_joints,
    'image_size': image_size,
    'dtype': dtype,
    'device': device,
    'USE_GPU': USE_GPU,
    'g_heatmap_size': g_heatmap_size,
    'b_idx': b_idx,
    'j_idx': j_idx,
}

# NOTE(review): generate_blw comes from libs.model_utils; what `blw` holds is
# not visible in this notebook -- it is only forwarded to get_loss later.
blw = generate_blw(params)

## Load Dataset

In [None]:
# Preprocessing shared by every split: Scale(image_size, image_size) followed
# by ToTensor() (both come from libs.data_utils).
transform = transforms.Compose([Scale(image_size, image_size), ToTensor()])

# --- training split: indices [0, 85% of N) ---------------------------------
hand_train = HandDataset(
    'toy_dataset.csv', transform=transform, train=True, device=device
)
N = len(hand_train)
loader_train = DataLoader(
    hand_train,
    batch_size=batch_size,
    sampler=sampler.SubsetRandomSampler(range(0, int(N * 0.85))),
    drop_last=True,
)

# --- validation split: indices [85% of N, 90% of N) ------------------------
# NOTE(review): unlike hand_train, this dataset (and hand_test below) is built
# without device=device -- confirm against HandDataset that this is intended.
hand_val = HandDataset('toy_dataset.csv', transform=transform, train=True)
loader_val = DataLoader(
    hand_val,
    batch_size=batch_size,
    sampler=sampler.SubsetRandomSampler(range(int(N * 0.85), int(N * 0.9))),
    drop_last=True,
)

# --- test split: indices [90% of N, N) -------------------------------------
# The index range is derived from len(hand_train), not len(hand_test).
hand_test = HandDataset('toy_dataset.csv', transform=transform, train=False)
loader_test = DataLoader(
    hand_test,
    batch_size=batch_size,
    sampler=sampler.SubsetRandomSampler(range(int(N * 0.9), N)),
    drop_last=True,
)


## Train model

### Define additional params

In [None]:
# Cadence, in training iterations, of validation/logging and checkpointing.
save_every = 1200
print_every = 600

# Destination files for the training and validation logs.
vallog_fp = 'vallog_fp.txt'
trainlog_fp = 'trainlog.txt'

# One Adam optimizer over every parameter of the three sub-networks that
# still requires a gradient (frozen parameters are left out).
optimizer = torch.optim.Adam(
    [
        p
        for net in (model, modelHeatmap, modelLocmap)
        for p in net.parameters()
        if p.requires_grad
    ],
    lr=1.0e-3,
)



### Training!

In [None]:
# Joint-prediction training: 50 passes over loader_train. Every `print_every`
# iterations the full validation loader is evaluated and logged; every
# `save_every` iterations a checkpoint is written.
for epoch in range(50):
    for idx, batch in enumerate(loader_train):
        image = batch['image'].float()
        pos2d_list = batch['pos_2d']
        pos3d_list = batch['pos_3d']

        # The validation branch below switches the networks to eval mode, so
        # training mode is restored on ALL three sub-networks before each
        # step (previously only `model` was switched).
        model.train()
        modelHeatmap.train()
        modelLocmap.train()

        loss, loss_detailed = get_loss(model, modelHeatmap, modelLocmap,
                                       image, pos2d_list, pos3d_list, blw, params)
        loss.backward()
        optimizer.step()
        # Clearing after the step leaves zeroed gradients for the next
        # backward() call.
        optimizer.zero_grad()

        write_log(trainlog_fp, epoch, idx, loss, loss_detailed, 'train')

        if idx % print_every == 0:
            # Put every sub-network in inference mode for validation.
            model.eval()
            modelHeatmap.eval()
            modelLocmap.eval()

            val_loss = []
            val_loss_det = []
            with torch.no_grad():
                for vbatch in loader_val:
                    vloss, vloss_detailed = get_loss(
                        model, modelHeatmap, modelLocmap,
                        vbatch['image'].float(), vbatch['pos_2d'],
                        vbatch['pos_3d'], blw, params)

                    # .item() turns the scalar loss tensor into a Python
                    # float, so np.mean below works on CPU and CUDA alike.
                    val_loss.append(vloss.item())
                    # NOTE(review): assumes get_loss returns the detailed
                    # losses in a form np.array can stack -- confirm.
                    val_loss_det.append(vloss_detailed)

            val_loss = np.mean(val_loss)
            val_loss_det = np.mean(np.array(val_loss_det), axis=0)
            write_log(vallog_fp, epoch, idx, val_loss, val_loss_det, 'val')

        if idx % save_every == 0:
            save_model(epoch, idx, model, modelHeatmap, modelLocmap, optimizer)
            
                    

In [None]:
def makePosList(h_pred, l_pred):
    """Extract per-joint 2D and 3D positions for a single sample.

    The 2D position of a joint is the pixel where its heatmap is maximal;
    the 3D position is read from the three location maps at that pixel.

    Args:
        h_pred: Heatmap tensor whose last two dimensions are (H, W), with
            one map per joint, e.g. shape (J, H, W).
        l_pred: Location-map tensor; l_pred[0], l_pred[1] and l_pred[2] are
            each indexed as (H, W) maps.

    Returns:
        (p2d, p3d): p2d is an integer array of shape (J, 2) holding (x, y)
        pixel coordinates; p3d is an array of shape (J, 3) holding the
        values of the three location maps at those pixels.
    """
    # Take the spatial size from the tensor itself rather than from notebook
    # globals, so the function works for any heatmap resolution.
    height, width = h_pred.shape[-2:]

    flat_peak = torch.argmax(h_pred.reshape(-1, height * width), dim=1)
    # detach().cpu() makes the NumPy conversion valid for CUDA tensors and
    # for tensors that are part of an autograd graph.
    p2d_y, p2d_x = np.unravel_index(flat_peak.detach().cpu().numpy(),
                                    (height, width))
    p2d = np.stack((p2d_x, p2d_y), axis=-1)

    loc_maps = l_pred.detach().cpu().numpy()
    # Sample each of the three location maps at every joint's peak pixel.
    p3d = np.stack([loc_maps[axis][p2d_y, p2d_x] for axis in range(3)],
                   axis=-1)

    return p2d, p3d
#b_idx = torch.from_numpy(np.repeat(np.arange(batch_size), num_joints)).long()
# NumPy array of sample indices, one entry per (sample, joint) pair: each value
# in 0..batch_size-1 is repeated num_joints times -> [0, 0, ..., 1, 1, ...].
mplb_idx = np.repeat(np.arange(batch_size), num_joints)
def makePosListBatch(h_pred, l_pred):
    """Extract per-joint 2D and 3D positions for a whole batch.

    For every sample and joint, the 2D position is the pixel where the
    joint's heatmap is maximal; the 3D position is read from the sample's
    three location maps at that pixel.

    Args:
        h_pred: Heatmap tensor of shape (N, J, H, W).
        l_pred: Location-map tensor of shape (N, 3, H, W).

    Returns:
        (p2d, p3d): p2d is an integer array of shape (N, J, 2) holding
        (x, y) pixel coordinates; p3d is an array of shape (N, J, 3) holding
        the values of the three location maps at those pixels.
    """
    # Sizes come from the tensor, not from notebook globals, so a final
    # batch that is smaller than batch_size is handled as well.
    n_samples, n_joints = h_pred.shape[:2]
    height, width = h_pred.shape[-2:]

    # Flat (row-major) index of each joint's heatmap maximum, shape (N, J).
    # detach().cpu() keeps the NumPy conversion valid for CUDA tensors.
    flat_peak = torch.argmax(h_pred.reshape(n_samples, n_joints, -1), dim=2)
    flat_peak = flat_peak.detach().cpu().numpy()

    p2d_y, p2d_x = np.unravel_index(flat_peak, (height, width))
    p2d = np.stack((p2d_x, p2d_y), axis=-1)

    # Flatten the spatial dimensions so the same flat index can be reused.
    loc_maps = l_pred.reshape(n_samples, 3, -1).detach().cpu().numpy()
    # Column vector of sample indices; broadcasts against flat_peak (N, J).
    sample_idx = np.arange(n_samples)[:, None]
    p3d = np.stack(
        [loc_maps[:, axis][sample_idx, flat_peak] for axis in range(3)],
        axis=-1,
    )

    return p2d, p3d

print("Evaluation...")

### IMPORTANT ###
# Every sub-network has to be in "eval" mode here: BatchNorm then stops
# computing new mean and variance statistics, and dropout no longer drops
# activations.
# Reference: https://discuss.pytorch.org/t/what-does-model-eval-do-for-batchnorm-layer/7146
# And: https://pytorch.org/docs/0.3.1/nn.html?highlight=eval#torch.nn.Module.eval
modelLocmap.eval()
modelHeatmap.eval()
model.eval()

# Running totals for the evaluation loss and the number of evaluated batches.
eval_loss, eval_iter = 0, 0
def eval_net():
    """Show predicted and ground-truth joints for one batch of loader_train.

    Only the first batch yielded by `loader_train` is processed: the three
    sub-networks are run on it, the joints of the batch's first sample are
    extracted with makePosList, and two figures are drawn with show_joints
    (prediction first, ground truth second). Nothing is returned.
    """
    # Pure inference: no autograd graph is needed for visualisation.
    with torch.no_grad():
        for batch in loader_train:
            image = batch['image'].float()

            y_pred = model(image)
            h_pred = modelHeatmap(y_pred)
            l_pred = modelLocmap(y_pred)

            # The VNect-style loss is currently disabled. Re-enabling it
            # requires the ground-truth maps again:
            #loc_map, heatmap, one_hot = makeMaps(batch['pos_2d'], batch['pos_3d'])
            #loss = computeLoss(heatmap, one_hot, loc_map, h_pred, l_pred)
            #eval_loss = eval_loss + loss

            # Channel-first tensor -> channel-last array for plotting;
            # detach().cpu() keeps this valid when running on CUDA.
            first_image = image[0].detach().cpu().numpy().transpose((1, 2, 0))

            p2d, p3d = makePosList(h_pred[0], l_pred[0])
            show_joints(first_image, p2d, p3d)
            # batch['pos_2d'] is (N, 21, 2) and batch['pos_3d'] is (N, 21, 3)
            # according to the original annotations.
            show_joints(first_image, batch['pos_2d'][0], batch['pos_3d'][0])

            # Only the first batch is visualised.
            break
#eval_net()
#eval_iter = eval_iter + 1
#print("Eval Loss: {}".format(eval_loss / eval_iter))

**Fully Connected Layer to Predict Gesture**

Input: The outputs of our joint prediction model:

p2d: # size (N, 21, 2)
p3d: # size (N, 21, 3)

Output:

y: # size (N, C=10), where C is the number of gesture classes

**Note:** We will use both the 2D positions and the 3D positions of the joints to figure out what the gesture is

In [None]:
# Gesture dataset: a single GestureDataset instance backs all three loaders;
# the splits differ only in the index range handed to the sampler.
g_dataset = GestureDataset('gesture_dataset.csv', transform=transform, train=True)

# The sample count is fixed at 40 here instead of len(g_dataset).
N = 40 #len(g_dataset)

# Training split: indices [0, 80% of N).
loader_g_train = DataLoader(
    g_dataset,
    batch_size=batch_size,
    sampler=sampler.SubsetRandomSampler(range(0, int(N * 0.8))),
)

# Validation split: indices [80% of N, 90% of N).
loader_g_val = DataLoader(
    g_dataset,
    batch_size=batch_size,
    sampler=sampler.SubsetRandomSampler(range(int(N * 0.8), int(N * 0.9))),
)

# Test split: indices [90% of N, N).
loader_g_test = DataLoader(
    g_dataset,
    batch_size=batch_size,
    sampler=sampler.SubsetRandomSampler(range(int(N * 0.9), N)),
)

# Checkpoint the joint-prediction networks and their optimizer; `epoch` and
# `idx` are whatever values an earlier cell left behind.
save_model(epoch, idx, model, modelHeatmap, modelLocmap, optimizer)

**Define the Model**

In [None]:
# Instance normalisation over 5 channels, applied to the classifier input
# before it is flattened.
norm5d = nn.InstanceNorm1d(5)

# Gesture classifier head: 105 input features -> 50 hidden units -> 10 class
# probabilities (the final Softmax makes each output row sum to 1).
fc = nn.Sequential(
    nn.Linear(105, 50),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(50, 10),
    nn.Softmax(dim=1),
)

# predicted gesture probability distribution

**Define the FC loss function**

In [None]:
# Defines constants
# batch_idx: LongTensor [0, 1, ..., batch_size - 1], one row index per sample
# of a full batch.
batch_idx = torch.from_numpy(np.arange(batch_size)).long()
# epsilon: small positive constant (1e-8).
epsilon = 1e-8
def compute_g_loss(g_pred, g_GT, eps=1e-8):
    """Mean negative log-likelihood of the ground-truth gesture classes.

    Args:
        g_pred: Tensor of shape (N, C) holding predicted class
            probabilities (the classifier already ends in a softmax).
        g_GT: Tensor with the N ground-truth class labels; it is cast to
            long, so integer-valued floats are accepted.
        eps: Small constant added before the logarithm so a probability of
            exactly 0 does not produce -inf. The default equals the
            notebook-level `epsilon`.

    Returns:
        Scalar tensor: the sum over the batch of -log(p_true + eps) divided
        by the number of samples in the batch.
    """
    # Column vector of label indices on the same device as the predictions.
    labels = g_GT.long().to(g_pred.device).view(-1, 1)
    # Probability assigned to the correct class of every sample, shape (N,).
    # gather() works for any batch length, including a smaller final batch.
    true_class_prob = g_pred.gather(1, labels).squeeze(1)
    # mean() divides by the actual number of samples in this batch.
    g_loss = (-1.0 * (true_class_prob + eps).log()).mean()
    return g_loss

**Freeze Joint Prediction Network and Define Optimizer**

In [None]:
# Freeze the whole joint-prediction (JP) network: no parameter of the base
# model or of its two heads receives gradients any more.
for param in (list(model.parameters())
              + list(modelHeatmap.parameters())
              + list(modelLocmap.parameters())):
    param.requires_grad = False

# History of gesture losses, filled in by the training loop.
g_losses = []
# Only the fully connected gesture classifier is optimised.
g_optimizer = torch.optim.Adam(fc.parameters(), lr=1.0e-3)

**Load previously trained parameters**

In [None]:
# Restore the three joint-prediction sub-networks from the epoch-1 /
# iteration-4200 checkpoints. Tensors saved on 'cuda:0' are remapped to the
# CPU while loading.
model.load_state_dict(
    torch.load('model_param_e1_i4200.pt', map_location={'cuda:0': 'cpu'})
)
modelHeatmap.load_state_dict(
    torch.load('modelHeatmap_param_e1_i4200.pt', map_location={'cuda:0': 'cpu'})
)
modelLocmap.load_state_dict(
    torch.load('modelLocmap_param_e1_i4200.pt', map_location={'cuda:0': 'cpu'})
)

# Optional: resume the gesture classifier and its optimizer from disk.
#fc.load_state_dict(torch.load('fc_param.pt'))
#g_optimizer.load_state_dict(torch.load('g_optimizer_param.pt'))
#g_training_param = torch.load('g_training_param.pt')
#g_losses = g_training_param['g_losses']

**Training Loop**

In [None]:
# Gesture-classifier training: 20 epochs over loader_g_train. The
# joint-prediction (JP) network only provides joint positions; just the
# fully connected classifier `fc` is updated through g_optimizer.
for epoch in range(20):
    print("Epoch: {}".format(epoch))
    for idx, batch in enumerate(loader_g_train):
        image = batch['image'].float()

        # The joint positions are extracted through NumPy below, so no
        # gradient can flow back into the JP network; skipping the autograd
        # graph for it saves memory.
        with torch.no_grad():
            # base features (assumed (N, C, 224, 224) -- confirm in libs.model)
            y_pred = model(image)
            # one heatmap per joint (assumed (N, 21, 224, 224) -- confirm)
            h_pred = modelHeatmap(y_pred)
            # x, y, z location maps for all joints
            # (assumed (N, 3, 224, 224) -- confirm)
            l_pred = modelLocmap(y_pred)

        p2d, p3d = makePosListBatch(h_pred, l_pred)

        ### Begin: Gesture recognition network
        # (N, 21, 2) -> (N, 2, 21) and (N, 21, 3) -> (N, 3, 21)
        p2d = torch.from_numpy(p2d).transpose(1, 2)
        p3d = torch.from_numpy(p3d).transpose(1, 2)

        # put 2D and 3D joint positions together -> (N, 5, 21)
        fc_in = torch.cat((p2d.float(), p3d), dim=1)

        # normalise each of 2dx, 2dy, 3dx, 3dy, 3dz over the 21 joints
        n_fc_in = norm5d(fc_in)

        # Flatten to (N, 105) using the actual batch length, so a final
        # batch smaller than batch_size does not break the reshape.
        n_fc_in = n_fc_in.reshape(n_fc_in.size(0), -1)

        g_pred = fc(n_fc_in)  # shape (N, 10)
        ### End: Gesture recognition network

        g_loss = compute_g_loss(g_pred, batch['label'])

        print("G Loss: {}".format(g_loss))
        # Store a plain float: keeping the tensor would keep its whole
        # autograd graph alive and pickle tensors into the history file.
        g_losses.append(g_loss.item())

        g_loss.backward()
        g_optimizer.step()
        # Clear the gradients so the next backward() starts from zero.
        g_optimizer.zero_grad()

    # Checkpoint the classifier, its optimizer and the loss history after
    # every epoch.
    torch.save(fc.state_dict(), 'fc_param.pt')
    torch.save(g_optimizer.state_dict(), 'g_optimizer_param.pt')
    torch.save({'g_losses': g_losses, 'epoch': epoch + 1}, 'g_training_param.pt')

print("======Training Done======")

In [None]:
# Visual sanity check: run the joint-prediction network on the first batch
# of loader_g_train and plot the predicted joints of up to four samples.
for idx, batch in enumerate(loader_g_train):
    image = batch['image'].float()

    # Inference only -- no autograd graph is needed for plotting.
    with torch.no_grad():
        y_pred = model(image)
        # per-joint heatmaps
        h_pred = modelHeatmap(y_pred)
        # x, y, z location maps
        l_pred = modelLocmap(y_pred)

    # At most four figures, as before, but never more than the batch
    # actually contains (a short batch used to raise an IndexError).
    for sample in range(min(4, image.shape[0])):
        p2d, p3d = makePosList(h_pred[sample], l_pred[sample])
        # Channel-first tensor -> channel-last array for plotting;
        # detach().cpu() keeps this valid when running on CUDA.
        show_joints(image[sample].detach().cpu().numpy().transpose((1, 2, 0)),
                    p2d, p3d)

    # Only the first batch is visualised.
    break

**Convert PyTorch Model to Keras (Experimental and DOESN'T work)**

In [None]:
# Export the three PyTorch sub-networks to Keras .h5 files with pytorch2keras.
# The notebook marks this cell as experimental and not working.
# Reference: https://mil-tokyo.github.io/webdnn/docs/tutorial/pytorch.html
from pytorch2keras.converter import pytorch_to_keras

# Earlier WebDNN-based attempt, kept for reference:
# model = models.alexnet(pretrained=True)
# graph = PyTorchConverter().convert(model, dummy_input)
# exec_info = generate_descriptor("webgpu", graph)  # also "webassembly", "webgl", "fallback" are available.
# exec_info.save("./output")

# Random example inputs handed to the converter: one RGB image for the base
# model and one 128-channel tensor for the two heads.
# NOTE(review): a comment in the gesture training loop describes the base
# output as 64 x 224 x 224, while 128 channels are used here -- confirm the
# real channel count against libs.model.
dummy_input = torch.autograd.Variable(torch.randn(1, 3, 224, 224))
dummy_base_output = torch.autograd.Variable(torch.randn(1, 128, 224, 224))

# important to switch all models to eval mode
model.eval()
modelHeatmap.eval()
modelLocmap.eval()
 
# we should specify shape of the input tensor
# Export base model
k_model = pytorch_to_keras(model, dummy_input, (3, 224, 224,), verbose=True)
k_model.save('k_model.h5')

# Export 2D heatmap branch
k_modelHeatmap = pytorch_to_keras(modelHeatmap, dummy_base_output, (128, 224, 224,), verbose=True)
k_modelHeatmap.save('k_modelHeatmap.h5')

# Export 3D location branch
k_modelLocmap = pytorch_to_keras(modelLocmap, dummy_base_output, (128, 224, 224,), verbose=True)
k_modelLocmap.save('k_modelLocmap.h5')

In [6]:
# torchvision ResNet-50 built with default arguments; it is only used below
# as a size reference for the parameter counts.
resnet50 = models.resnet50()
def count_parameters(model):
    """Return the total number of elements in `model`'s trainable parameters.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Print "<name> <trainable parameter count>" for the reference ResNet-50 and
# for each of the three sub-networks.
print("{} {}".format("resnet50", count_parameters(resnet50)))
print("{} {}".format("model", count_parameters(model)))
print("{} {}".format("modelHeatmap", count_parameters(modelHeatmap)))
print("{} {}".format("modelLocmap", count_parameters(modelLocmap)))

resnet50 25557032
model 9553920
modelHeatmap 24213
modelLocmap 97923
