In [1]:
from importlib import reload

In [2]:
import dataloader
reload(dataloader)
from dataloader import VideoClassificationDataset
import argparse

In [3]:
opt = {
    'feats_dir': "/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024",
    'max_frames': 50
}

In [4]:
train_dataset = VideoClassificationDataset(opt, 'train')

load feats from /home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024
Pre-cache 309 features in memory.
Finished initializing dataloader.


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0")

In [6]:
from torch.utils.data import DataLoader

In [7]:
train_loader = DataLoader(train_dataset,
                         batch_size=8,
                         num_workers=4,
                         shuffle=True)

In [11]:
class NeXtVLAD(nn.Module):
    """NeXtVLAD layer implementation"""

    def __init__(self, dim=1024, num_clusters=64, lamb=2, groups=8, max_frames=300):
        super(NeXtVLAD, self).__init__()
        self.num_clusters = num_clusters
        self.dim = dim
        self.alpha = 0
        self.K = num_clusters
        self.G = groups
        self.group_size = int((lamb*dim) // self.G)
        # expansion FC
        self.fc0 = nn.Linear(dim, lamb*dim)
        # soft assignment FC (the cluster weights)
        self.fc_gk = nn.Linear(lamb*dim, self.G * self.K)
        # attention over groups FC
        self.fc_g = nn.Linear(lamb*dim, self.G)
        self.cluster_weights2 = nn.Parameter(torch.rand(1, self.group_size, self.K))
        
        self.bn0 = nn.BatchNorm1d(max_frames)
        self.bn1 = nn.BatchNorm1d(1)
        
        
    def forward(self, x, mask=None):
#         print(f"x: {x.shape}")
    
        _, M, N = x.shape
        # expansion FC: B x M x N -> B x M x λN
        x_dot = self.fc0(x) 
        
        # reshape into groups: B x M x λN -> B x M x G x (λN/G)
        x_tilde = x_dot.reshape(-1, M, self.G, self.group_size)
        
        # residuals across groups and clusters: B x M x λN -> B x M x (G*K) 
        WgkX = self.fc_gk(x_dot)
        WgkX = self.bn0(WgkX)
        
        # residuals reshape across clusters: B x M x (G*K) -> B x (M*G) x K
        WgkX = WgkX.reshape(-1, M*self.G, self.K)
        
        # softmax over assignment: B x (M*G) x K -> B x (M*G) x K
        alpha_gk = F.softmax(WgkX, dim=-1)
        
        # attention across groups: B x M x λN -> B x M x G
        alpha_g = torch.sigmoid(self.fc_g(x_dot))
        if mask is not None:
            alpha_g = torch.mul(alpha_g, mask.unsqueeze(2))
        
        # reshape across time: B x M x G -> B x (M*G) x 1
        alpha_g = alpha_g.reshape(-1, M*self.G, 1)
        
        # apply attention: B x (M*G) x K (X) B x (M*G) x 1 -> B x (M*G) x K
        activation = torch.mul(alpha_gk, alpha_g)
        
        # sum over time and group: B x (M*G) x K -> B x 1 x K
        a_sum = torch.sum(activation, -2, keepdim=True)
        
        # calculate group centers: B x 1 x K (X) 1 x (λN/G) x K -> B x (λN/G) x K
        a = torch.mul(a_sum, self.cluster_weights2)
        
        # permute: B x (M*G) x K -> B x K x (M*G)
        activation = activation.permute(0, 2, 1)
        
        # reshape: B x M x G x (λN/G) -> B x (M*G) x (λN/G)
        reshaped_x_tilde = x_tilde.reshape(-1, M * self.G, self.group_size)
        
        # cluster activation: B x K x (M*G) (X) B x (M*G) x (λN/G) -> B x K x (λN/G)
        vlad = torch.matmul(activation, reshaped_x_tilde)
        # print(f"vlad: {vlad.shape}")
        
        # permute: B x K x (λN/G) (X) B x (λN/G) x K
        vlad = vlad.permute(0, 2, 1)
        # distance to centers: B x (λN/G) x K (-) B x (λN/G) x K
        vlad = torch.sub(vlad, a)
        # normalize: B x (λN/G) x K
        vlad = F.normalize(vlad, 1)
        # reshape: B x (λN/G) x K -> B x 1 x (K * (λN/G))
        vlad = vlad.reshape(-1, 1, self.K*self.group_size)
        vlad = self.bn1(vlad)
        # reshape:  B x 1 x (K * (λN/G)) -> B x (K * (λN/G)) 
        vlad = vlad.reshape(-1, self.K*self.group_size)
        
        return vlad

In [12]:
class NeXtVLADModel(nn.Module):
    def __init__(self, num_classes, num_clusters=64, dim=1024, lamb=2, hidden_size=1024, 
                 groups=8, max_frames=300, drop_rate=0.5, gating_reduction=8):
        super(NeXtVLADModel, self).__init__()
        self.drop_rate = drop_rate
        self.group_size = int((lamb*dim) // groups)
        self.fc0 = nn.Linear(num_clusters*self.group_size, hidden_size)
        self.bn0 = nn.BatchNorm1d(1)
        self.fc1 = nn.Linear(hidden_size, hidden_size // gating_reduction)
        self.bn1 = nn.BatchNorm1d(1)
        self.fc2 = nn.Linear(hidden_size // gating_reduction, hidden_size)
        self.logistic = nn.Linear(hidden_size, num_classes)
        
        self.video_nextvlad = NeXtVLAD(1024, max_frames=max_frames, lamb=lamb, 
                                       num_clusters=num_clusters, groups=groups)
        
    def forward(self, x, mask=None):
        # B x M x N -> B x (K * (λN/G)) 
        vlad = self.video_nextvlad(x, mask=mask)
        
        # B x (K * (λN/G)) 
        if self.drop_rate > 0.:
            vlad = F.dropout(vlad, p=self.drop_rate)
        
        # B x (K * (λN/G))  -> B x H0
        activation = self.fc0(vlad)
        activation = self.bn0(activation.unsqueeze(1)).squeeze()
        activation = F.relu(activation)
        # B x H0 -> B x Gr
        gates = self.fc1(activation)
        gates = self.bn1(gates.unsqueeze(1)).squeeze()
        # B x Gr -> B x H0
        gates = self.fc2(gates)
        gates = torch.sigmoid(gates)
        # B x H0
        activation = torch.mul(activation, gates)
        out = self.logistic(activation)
        out = torch.sigmoid(out)
        
        return out
            
        

In [22]:
model = NeXtVLADModel(train_dataset.num_classes, max_frames=opt['max_frames'])

In [23]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
exp_lr_schedulr = optim.lr_scheduler.StepLR(optimizer, step_size=25)

model.train()
model.to(device)

for epoch in range(5):
    for data in train_loader:
        fc_feats = data['fc_feats'].to(device)
        labels = data['ground_truth'].to(device)
        masks = data['mask'].to(device)

        out = model(fc_feats, mask=masks)
    #     print(f"out: {out.shape}")
    #     print(f"labels: {labels.shape}")
        loss = F.binary_cross_entropy(out, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch:\t{epoch},\tloss:{loss.cpu().data.numpy()}")

epoch:	0,	loss:0.71150803565979
epoch:	0,	loss:0.7146843075752258
epoch:	0,	loss:0.5954272150993347
epoch:	0,	loss:0.6422020196914673
epoch:	0,	loss:0.5928784012794495
epoch:	0,	loss:0.5122247338294983
epoch:	0,	loss:0.6339268088340759
epoch:	0,	loss:0.6461817026138306
epoch:	0,	loss:0.5485246777534485
epoch:	0,	loss:0.4685041010379791
epoch:	0,	loss:0.47569671273231506
epoch:	0,	loss:0.44309744238853455
epoch:	0,	loss:0.4427070617675781
epoch:	0,	loss:0.4889450967311859
epoch:	0,	loss:0.4677741527557373
epoch:	0,	loss:0.5337631106376648
epoch:	0,	loss:0.26898637413978577
epoch:	0,	loss:0.4407520294189453
epoch:	0,	loss:0.3478405773639679
epoch:	0,	loss:0.3842126131057739
epoch:	0,	loss:0.4324081242084503
epoch:	0,	loss:0.3600882589817047
epoch:	0,	loss:0.23256240785121918
epoch:	0,	loss:0.3121560513973236
epoch:	0,	loss:0.30288824439048767
epoch:	0,	loss:0.17022843658924103
epoch:	0,	loss:0.2929340898990631
epoch:	0,	loss:0.12832005321979523
epoch:	0,	loss:0.27824535965919495
epoch:	0

In [65]:
data = train_dataset.__getitem__(5)
fc_feats = data['fc_feats']
print(fc_feats.shape)

torch.Size([16, 1024])


In [36]:
import numpy as np
import metrics
reload(metrics)
from metrics import calculate_gap

In [34]:
opt = {
    'feats_dir': "/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/test_PCA-1024",
    'max_frames': 50
}
test_dataset = VideoClassificationDataset(opt, 'test')
test_loader = DataLoader(test_dataset,
                         batch_size=8,
                         num_workers=4,
                         shuffle=True)

load feats from /home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/test_PCA-1024
Pre-cache 75 features in memory.
Finished initializing dataloader.


In [43]:
preds = []
actuals = []

for data in test_loader:
    fc_feats = data['fc_feats'].to(device)
    labels = data['ground_truth']
    masks = data['mask'].to(device)

    out = model(fc_feats, mask=masks)
    out = out.cpu().data.numpy()
    labels = labels.cpu().data.numpy()
#     print(out.shape)
#     print(labels.shape)
    preds.extend(out)
    actuals.extend(labels)
    
print(f"GAP(20): {calculate_gap(np.asarray(preds), np.asarray(actuals), top_k=20)}")

GAP(20): 0.9933333333333333
