In [1]:
import torch
import torchvision
import argparse

In [2]:
import arrayfire as af
import array

In [33]:
def toArrayFire(x):
    """Copy a PyTorch tensor into a newly constructed ArrayFire array.

    The tensor is detached from autograd, moved to host memory, made
    contiguous and exposed as a NumPy array; ``af.Array`` is then built from
    the address of that NumPy buffer.

    The dimensions handed to ArrayFire are the tensor's shape in reverse
    order (presumably because ArrayFire stores data column-major while the
    contiguous torch buffer is row-major -- confirm against the consumer of
    the saved file). A 0-dimensional tensor is given the dimensions ``(1,)``.

    Args:
        x: tensor to convert. It may live on any device; it is copied to the
            host first.

    Returns:
        The ``af.Array`` created from the tensor's data, using the NumPy
        dtype character of the tensor as the ArrayFire dtype code.
    """
    # `.cpu()` is a no-op for host tensors; without it `.numpy()` raises for
    # tensors that are not in host memory.
    x_np = x.detach().cpu().contiguous().numpy()
    # A scalar has an empty shape, which cannot be reversed into usable dims.
    shape = x_np.shape[::-1] if x_np.ndim > 0 else (1,)
    return af.Array(x_np.ctypes.data, shape, x_np.dtype.char)

def saveStateDict(model, filepath):
    """Append every non-scalar parameter of ``model`` to an ArrayFire file.

    Parameters are visited in ``model.named_parameters()`` order and written
    with ``af.array.save_array(..., filepath, True)``, i.e. always in append
    mode, so this function never passes ``append=False`` itself.

    * Parameters whose name contains ``'in_proj'`` (packed attention
      projections) are split into three equal chunks along dim 0 (q, k, v).
      The chunks of one attention module are collected and written in sorted
      key order, which yields: q weight, q bias, k weight, k bias, v weight,
      v bias. Keys are ``'0q_' | '1k_' | '2v_'`` + ``'0'`` (weight) or ``'1'``
      (bias) + the parameter name. Chunks whose key contains ``'weight'`` are
      transposed.
    * Any other parameter with at least one dimension is written under its
      own name. It is transposed exactly once when its name contains
      ``'weight'`` and any of ``'fc'``, ``'proj'`` or ``'linear'``.
    * 0-dimensional parameters are skipped.

    For each array written, the key, a running index and the tensor shape are
    printed, followed by the value returned by ``save_array``.

    Args:
        model: object exposing ``named_parameters()`` yielding
            ``(name, tensor)`` pairs.
        filepath: path of the ArrayFire array file to append to.
    """
    # Name fragments marking weights that are transposed before being written.
    transposed_tags = ('fc', 'proj', 'linear')
    pending = {}          # q/k/v chunks of the attention module being collected
    pending_owner = None  # text preceding 'in_proj' in the names held in `pending`
    index = 0             # running count of arrays written (log output only)

    def _save(key, tensor, transpose):
        # Convert one tensor, optionally transpose it, append it under `key`.
        nonlocal index
        af_array = toArrayFire(tensor)
        if transpose:
            af_array = af.array.transpose(af_array)
        print(key, index, tensor.shape)
        print(af.array.save_array(key, af_array, filepath, True))
        index += 1

    def _flush():
        # Write the collected chunks in sorted key order and reset the buffer.
        for key in sorted(pending):
            _save(key, pending[key], 'weight' in key)
        pending.clear()

    for name, param in model.named_parameters():
        owner = name.split('in_proj')[0] if 'in_proj' in name else None
        # Chunks left over from a module without `in_proj_bias` must be written
        # before moving on; otherwise they would be dropped or mixed into the
        # sorted flush of a later attention module.
        if pending and owner != pending_owner:
            _flush()

        if owner is not None:
            pending_owner = owner
            is_bias = 'in_proj_bias' in name
            hack = '1' if is_bias else '0'
            q, k, v = param.chunk(3, dim=0)
            pending['0q_' + hack + name] = q
            pending['1k_' + hack + name] = k
            pending['2v_' + hack + name] = v
            if is_bias:
                # The bias completes the module: emit all six arrays now.
                _flush()
        elif len(param.size()) > 0:
            # A single combined test: a name matching several fragments must
            # still be transposed only once.
            is_linear_weight = 'weight' in name and any(
                tag in name for tag in transposed_tags)
            _save(name, param, is_linear_weight)

    # Handles a trailing attention module that has no `in_proj_bias`.
    _flush()
    
def create_parser():
    """Build the argument parser holding every optimisation, model, loss and dataset option.

    The parser is created with ``add_help=False`` (no ``-h/--help`` option is
    registered) and with ``'Set transformer detector'`` as its program name.

    Returns:
        argparse.ArgumentParser: parser with all options registered in a fixed order.
    """
    cli = argparse.ArgumentParser(prog='Set transformer detector', add_help=False)
    opt = cli.add_argument  # short alias; options are registered in declaration order

    # ---- Optimisation / schedule ----
    opt('--lr', type=float, default=1e-4)
    opt('--lr_backbone', type=float, default=1e-5)
    opt('--batch_size', type=int, default=2)
    opt('--weight_decay', type=float, default=1e-4)
    opt('--epochs', type=int, default=300)
    opt('--lr_drop', type=int, default=200)
    opt('--optimizer', type=str, default="adam")
    opt('--clip_max_norm', type=float, default=0.1,
        help='gradient clipping max norm')
    opt('--eval_skip', type=int, default=1,
        help='do evaluation every "eval_skip" frames')
    opt('--schedule', type=str, default='step',
        choices=('step', 'multistep'))

    # ---- Model: pretrained weights ----
    opt('--frozen_weights', type=str, default=None,
        help="Path to the pretrained model. If set, only the mask head will be trained")

    # ---- Model: backbone ----
    opt('--backbone', type=str, default='resnet50',
        help="Name of the convolutional backbone to use")
    opt('--dilation', action='store_true',
        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    opt('--position_embedding', type=str, default='sine',
        choices=('sine', 'learned'),
        help="Type of positional embedding to use on top of the image features")

    # ---- Model: transformer ----
    opt('--enc_layers', type=int, default=6,
        help="Number of encoding layers in the transformer")
    opt('--dec_layers', type=int, default=6,
        help="Number of decoding layers in the transformer")
    opt('--dim_feedforward', type=int, default=2048,
        help="Intermediate size of the feedforward layers in the transformer blocks")
    opt('--hidden_dim', type=int, default=256,
        help="Size of the embeddings (dimension of the transformer)")
    opt('--dropout', type=float, default=0.1,
        help="Dropout applied in the transformer")
    opt('--nheads', type=int, default=8,
        help="Number of attention heads inside the transformer's attentions")
    opt('--num_queries', type=int, default=100,
        help="Number of query slots")
    opt('--pre_norm', action='store_true')
    # Negative flag: stored under `pass_pos_and_query`, which is True unless the flag is given.
    opt('--no_pass_pos_and_query', action='store_false', dest='pass_pos_and_query',
        help="Disables passing the positional encodings to each attention layers")

    # ---- Model: segmentation ----
    opt('--mask_model', type=str, default='none',
        choices=("none", "smallconv", "v2"),
        help="Segmentation head to be used (if None, segmentation will not be trained)")

    # ---- Loss ----
    # Negative flag: stored under `aux_loss`, which is True unless the flag is given.
    opt('--no_aux_loss', action='store_false', dest='aux_loss',
        help="Disables auxiliary decoding losses (loss at each layer)")
    opt('--set_loss', type=str, default='hungarian',
        choices=('sequential', 'hungarian', 'lexicographical'),
        help="Type of matching to perform in the loss")
    opt('--bcl', action='store_true', dest='use_bcl',
        help="Use balanced classification loss")

    # ---- Loss: matcher costs ----
    opt('--set_cost_class', type=float, default=1,
        help="Class coefficient in the matching cost")
    opt('--set_cost_bbox', type=float, default=5,
        help="L1 box coefficient in the matching cost")
    opt('--set_cost_giou', type=float, default=2,
        help="giou box coefficient in the matching cost")

    # ---- Loss: coefficients ----
    opt('--mask_loss_coef', type=float, default=1)
    opt('--dice_loss_coef', type=float, default=1)
    opt('--bbox_loss_coef', type=float, default=5)
    opt('--giou_loss_coef', type=float, default=2)
    opt('--eos_coef', type=float, default=0.1,
        help="Relative classification weight of the no-object class")

    # ---- Dataset ----
    opt('--dataset_file', default='coco')
    opt('--coco_path', type=str, default='/datasets01/COCO/022719')
    opt('--coco_panoptic_path', type=str, default='/datasets01/COCO/060419')
    opt('--remove_difficult', action='store_true')
    opt('--masks', action='store_true')

    # ---- Run control ----
    opt('--output-dir', default='',
        help='path where to save, empty for no saving')
    opt('--device', default='cuda',
        help='device to use for training / testing')
    opt('--seed', type=int, default=42)
    opt('--resume', default='', help='resume from checkpoint')
    opt('--start-epoch', type=int, default=0, metavar='N',
        help='start epoch')
    opt('--eval', action='store_true')
    opt('--num_workers', type=int, default=2)

    # ---- Distributed training ----
    opt('--world-size', type=int, default=1,
        help='number of distributed processes')
    opt('--dist-url', default='env://',
        help='url used to set up distributed training')

    return cli

In [27]:
# `build_transformer` is otherwise only brought into scope by a wildcard import in a
# later cell, so on a fresh kernel this cell would fail with a NameError. Import it here.
from models.transformer import build_transformer

# Build a small transformer: every option keeps its default except the embedding
# size (8) and dropout (0.0), which are overridden on the simulated command line.
parser = create_parser()
args = parser.parse_args(["--hidden_dim=8", "--dropout=0.0"])
model = build_transformer(args)

In [34]:
from models.transformer import *
from models.backbone import *

# ArrayFire array file that receives the inputs, the model parameters and the output.
filepath = '/private/home/padentomasello/scratch/pytorch_testing/transformer.array'

# Dimensions of the toy inputs.
N, C, H, W = 2, 8, 3, 3
embedding_size, tgt_len = 8, 10

# Random inputs, drawn in the order queries -> src -> pos; the mask is all zeros.
queries = torch.rand(tgt_len, embedding_size)
src = torch.rand(N, C, H, W)
pos = torch.rand(N, C, H, W)
mask = torch.zeros(N, H, W)
#mask[0, :20, :20] = 1
##mask[1, :4, :10] = 1

# The first save passes append=False; every later save passes append=True.
for _position, (_key, _tensor) in enumerate(
        [('src', src), ('queries', queries), ('mask', mask), ('pos', pos)]):
    af.array.save_array(_key, toArrayFire(_tensor), filepath, _position > 0)

# Run the model in eval mode, then append its parameters followed by the output.
model.eval()
output = model(src, mask.to(bool), queries, pos)[0]
saveStateDict(model, filepath)
af.array.save_array('output', toArrayFire(output), filepath, True)

0q_0encoder.layers.0.self_attn.in_proj_weight 0 torch.Size([8, 8])
4
0q_1encoder.layers.0.self_attn.in_proj_bias 1 torch.Size([8])
5
1k_0encoder.layers.0.self_attn.in_proj_weight 2 torch.Size([8, 8])
6
1k_1encoder.layers.0.self_attn.in_proj_bias 3 torch.Size([8])
7
2v_0encoder.layers.0.self_attn.in_proj_weight 4 torch.Size([8, 8])
8
2v_1encoder.layers.0.self_attn.in_proj_bias 5 torch.Size([8])
9
encoder.layers.0.self_attn.out_proj.weight 6 torch.Size([8, 8])
10
encoder.layers.0.self_attn.out_proj.bias 7 torch.Size([8])
11
encoder.layers.0.linear1.weight 8 torch.Size([2048, 8])
12
encoder.layers.0.linear1.bias 9 torch.Size([2048])
13
encoder.layers.0.linear2.weight 10 torch.Size([8, 2048])
14
encoder.layers.0.linear2.bias 11 torch.Size([8])
15
encoder.layers.0.norm1.weight 12 torch.Size([8])
16
encoder.layers.0.norm1.bias 13 torch.Size([8])
17
encoder.layers.0.norm2.weight 14 torch.Size([8])
18
encoder.layers.0.norm2.bias 15 torch.Size([8])
19
0q_0encoder.layers.1.self_attn.in_proj_weig

1k_0decoder.layers.4.multihead_attn.in_proj_weight 210 torch.Size([8, 8])
214
1k_1decoder.layers.4.multihead_attn.in_proj_bias 211 torch.Size([8])
215
2v_0decoder.layers.4.multihead_attn.in_proj_weight 212 torch.Size([8, 8])
216
2v_1decoder.layers.4.multihead_attn.in_proj_bias 213 torch.Size([8])
217
decoder.layers.4.multihead_attn.out_proj.weight 214 torch.Size([8, 8])
218
decoder.layers.4.multihead_attn.out_proj.bias 215 torch.Size([8])
219
decoder.layers.4.linear1.weight 216 torch.Size([2048, 8])
220
decoder.layers.4.linear1.bias 217 torch.Size([2048])
221
decoder.layers.4.linear2.weight 218 torch.Size([8, 2048])
222
decoder.layers.4.linear2.bias 219 torch.Size([8])
223
decoder.layers.4.norm1.weight 220 torch.Size([8])
224
decoder.layers.4.norm1.bias 221 torch.Size([8])
225
decoder.layers.4.norm2.weight 222 torch.Size([8])
226
decoder.layers.4.norm2.bias 223 torch.Size([8])
227
decoder.layers.4.norm3.weight 224 torch.Size([8])
228
decoder.layers.4.norm3.bias 225 torch.Size([8])
229


258

In [36]:
# Display `output`, the first element of the value returned by the model call above.
output

tensor([[[[-1.1497e+00, -1.8422e+00,  5.3584e-01, -4.2837e-01,  8.7465e-01,
            2.3533e-01,  1.3450e+00,  4.2949e-01],
          [-1.1422e+00, -1.8410e+00,  5.3973e-01, -4.3984e-01,  8.7021e-01,
            2.3584e-01,  1.3518e+00,  4.2544e-01],
          [-1.1505e+00, -1.8423e+00,  5.3595e-01, -4.2602e-01,  8.7637e-01,
            2.3130e-01,  1.3438e+00,  4.3137e-01],
          [-1.1422e+00, -1.8419e+00,  5.3946e-01, -4.3884e-01,  8.7005e-01,
            2.3922e-01,  1.3514e+00,  4.2271e-01],
          [-1.1407e+00, -1.8411e+00,  5.4150e-01, -4.4315e-01,  8.6841e-01,
            2.3937e-01,  1.3524e+00,  4.2325e-01],
          [-1.1475e+00, -1.8424e+00,  5.3804e-01, -4.3271e-01,  8.7312e-01,
            2.3756e-01,  1.3453e+00,  4.2848e-01],
          [-1.1424e+00, -1.8434e+00,  5.3477e-01, -4.3430e-01,  8.7063e-01,
            2.3132e-01,  1.3502e+00,  4.3316e-01],
          [-1.1493e+00, -1.8426e+00,  5.3871e-01, -4.3077e-01,  8.7433e-01,
            2.3917e-01,  1.3429e+00

In [None]:
# List the keys of the model's state dict.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved session.
model.state_dict().keys()

In [45]:
# Call the model on a NestedTensor built from `x` and `mask`, rebinding `output`.
# NOTE(review): `x` is not assigned in any visible cell and NestedTensor is not imported by
# name (presumably it arrives via one of the wildcard imports) -- confirm before re-running.
output = model(NestedTensor(x, mask))

torch.Size([2, 7, 7])
torch.Size([2, 256, 7, 7])


In [48]:
# Number of entries in the first element of `output`.
len(output[0])

1

In [54]:
# Shape of the first entry of the second element of `output`.
output[1][0].shape

torch.Size([2, 256, 7, 7])

In [56]:
# Shape of the `.tensors` attribute of the first entry of the first element of `output`.
output[0][0].tensors.shape

torch.Size([2, 2048, 7, 7])