# Tensorflow Datasets used for training PyTorch model
## (COCO captions 2017, FasterRCNN)

In [1]:
#TODO: investigate issue with data_loader hanging when num_workers > 1 
    #ANSWER: image sizes too large 
    #TODO: resize augment
#TODO: download and load pretrained model locally


In [2]:
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import torch
from PIL import Image
import re
import tensorflow_datasets as tfds
import tensorflow as tf
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import utils
import custom_transforms as T
import engine
from engine import train_one_epoch, evaluate

#disable gpu for tensorflow dataset loading so torch can use gpu
tf.config.set_visible_devices([], 'GPU')
if not tf.config.experimental.list_logical_devices('GPU'):
    print('success')

# torch.cuda.empty_cache()

DATA_DIR='/home/evan/Datasets/tensorflow'
NUM_CLASSES = 80

success


2021-10-22 19:27:39.774376: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-22 19:27:39.774830: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-22 19:27:39.775070: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-22 19:27:39.775619: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

## Define dataset

In [8]:
class CocoCaptionsDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, transforms, split='train', download=False):
        dataset,info = tfds.load('coco_captions', with_info=True, split=split, data_dir=data_dir, download=download)
        self.dataset = iter(dataset)
        self.info = info
        self.split = split
        self.transforms = transforms
    
    #no idx used here, always return next in repeated dataset, previously shuffled batched prefetched
    def __getitem__(self, idx):
        sample = next(self.dataset)
        
        #convert from tf.EagerTensor to np.ndarray
        image = sample['image'].numpy()
        image = Image.fromarray(image)
        boxes = sample['objects']['bbox'].numpy()
        labels = sample['objects']['label'].numpy()

        #convert to torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)



        target = {}
        target['boxes'] = boxes
        target['labels'] = labels

        image, target = self.transforms(image, target)

        return image, target

    def __len__(self):
        return self.info.splits[self.split].num_examples

def get_transforms(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
        transforms.append(T.RandomPhotometricDistort())
    return T.Compose(transforms)


train = CocoCaptionsDataset(DATA_DIR, get_transforms(train=True), split='train', download=False)
print(train.__len__())
print(train.__getitem__(None)[0].size())
print(train.__getitem__(None)[1]['boxes'])


82783
torch.Size([3, 478, 640])
tensor([[0.2706, 0.2954, 0.3992, 0.4007],
        [0.4231, 0.7597, 0.5831, 0.8971],
        [0.3532, 0.3925, 0.4837, 0.4830],
        [0.5268, 0.6122, 0.6354, 0.7434],
        [0.5053, 0.5638, 0.5874, 0.6663],
        [0.4363, 0.3346, 0.5020, 0.4293],
        [0.4312, 0.6682, 0.5559, 0.8065],
        [0.2556, 0.1519, 0.3563, 0.2430],
        [0.3487, 0.2075, 0.4543, 0.3197],
        [0.4532, 0.4670, 0.5468, 0.5742]])


## Define model architecture

In [4]:
# load a pre-trained model for classification and return
# only the features
#*****************TODO: only download once, load locally*********************************************************
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# FasterRCNN needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280
# so we need to add it here
backbone.out_channels = 1280

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# let's define what are the feature maps that we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# if your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# OrderedDict[Tensor], and in featmap_names you can choose which
# feature maps to use.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)

# put the pieces together inside a FasterRCNN model
model = FasterRCNN(backbone,
                   num_classes=NUM_CLASSES,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

## Begin training

In [9]:
def run_training(model, num_epochs=10):
    device = torch.device('cuda')
    model.to(device)

    train= CocoCaptionsDataset(DATA_DIR, split='train', transforms = get_transforms(train=True))
    val = CocoCaptionsDataset(DATA_DIR, split='val' , transforms = get_transforms(train=False))

    data_loader_train = torch.utils.data.DataLoader(
        train, batch_size=1, shuffle=True, num_workers=0,
        collate_fn=utils.collate_fn)

    data_loader_val = torch.utils.data.DataLoader(
        val, batch_size=1, shuffle=False, num_workers=0,
        collate_fn=utils.collate_fn)


    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler
    warmup_factor = 1.0 / 1000
    warmup_iters = min(1000, len(data_loader_train) - 1)

    lr_scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer, start_factor=warmup_factor, total_iters=warmup_iters
    )

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, lr_scheduler, data_loader_train, device, epoch, 10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the val dataset
        evaluate(model, data_loader_val, device=device)

    print("Done training")

run_training(model, num_epochs=10)

Epoch: [0]  [    0/82783]  eta: 13:19:16  lr: 0.000010  loss: 5.1894 (5.1894)  loss_classifier: 4.4196 (4.4196)  loss_box_reg: 0.0027 (0.0027)  loss_objectness: 0.7037 (0.7037)  loss_rpn_box_reg: 0.0634 (0.0634)  time: 0.5793  data: 0.1506  max mem: 2196
Epoch: [0]  [   10/82783]  eta: 4:05:34  lr: 0.000060  loss: 5.1473 (5.1010)  loss_classifier: 4.3695 (4.3229)  loss_box_reg: 0.0001 (0.0004)  loss_objectness: 0.7070 (0.7117)  loss_rpn_box_reg: 0.0634 (0.0659)  time: 0.1780  data: 0.0304  max mem: 3002
Epoch: [0]  [   20/82783]  eta: 3:34:38  lr: 0.000110  loss: 4.7586 (4.6322)  loss_classifier: 3.9911 (3.8637)  loss_box_reg: 0.0001 (0.0012)  loss_objectness: 0.7029 (0.6991)  loss_rpn_box_reg: 0.0622 (0.0682)  time: 0.1344  data: 0.0173  max mem: 3002
Epoch: [0]  [   30/82783]  eta: 3:21:31  lr: 0.000160  loss: 3.0407 (3.7273)  loss_classifier: 2.3631 (2.9935)  loss_box_reg: 0.0003 (0.0013)  loss_objectness: 0.6505 (0.6735)  loss_rpn_box_reg: 0.0562 (0.0590)  time: 0.1286  data: 0.013