Try to keep the code here. And then export into a .py file.

Do not modify the .py file directly until this notebook is gone from every branch.

Remember to clear output

In [1]:
# needed for model

import torch
import torch.nn as nn

from torch.autograd import Variable
import torch.nn.functional as F

import numpy as np

import torchvision

In [2]:
from data_helper import UnlabeledDataset, LabeledDataset
from helper import collate_fn, draw_box

In [3]:
# yolo v3

# Utils and calculating loss

Transforming Coordinates

Define the given coordinates as world coordinates

Define our normalized image coordinates (image coordinates for short) as world coordinates re-expressed relative to the upper-left corner of the world frame: translate the origin to that corner, flip the y axis so it points down, and normalize by the map width and height.

Always facing right in world coordinates.

In [4]:
# World-map geometry; WIDTH and HEIGHT are the divisors used to normalize box coordinates.
BASE = 40
WIDTH = 2 * BASE
HEIGHT = 2 * BASE

NUM_CLASSES = 10

# Swap in the commented line to pick the GPU up automatically.
#cuda = torch.cuda.is_available()
cuda = False

device = 'cuda:0' if cuda else 'cpu'
# Tensor constructor matching the device flag above.
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
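## input is the list of per-sample target dicts, each holding a 'bounding_box' tensor and one 'category' entry per box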
# want output that is
# [batch_index, class_index, x_center, y_center, width, height, tx1, tx2, tx3, tx4, ty1, ty2, ty3, ty4]
# where the tXN ranges from -1 to 1 and is the location of that coordinate in terms of +- w/2 or h/2
def transform_target(in_target):
    """Flatten per-sample box annotations into a single YOLO-style target tensor.

    Parameters
    ----------
    in_target : sequence of dict
        One dict per sample of the batch. ``'bounding_box'`` is indexed as
        ``[box, axis, corner]`` (axis 0 = x, axis 1 = y, four corners) in world
        coordinates; ``'category'`` holds one class index per box.

    Returns
    -------
    FloatTensor of shape (total_boxes, 14)
        Columns are ``[batch_index, class_index, x_center, y_center, width,
        height, tx1, tx2, tx3, tx4, ty1, ty2, ty3, ty4]``. Center and size are
        divided by WIDTH / HEIGHT; each ``t*`` value is a corner's offset from
        the box center in units of half the box width / height, so it lies in
        [-1, 1]. An empty ``in_target`` yields a ``(0, 14)`` tensor.
    """
    out_target = []

    for tgt_index, sample_target in enumerate(in_target):
        bbox = sample_target['bounding_box'].to(device)
        # how many boxes this sample has
        nbox = bbox.shape[0]
        individual_target = FloatTensor(nbox, 14).fill_(0)

        # Move the origin to the upper-left corner of the map and flip y so
        # that it grows downwards (image convention).
        box_x = bbox[:, 0, :] + BASE
        box_y = BASE - bbox[:, 1, :]

        x_min = box_x.min(dim=1)[0]
        x_max = box_x.max(dim=1)[0]
        y_min = box_y.min(dim=1)[0]
        y_max = box_y.max(dim=1)[0]

        # Axis-aligned envelope of the four corners
        x_center = (x_min + x_max) / 2
        y_center = (y_min + y_max) / 2
        width = x_max - x_min
        height = y_max - y_min

        # Corner offsets relative to the envelope; already normalized.
        # NOTE: a box with zero width or height divides by zero here.
        tx = (box_x - x_center.view(-1, 1)) / (width.view(-1, 1) / 2)
        ty = (box_y - y_center.view(-1, 1)) / (height.view(-1, 1) / 2)

        # from which sample in the batch
        individual_target[:, 0] = tgt_index
        # class
        individual_target[:, 1] = torch.as_tensor(
            sample_target['category'],
            dtype=individual_target.dtype,
            device=individual_target.device,
        )[:nbox]

        individual_target[:, 2] = x_center / WIDTH
        individual_target[:, 3] = y_center / HEIGHT
        individual_target[:, 4] = width / WIDTH
        individual_target[:, 5] = height / HEIGHT

        individual_target[:, 6:10] = tx
        individual_target[:, 10:14] = ty

        out_target.append(individual_target)

    if not out_target:
        # torch.cat refuses an empty list; hand back an empty target instead
        return FloatTensor(0, 14)

    return torch.cat(out_target, dim = 0)

Load presaved model

In [5]:
# works by side effects
def load_pretask_weight_from_model(model, presaved_encoder):
    """Copy the pretask encoder's weights into ``model.encoder`` and freeze them.

    Works by side effect on ``model``; the same object is returned for
    convenience.
    """
    encoder = model.encoder
    pretrained_weights = presaved_encoder.state_dict()
    encoder.load_state_dict(pretrained_weights)

    # Exclude every encoder parameter from gradient computation
    for weight in encoder.parameters():
        weight.requires_grad = False

    return model

In [6]:
# use this if you want Initialize Our Model with encoder weights from an existing pretask encoder in memory
def initialize_model_for_training(presaved_encoder, num_classes=10, encoder_features=None,
                                  yolo_dim=416, rm_dim=800):
    """Build a KobeModel whose encoder starts from an in-memory pretask encoder.

    KobeModel has no default constructor arguments, so they are supplied here.
    The defaults mirror the ``KobeModel(10, 6, 416, 800)`` call used elsewhere
    in this notebook.

    Parameters
    ----------
    presaved_encoder : PreTaskEncoder
        Encoder whose weights are copied into the new model and frozen.
    num_classes, yolo_dim, rm_dim
        Forwarded unchanged to ``KobeModel``.
    encoder_features : int or None
        Number of encoder kernels. ``None`` (default) reuses
        ``presaved_encoder.n_features`` so the weight shapes match.

    Returns
    -------
    KobeModel with pretask encoder weights loaded and frozen.
    """
    if encoder_features is None:
        encoder_features = presaved_encoder.n_features

    model = KobeModel(num_classes, encoder_features, yolo_dim, rm_dim)
    load_pretask_weight_from_model(model, presaved_encoder)

    return model

In [7]:
# use this if you want Initialize Our Model with encoder weights from a file
def initialize_model_for_training_file(presaved_encoder_file, n_features=6):
    """Build a training model whose encoder weights come from a saved state dict.

    Parameters
    ----------
    presaved_encoder_file : path or file-like
        Location of a ``state_dict`` saved from a PreTaskEncoder.
    n_features : int
        Kernel count of the saved encoder. PreTaskEncoder requires it; the
        default of 6 matches the encoder size used when this notebook builds
        its KobeModel.

    Returns
    -------
    Whatever ``initialize_model_for_training`` returns for the loaded encoder.
    """
    presaved_encoder = PreTaskEncoder(n_features)

    # map_location='cpu' lets a checkpoint written on a GPU machine load on a
    # CPU-only one; load_state_dict copies the values into the module anyway.
    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    state_dict = torch.load(presaved_encoder_file, map_location='cpu')
    presaved_encoder.load_state_dict(state_dict)
    presaved_encoder.eval()

    return initialize_model_for_training(presaved_encoder)

Converting predictions to the format for competition

Helper functions to calculate bounding boxes and such

In [8]:
# taken from https://github.com/eriklindernoren/PyTorch-YOLOv3/blob/master/utils/utils.py


def xywh2xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to (x1, y1, x2, y2).

    The result has the same shape and type as ``x``; only the first four
    entries along the last dimension are written.
    """
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2

    corners = x.new(x.shape)
    corners[..., 0] = x[..., 0] - half_w  # left
    corners[..., 1] = x[..., 1] - half_h  # top
    corners[..., 2] = x[..., 0] + half_w  # right
    corners[..., 3] = x[..., 1] + half_h  # bottom
    return corners


def bbox_wh_iou(wh1, wh2):
    """IoU computed from sizes only, between one (w, h) pair and a batch of them.

    ``wh1`` is indexed as ``[w, h]``; ``wh2`` is transposed first, so a 2-D
    ``wh2`` is expected to have one ``(w, h)`` row per box.
    """
    sizes = wh2.t()
    other_w, other_h = sizes[0], sizes[1]
    ref_w, ref_h = wh1[0], wh1[1]

    overlap = torch.min(ref_w, other_w) * torch.min(ref_h, other_h)
    # the epsilon keeps the denominator non-zero for degenerate sizes
    combined = (ref_w * ref_h + 1e-16) + other_w * other_h - overlap
    return overlap / combined


def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two sets of bounding boxes.

    With ``x1y1x2y2=True`` the columns are corner coordinates; otherwise they
    are (center_x, center_y, width, height). Side lengths are measured as
    ``max - min + 1``.
    """
    if x1y1x2y2:
        # Boxes already come as corners
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    else:
        # Expand center/size into corners
        b1_x1 = box1[:, 0] - box1[:, 2] / 2
        b1_x2 = box1[:, 0] + box1[:, 2] / 2
        b1_y1 = box1[:, 1] - box1[:, 3] / 2
        b1_y2 = box1[:, 1] + box1[:, 3] / 2
        b2_x1 = box2[:, 0] - box2[:, 2] / 2
        b2_x2 = box2[:, 0] + box2[:, 2] / 2
        b2_y1 = box2[:, 1] - box2[:, 3] / 2
        b2_y2 = box2[:, 1] + box2[:, 3] / 2

    # Overlap rectangle; clamped so disjoint boxes contribute zero area
    overlap_w = torch.clamp(torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1) + 1, min=0)
    overlap_h = torch.clamp(torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1) + 1, min=0)
    inter_area = overlap_w * overlap_h

    area_1 = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    area_2 = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    return inter_area / (area_1 + area_2 - inter_area + 1e-16)


def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
    """
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.

    prediction: tensor indexed as [image, candidate, channel] with channels
        (cx, cy, w, h, x1, x2, x3, x4, y1, y2, y3, y4, object_conf, class scores...).
        The tensor is NOT modified; the box conversion happens on a copy.

    Returns a list with one entry per image: None when no candidate reaches
    'conf_thres', otherwise a tensor whose rows are
        (x1, y1, x2, y2, x1, x2, x3, x4, y1, y2, y3, y4, object_conf, class_score, class_pred)

        # where the x1, ..., x4 and y1, ... y4 are still from -1 to 1
        # the leading x1, y1, x2, y2 are box corners in the same units as the
        # incoming (cx, cy, w, h) and still need converting back by the caller
    """

    # Copy first: converting in place would silently rewrite the caller's
    # tensor from center/size form to corner form.
    prediction = prediction.clone()
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])
    output = [None for _ in range(len(prediction))]
    for image_i, image_pred in enumerate(prediction):
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 12] >= conf_thres]
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Object confidence times class confidence
        score = image_pred[:, 12] * image_pred[:, 13:].max(1)[0]
        # Sort by it, best first
        image_pred = image_pred[(-score).argsort()]
        class_confs, class_preds = image_pred[:, 13:].max(1, keepdim=True)
        detections = torch.cat((image_pred[:, :13], class_confs.float(), class_preds.float()), 1)
        # Perform non-maximum suppression
        keep_boxes = []
        while detections.size(0):
            large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
            label_match = detections[0, -1] == detections[:, -1]
            # Boxes with large IOUs and matching labels (includes the top box itself)
            invalid = large_overlap & label_match
            weights = detections[invalid, 12:13]
            # Merge overlapping bboxes, weighting each by its object confidence
            detections[0, :12] = (weights * detections[invalid, :12]).sum(0) / weights.sum()
            keep_boxes += [detections[0]]
            detections = detections[~invalid]
        if keep_boxes:
            output[image_i] = torch.stack(keep_boxes)

    return output


def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
    """Scatter ground-truth boxes onto the (batch, anchor, grid, grid) layout of the head.

    ``target`` rows are read as ``[batch_index, class_index, x_center,
    y_center, width, height, tx1..tx4, ty1..ty4]`` with center/size as
    fractions of the image. ``pred_boxes`` only supplies the batch, anchor and
    grid sizes and the device type; ``pred_cls`` is used for the class-hit mask.

    Returns the 17-tuple ``(class_mask, obj_mask, noobj_mask, tx, ty, tw, th,
    tx1, tx2, tx3, tx4, ty1, ty2, ty3, ty4, tcls, tconf)``.
    """
    on_gpu = pred_boxes.is_cuda
    BoolTensor = torch.cuda.BoolTensor if on_gpu else torch.BoolTensor
    FloatTensor = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor

    nB, nA, nG = pred_boxes.size(0), pred_boxes.size(1), pred_boxes.size(2)
    nC = pred_cls.size(-1)
    grid_shape = (nB, nA, nG, nG)

    def zero_grid():
        # one float value per (sample, anchor, row, column), all zero
        return FloatTensor(*grid_shape).fill_(0)

    # Output tensors
    obj_mask = BoolTensor(*grid_shape).fill_(0)
    noobj_mask = BoolTensor(*grid_shape).fill_(1)
    class_mask = zero_grid()

    tx, ty, tw, th = zero_grid(), zero_grid(), zero_grid(), zero_grid()
    tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0)

    ##### OUR ADDITION: eight extra targets (tx1..tx4, ty1..ty4) locating the
    ##### corners within the anchor box; range still to be determined
    corner_targets = [zero_grid() for _ in range(8)]

    # Express center/size in grid-cell units
    target_boxes = target[:, 2:6] * nG
    gxy = target_boxes[:, :2]
    gwh = target_boxes[:, 2:]

    # Pick, per target, the anchor whose shape overlaps best
    ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])
    best_ious, best_n = ious.max(0)

    # Separate target values
    b, target_labels = target[:, :2].long().t()
    gx, gy = gxy.t()
    gw, gh = gwh.t()
    gi, gj = gxy.long().t()

    # Responsible cell/anchor: object present, hence not "no object"
    obj_mask[b, best_n, gj, gi] = 1
    noobj_mask[b, best_n, gj, gi] = 0

    # Other anchors that overlap more than the ignore threshold are not
    # counted as "no object" either
    for i, anchor_ious in enumerate(ious.t()):
        noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0

    # Center offset inside the cell
    tx[b, best_n, gj, gi] = gx - gx.floor()
    ty[b, best_n, gj, gi] = gy - gy.floor()

    # Corner offsets live in target columns 6..13
    for offset, corner in enumerate(corner_targets):
        corner[b, best_n, gj, gi] = target[:, 6 + offset]

    # Width and height, as log-ratio to the chosen anchor
    tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
    th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)

    # One-hot encoding of label
    tcls[b, best_n, gj, gi, target_labels] = 1

    # 1 where the predicted class at the responsible cell matches the label.
    # (The IoU scoring of the upstream implementation is disabled here.)
    class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()

    tconf = obj_mask.float()
    return (class_mask, obj_mask, noobj_mask,
            tx, ty, tw, th,
            *corner_targets,
            tcls, tconf)

Losses

In [9]:
def RoadMapLoss(pred_rm, target_rm):
    """Binary cross-entropy (default reduction) between predicted and target road maps."""
    return nn.BCELoss()(pred_rm, target_rm)

In [10]:
def total_joint_loss(yolo_loss, rm_loss, lambd):
    """Combine both task losses; ``lambd`` weights the road-map term."""
    weighted_rm_loss = lambd * rm_loss
    return yolo_loss + weighted_rm_loss

Train Loop and test loops
Not necessarily using data loader.

Assuming targets are already pre-processed.

In [11]:
def train_yolo(data_loader, kobe_model, kobe_optimizer, lambd = 0.5):
    """Run one training epoch of the bounding-box (YOLO) task.

    Parameters
    ----------
    data_loader : iterable
        Yields ``(sample, target, road_image, extra)`` batches; ``sample`` is a
        sequence of tensors and ``target`` is whatever ``transform_target``
        accepts.
    kobe_model : callable
        Called as ``kobe_model(sample, yolo_target=target)`` and expected to
        return ``(detections, loss)``.
    kobe_optimizer : optimizer
        Stepped once per batch.
    lambd : float
        Weight reserved for the road-map loss. Currently unused because the
        road-map branch is not wired in, so ``road_image`` is ignored too.

    Returns
    -------
    float
        Sum of the per-batch YOLO losses (also printed). It should decrease
        over epochs.
    """
    kobe_model.train()
    train_loss = 0

    # Iterate the loader that was passed in, not a notebook-level global
    for sample, target, road_image, extra in data_loader:
        sample = torch.stack(sample).to(device)
        target = transform_target(target).to(device)

        kobe_optimizer.zero_grad()

        _, yolo_loss = kobe_model(sample, yolo_target = target)

        train_loss += yolo_loss.item()
        yolo_loss.backward()

        kobe_optimizer.step()

    # train_loss is already a plain Python number at this point
    print("TRAIN LOSS: {}".format(train_loss))
    return train_loss

# Model Part

Pre-defined anchors. Should honestly come from KMeans on detection boxes but let's see how this does before going complex

In [12]:
# Anchor priors as (width, height).
#
# Anchors are expressed in the pixel space of the 416x416 image that YOLO uses
# by default, while the map we are given is 80x80. Matching is done in the
# 416x416 space, so the sizes esteban gave were scaled by 5 (5.2 would be exact).
#
# NOTE(review): an earlier note said the code uses only the last 3 anchors;
# YoloDecoder in this notebook takes len(anchors), i.e. all six — confirm.
anchors = [
    (5, 5),
    (25, 12),
    (12, 25),
    (100, 25),
    (50, 12),
    (40, 60),
]

Our YoloLayer for task of object localization

Ignoring orientation

In [13]:
# Length of one flattened encoder output. 26718 = 6 * 61 * 73, i.e. it is
# consistent with n_features=6 and a 61x73 final feature map — TODO confirm
# against the camera image size.
ENCODER_HIDDEN = 26718


class PreTaskEncoder(nn.Module):
    """Two conv -> ReLU -> 2x2 max-pool stages followed by a flatten.

    ``n_features`` is the number of kernels used by both convolutions.
    """

    def __init__(self, n_features):
        super(PreTaskEncoder, self).__init__()
        # number of different kernels to use
        self.n_features = n_features
        self.conv1 = nn.Conv2d(3, n_features, kernel_size=5)
        self.conv2 = nn.Conv2d(n_features, n_features, kernel_size=5)

    def forward(self, x):
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), kernel_size=2)

        # flatten to rows of ENCODER_HIDDEN values
        return x.view(-1, ENCODER_HIDDEN)

In [2]:
class ReshapeLayer2d(nn.Module):
    """View each row of a batch as ``channels`` square maps of side ``dim``."""

    def __init__(self, channels, dim):
        super(ReshapeLayer2d, self).__init__()
        self.channels, self.dim = channels, dim

    def forward(self, x):
        batch_size = x.shape[0]
        side = self.dim
        return x.view(batch_size, self.channels, side, side)
    
class ReshapeLayer1d(nn.Module):
    """Flatten each sample of a batch into a row of ``features`` values."""

    def __init__(self, features):
        super(ReshapeLayer1d, self).__init__()
        self.features = features

    def forward(self, x):
        batch_size = x.shape[0]
        return x.view(batch_size, self.features)

class YoloDecoder(nn.Module):
    """YOLO-style detection head working from the dense encoder output.

    For every anchor and grid cell it predicts ``num_classes + 5 + 8``
    channels: box center and size (4), eight corner offsets (x1..x4, y1..y4),
    an objectness confidence and the class scores.
    """

    def __init__(self, anchors, num_classes, img_dim=416):

        super(YoloDecoder, self).__init__()

        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5

        self.obj_scale = 1
        # Weight of the "no object" confidence loss. Tuning notes:
        #   100 originally here
        #   5 gives about 60 objects detected per image
        #   20 gives about 10 per image
        #   10 gives about 45
        self.noobj_scale = 12

        self.img_dim = img_dim
        self.grid_size = 8

        # Takes the dense output from the encoder (or a shared decoder) and
        # maps it to one prediction vector per anchor and grid cell.
        head_width = self.num_anchors * (self.num_classes + 5 + 8) * self.grid_size * self.grid_size
        self.m = nn.Sequential(
            nn.Linear(6 * ENCODER_HIDDEN, 5 * 15 * 15),
            nn.ReLU(),
            ReshapeLayer2d(5, 15),
            nn.Conv2d(5, 5, kernel_size=3, stride = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride = 1),
            nn.Conv2d(5, 5, kernel_size=3, stride = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 1),
            ReshapeLayer1d(405),
            nn.Linear(405, head_width)
        )

        # `cuda` here is the notebook-level flag
        self.compute_grid_offsets(self.grid_size, cuda)

    def compute_grid_offsets(self, grid_size, cuda=True):
        """Cache the per-cell offsets, the stride and the anchors in grid units."""
        self.grid_size = grid_size
        g = grid_size
        FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
        # size of one grid cell in image units
        self.stride = self.img_dim / g

        # Column index of every cell, and its transpose for the row index
        cell_index = torch.arange(g).repeat(g, 1)
        self.grid_x = cell_index.view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = cell_index.t().view([1, 1, g, g]).type(FloatTensor)

        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets = None):
        """Return ``(output, loss)``.

        ``output`` is indexed as [sample, candidate, channel] with 12 box
        channels, one confidence and ``num_classes`` class scores per
        candidate. ``loss`` is 0 when ``targets`` is None.
        """
        x = self.m(x)
        # Tensor constructor matching the device of the activations
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

        num_samples = x.shape[0]
        channels = self.num_classes + 5 + 8

        prediction = (
            x.view(num_samples, self.num_anchors, channels, self.grid_size, self.grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )

        # Get outputs
        xc = torch.sigmoid(prediction[..., 0])  # Center x
        yc = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height

        # x1, x2, x3, x4, y1, y2, y3, y4 (channels 4..11), squashed to (-1, 1)
        corners = [torch.tanh(prediction[..., 4 + k]) for k in range(8)]

        pred_conf = torch.sigmoid(prediction[..., 12])  # Conf
        pred_cls = torch.sigmoid(prediction[..., 13:])  # Cls pred.

        # Add offset and scale with anchors; multiplying by stride converts
        # from grid to image coordinates (of the YOLO image of size img_dim)
        pred_boxes = FloatTensor(prediction[..., :12].shape)

        pred_boxes[..., 0] = (xc.data + self.grid_x) * self.stride
        pred_boxes[..., 1] = (yc.data + self.grid_y) * self.stride
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w * self.stride
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h * self.stride

        # corner offsets are passed through as-is
        for k, corner in enumerate(corners):
            pred_boxes[..., 4 + k] = corner

        # BOX VALUES ARE IN YOLO-IMAGE UNITS, DO NOT FORGET TO CONVERT BACK
        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 12),
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        if targets is None:
            return output, 0

        mse_loss = nn.MSELoss()
        bce_loss = nn.BCELoss()

        (class_mask, obj_mask, noobj_mask,
         tx, ty, tw, th,
         *corner_targets,
         tcls, tconf) = build_targets(
            pred_boxes=pred_boxes,
            pred_cls=pred_cls,
            target=targets,
            anchors=self.scaled_anchors,
            ignore_thres=self.ignore_thres,
        )

        # Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
        # Regression terms, summed in the order xc, yc, x1..x4, y1..y4, w, h
        regression_pairs = [(xc, tx), (yc, ty)]
        regression_pairs += list(zip(corners, corner_targets))
        regression_pairs += [(w, tw), (h, th)]
        loss_box = sum(mse_loss(pred[obj_mask], truth[obj_mask]) for pred, truth in regression_pairs)

        loss_conf_obj = bce_loss(pred_conf[obj_mask], tconf[obj_mask])
        loss_conf_noobj = bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
        loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
        loss_cls = bce_loss(pred_cls[obj_mask], tcls[obj_mask])

        total_loss = 10 * loss_box + loss_conf + 10 * loss_cls

        return output, total_loss


Our model that does both tasks

In [15]:
class KobeModel(nn.Module):
    """Joint model: shared image encoder plus a YOLO-style bounding-box decoder.

    The road-map decoder is not wired in yet; ``rm_dim`` and ``rm_target`` are
    accepted for that future branch and currently unused.
    """

    def __init__(self, num_classes, encoder_features, yolo_dim, rm_dim):
        super(KobeModel, self).__init__()

        # side length of the square image space the YOLO decoder predicts in
        self.yolo_dim = yolo_dim

        self.encoder = PreTaskEncoder(encoder_features)

        # `anchors` is the notebook-level anchor list
        self.yolo_decoder = YoloDecoder(anchors, num_classes, img_dim=yolo_dim)

    def encode(self, x):
        """Encode the six slices along dim 1 of ``x`` and concatenate them per sample."""
        return torch.cat([self.encoder(x[:, i, :]) for i in range(6)], dim = 1)

    def forward(self, x, yolo_target = None, rm_target = None ):
        """Return ``(boxes, yolo_loss)``; see ``get_bounding_boxes`` for the box format."""
        encoding = self.encode(x)

        output_1, yolo_loss = self.get_bounding_boxes(x, encoding = encoding, target = yolo_target)

        return output_1, yolo_loss

    # for easy use for competition
    # in competition, encoding is None
    def get_bounding_boxes(self, x, encoding = None, target = None):
        """Predict boxes and convert them back to world coordinates.

        Parameters
        ----------
        x : tensor
            Input batch; only used when ``encoding`` is None.
        encoding : tensor or None
            Pre-computed result of ``encode(x)``.
        target : tensor or None
            YOLO targets; when given, the decoder also returns a loss.

        Returns
        -------
        (tuple, loss)
            The tuple has one entry per sample: None when nothing survived
            non-maximum suppression, otherwise a tensor indexed as
            ``[box, axis, corner]`` (axis 0 = x, axis 1 = y, four corners) in
            world coordinates. ``loss`` is the decoder's loss (0 without target).
        """
        if encoding is None:
            encoding = self.encode(x)

        outputs, yolo_loss = self.yolo_decoder(encoding, targets=target)

        outputs = non_max_suppression(outputs)

        boxes = []

        for output in outputs:
            if output is None:
                boxes.append(None)
                continue

            # Envelope box: from corner form in YOLO-image units back to
            # center/size in map units.
            center_x = (output[:, 0] + output[:, 2]) / 2 / self.yolo_dim * WIDTH
            center_y = (output[:, 1] + output[:, 3]) / 2 / self.yolo_dim * HEIGHT
            # Take the difference BEFORE rescaling; without the parentheses
            # only the second corner would be rescaled.
            width = (output[:, 2] - output[:, 0]) / self.yolo_dim * WIDTH
            height = (output[:, 3] - output[:, 1]) / self.yolo_dim * HEIGHT

            # Columns 4..7 / 8..11 hold the corner offsets in [-1, 1], in units
            # of half the envelope width / height.
            corner_x = center_x.unsqueeze(1) + output[:, 4:8] * (width / 2).unsqueeze(1)
            corner_y = center_y.unsqueeze(1) + output[:, 8:12] * (height / 2).unsqueeze(1)

            # Undo the image transform: flip y back and shift the origin from
            # the upper-left corner to the map center.
            better_coordinates = torch.stack((corner_x - BASE, BASE - corner_y), dim=1)

            boxes.append(better_coordinates)
        return tuple(boxes), yolo_loss
            

In [16]:
# Adam hyper-parameters
lr = 0.0001
b1 = 0.9
b2 = 0.999

# KobeModel(num_classes, encoder_features, yolo_dim, rm_dim)
kobe_model = KobeModel(10, 6, 416, 800)
kobe_model.to(device)

kobe_optimizer = torch.optim.Adam(kobe_model.parameters(), lr=lr, betas=(b1, b2))

In [17]:
# number of passes over trainloader made by the epoch loop further down
n_epochs = 10

In [18]:
image_folder = 'data'
annotation_csv = 'data/annotation.csv'

transform = torchvision.transforms.ToTensor()

# Scenes 106..133 are the ones used as labeled training data here
labeled_scene_index = np.arange(106, 134)
labeled_trainset = LabeledDataset(image_folder=image_folder,
                                  annotation_file=annotation_csv,
                                  scene_index=labeled_scene_index,
                                  transform=transform,
                                  extra_info=True
                                 )
trainloader = torch.utils.data.DataLoader(labeled_trainset, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn)

# Smoke test: one forward/backward pass per batch, no optimizer step.
for sample, target, road_image, extra in trainloader:
    sample = torch.stack(sample).to(device)
    target = transform_target(target).to(device)
    road_image = torch.stack(road_image)

    output_yolo, yolo_loss = kobe_model(sample, yolo_target = target)

    # The entry for an image is None when no detection survives NMS, so guard
    # before asking for its shape.
    first_boxes = output_yolo[0]
    print(first_boxes.shape if first_boxes is not None else None)
    yolo_loss.backward()

In [None]:
# One train_yolo call per epoch; the last argument is the road-map loss weight.
for epoch in range(n_epochs):
    print("EPOCH: {}".format(epoch))
    train_yolo(trainloader, kobe_model, kobe_optimizer, lambd=10)

EPOCH: 0
DETECTIONS BEFORE NMS
torch.Size([796, 15])
DETECTIONS BEFORE NMS
torch.Size([796, 15])
PRINTING output yolo after nms
torch.Size([780, 2, 4])


# to debug architecture: push a random batch through the same reshape/conv/pool
# stack that YoloDecoder.m uses and check it ends with 405 features per sample
z = torch.rand(10 , 5 * 15 * 15)
z = ReshapeLayer2d(5, 15)(z)  # -> (10, 5, 15, 15)
z = nn.Conv2d(5, 5, kernel_size=3, stride = 1)(z)  # -> (10, 5, 13, 13)

z = nn.MaxPool2d(kernel_size=2, stride = 1)(z)  # -> (10, 5, 12, 12)
z = nn.Conv2d(5, 5, kernel_size=3, stride = 1)(z)  # -> (10, 5, 10, 10)

z = nn.MaxPool2d(kernel_size = 2, stride = 1)(z)  # -> (10, 5, 9, 9)
z = ReshapeLayer1d(405)(z)  # -> (10, 405), since 5 * 9 * 9 = 405