In [208]:
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import torch
from torch import nn
import math
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torchvision import datasets
import torchvision.transforms as transforms
from PIL import Image, ImageDraw, ImageFilter, ImageTransform
import random as rand
import os
import torch.nn.functional as F
from PIL import Image, ImageDraw
import numpy as np
import os
from copy import copy
import matplotlib
import cv2
from dataclasses import dataclass, field

In [3]:
# define darknet19 backbone architecture config
# old tuple format: (kernel_size, out_channels, stride, padding)
# tuple format: (in channels, out channels, kernel_size, stride, padding)
# smaller network, we are just doing 4 classes as a demo

architecture_config = [
    (3, 64, 7, 2, 3),
    "M",
    (64, 192, 3, 1, 1),
    "M",
    (192, 128, 1, 1, 0),
    (128, 256, 3, 1, 1),
    (256, 256, 1, 1, 0),
    (256, 512, 3, 1, 1),
    "M",
    (512, 256, 1, 1, 0),
    (256, 512, 3, 1, 1),
    "M",
    (512, 256, 1, 1, 0),
    (256, 512, 3, 1, 1),
    "M",
    (512, 256, 1, 1, 0),
    (256, 512, 3, 1, 1),
    "M",
    (512, 256, 1, 1, 0),
    (256, 512, 3, 1, 1),
    (512, 512, 1, 1, 0),
    (512, 1024, 3, 1, 1),
    "M",
]

In [230]:
# [x1,y1,w1,h1,C1,p11,p12,p13,p14]
# [x2,y2,w2,h2,C2,p21,p22,p23,p24]
# first list is wide bbox, objectness confidence and class probs
# second list is tall bbox, objectness confidence and class probs

@dataclass
class Bbox:
    x: float
    y: float
    w: float
    h: float
    C: float
    p1: float
    p2: float
    p3: float
    p4: float
    
    def cords(self):
        return [self.x,self.y,self.w,self.h]
    
    def probs(self):
        return [self.p1,self.p2,self.p3,self.p4]

@dataclass
class Label:
    wide: Bbox
    tall: Bbox
        
    def set_grid_index(self,i,j):
        # setter method for index in grid
        self.i = i
        self.j = j
        
    def best_bbox(self):
        # is the wide or tall 
        # bbox more confident?
        if wide.C > tall.C:
            return 'wide'
        else:
            return 'tall'
        
@dataclass
class Grid:
    S: int
    label_list: list[Label] # flat input label list.
    width: float
    height: float
        
    def vals_2_index(self, bbox):
        # avoid fencepost error
        j = math.ceil((bbox.x / self.width) * (self.S - 1))
        i = math.ceil((bbox.y / self.height) * (self.S - 1))
        return i,j
            
    def __post_init__(self):
        self.cell_width = int(self.width / self.S)
        self.cell_height = int(self.height / self.S)
        print(f"total dimensions: {self.width}x{self.height}")
        print("cell width: ", self.cell_width)
        print("cell height: ", self.cell_height)
        # make every bbox empty to begin with then replace with 
        # labels in each correct cell
        self.cells = [[Label(Bbox(0,0,0,0,0,0,0,0,0),Bbox(0,0,0,0,0,0,0,0,0)) 
                       for j in range(self.S)] for i in range(self.S)]
        print(len(self.cells),"x",len(self.cells[0]), " size grid")
        print("assigning labels to grid cells")
        # need to make sure x,y are converted to absolute pixel
        # value first though because yolo stored as fractions...
        for label in self.label_list:
            print(label)
            label.wide.x *= self.width
            label.tall.x *= self.width
            label.wide.y *= self.height
            label.tall.y *= self.height
            print(label)
            # assign grid cell index
            # based on more confident
            # out of wide or tall
            if label.best_bbox()=='wide':
                i,j = self.vals_2_index(label.wide)
                # now we have to recalculate x and y
                label.wide.x = label.wide.x - self.cell_width * (j-1)
                label.wide.y = label.wide.y - self.cell_height * (i-1)
            else:
                i,j = self.vals_2_index(label.tall)
                print(i,j)
                # now we have to recalculate x and y
                label.tall.x = label.tall.x - self.cell_width * (j-1)
                label.tall.y = label.tall.y - self.cell_height * (i-1)
            print("i,j: ",i,j)
            # put it in the grid
            self.cells[i][j] = label
            # set grid index
            label.set_grid_index(i,j)

In [231]:
width = 1000
w = 560
S = 10
print(math.ceil(S * w/width))

6


In [232]:
# friendship with classes ended. 🤝 dataclasses are my new best friend 
wide = Bbox(0.5,0.5,2.0,2.0,1.0,0.0,0.0,0.0,0.3)
tall = Bbox(0.2,0.3,2.0,2.3,0.5,0.0,0.0,0.0,0.3)

def vals_2_index(self, bbox):
    j = math.ceil((bbox.x / self.width) * self.S)
    i = math.ceil((bbox.y / self.height) * self.S)

# should be stored in 1,1 middle cell
l1 = Label(wide, tall)
g = Grid(3, [l1], 100, 100)
g.cells

total dimensions: 100x100
cell width:  33
cell height:  33
3 x 3  size grid
assigning labels to grid cells
Label(wide=Bbox(x=0.5, y=0.5, w=2.0, h=2.0, C=1.0, p1=0.0, p2=0.0, p3=0.0, p4=0.3), tall=Bbox(x=0.2, y=0.3, w=2.0, h=2.3, C=0.5, p1=0.0, p2=0.0, p3=0.0, p4=0.3))
Label(wide=Bbox(x=50.0, y=50.0, w=2.0, h=2.0, C=1.0, p1=0.0, p2=0.0, p3=0.0, p4=0.3), tall=Bbox(x=20.0, y=30.0, w=2.0, h=2.3, C=0.5, p1=0.0, p2=0.0, p3=0.0, p4=0.3))
i,j:  1 1


[[Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0), tall=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0)),
  Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0), tall=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0)),
  Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0), tall=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0))],
 [Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0), tall=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0)),
  Label(wide=Bbox(x=50.0, y=50.0, w=2.0, h=2.0, C=1.0, p1=0.0, p2=0.0, p3=0.0, p4=0.3), tall=Bbox(x=20.0, y=30.0, w=2.0, h=2.3, C=0.5, p1=0.0, p2=0.0, p3=0.0, p4=0.3)),
  Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0), tall=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0))],
 [Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0), tall=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, p3=0, p4=0)),
  Label(wide=Bbox(x=0, y=0, w=0, h=0, C=0, p1=0, p2=0, 

In [233]:
#utility functions for creating labels
PATH='/Users/ben/Downloads/archive'

def wide_or_tall(bbox):
    # wide or tall bbox
    x1 = bbox[0]
    y1 = bbox[1]
    x2 = bbox[2]
    y2 = bbox[3]
    if abs(x1-x2)>abs(y1-y2):
        return 'wide'
    else:
        return 'tall'

def probs(classname):
    #return C,p1,p2,p3,p4 subarr given object exists
    if classname=='buffalo':
        return [1,1,0,0,0]
    elif classname=='elephant':
        return [1,0,1,0,0]
    elif classname=='rhino':
        return [1,0,0,1,0]
    elif classname=="zebra":
        return [1,0,0,0,1]
    else:
        raise ValueError("Class name invalid: ",classname)

# now we make our labels in [x1,y1,w1,h1,C1,p11,p12,p13,p14,x2,y2,w2,h2,C2,p21,p22,p23,p24] format
# say first bbox prediction wide, second bbox prediction tall

def getLabel(label, classnames):
    # input: label (readlines output)
    # return Label dataclass
    label =label.replace('/n','').split(' ')
    label = [float(l) for l in label]
    bbox = label[1:]
    bbox_type = wide_or_tall(bbox)
    if bbox_type=='wide':
        result_left = np.concatenate([bbox,probs(classnames)])
        result_right = np.zeros([9,])
    elif bbox_type=='tall':
        result_left = np.zeros([9,])
        result_right = np.concatenate([bbox,probs(classnames)])
    wide = Bbox(*result_left)
    tall = Bbox(*result_right)
    l = Label(wide, tall)
    return l

def make_labels(path):
    #read dataset labels into dictionary
    labels={}
    os.chdir(path)
    animals=['buffalo','elephant','rhino','zebra']
    for i,animal in enumerate(animals):
        for file in os.listdir(f'{path}/{animal}'):
            if '.txt' in file:
                classname = i
                f = open(f'{animal}/{file}')
                label = f.readlines()
                result = [getLabel(l,animal) for l in label]
                labels[f'{animal}/{file}']=result
    return labels

def draw_bbox(img, label, height, width):
    # TODO: different color for each class
    bboxw = label.wide
    bboxt = label.tall
    # take the most confident box
    if(bboxw.C > bboxt.C):
        x=bboxw.x
        y=bboxw.y
        w=bboxw.w
        h=bboxw.h
    else:
        x=bboxt.x
        y=bboxt.y
        w=bboxt.w
        h=bboxt.h
    # draw bboxes
    l = int((x - w / 2) * width)
    r = int((x + w / 2) * width)
    t = int((y - h / 2) * height)
    b = int((y + h / 2) * height)
    #set limits for bbox to image edges
    if l < 0:
        l = 0
    if r > width - 1:
        r = width - 1
    if t < 0:
        t = 0
    if b > height - 1:
        b = height - 1
    #draw with openCV
    cv2.rectangle(img, (l, t), (r, b), (0, 0, 255), 6)

labels = make_labels(PATH)
print(labels)

{'buffalo/289.txt': [Label(wide=Bbox(x=0.535156, y=0.496622, w=0.726563, h=0.633784, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))], 'buffalo/262.txt': [Label(wide=Bbox(x=0.503125, y=0.515541, w=0.979688, h=0.95, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))], 'buffalo/276.txt': [Label(wide=Bbox(x=0.498437, y=0.502027, w=0.989062, h=0.987838, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))], 'buffalo/060.txt': [Label(wide=Bbox(x=0.45671, y=0.603333, w=0.904762, h=0.78, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))], 'buffalo/074.txt': [Label(wide=Bbox(x=0.8, y=0.527205, w=0.4, h=0.78424, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0)), 

In [200]:
# let's make our own torch dataset type...
# as we have custom labels
# inherits torch.utils.data.Dataset
# first we need to define our grid

class YOLODataset(Dataset):
    """YOLO dataset."""

    def __init__(self, split_size, num_boxes=2, num_classes=4, path=PATH):
        """
        Args:
            path: path to the dataset root folder, 
            default set by global var
        """
        self.path = path
        self.labels = make_labels(self.path)
        # grid dimension
        self.S = split_size
        # number of Bboxes predicted for each grid cell
        self.B = num_boxes
        # number of classes possible
        self.num_classes=4

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        fname = list(self.labels.keys())[idx]
        labels = self.labels[fname]
        img_name=f"{self.path}/{fname}"
        img_name=img_name.replace('.txt','.jpg')
        with Image.open(img_name) as im:
            im = np.asarray(im)
            shape = im.shape
            #gotta reshape h,w,c -> c,h,w for torch
            im = im.reshape([3,shape[0],shape[1]])
            sample = {'image': np.asarray(im), 
                      'labels': labels, 
                      'grid': Grid(self.S, labels, im.shape[1], im.shape[2])}
        return sample
    
    def __showbbox__(self,idx):
        # return image with drawn bbox
        if torch.is_tensor(idx):
            idx = idx.tolist()
        res = self.__getitem__(idx)
        img = res['image']
        shape = img.shape
        img = img.reshape([shape[1],shape[2],3])
        # Create figure and axes
        fig, ax = plt.subplots()
        print("image shape: ",img.shape)
        height = img.shape[0]
        width = img.shape[1]
        labels = res['labels']
        # draw all bboxes
        for label in labels:
            draw_bbox(img, label, height, width)
        # Display the image
        plt.imshow(img)

In [235]:
# make dataset, load an image, show bbox
ds = YOLODataset(10)
a=ds.__getitem__(2)
ds.__showbbox__(2)

total dimensions: 100x143
cell width:  10
cell height:  14
10 x 10  size grid
assigning labels to grid cells
Label(wide=Bbox(x=0.498437, y=0.502027, w=0.989062, h=0.987838, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))
Label(wide=Bbox(x=49.8437, y=71.789861, w=0.989062, h=0.987838, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))
i,j:  5 5
total dimensions: 100x143
cell width:  10
cell height:  14
10 x 10  size grid
assigning labels to grid cells
Label(wide=Bbox(x=9.843699999999998, y=15.789861000000002, w=0.989062, h=0.987838, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))
Label(wide=Bbox(x=984.3699999999999, y=2257.950123, w=0.989062, h=0.987838, C=1.0, p1=1.0, p2=0.0, p3=0.0, p4=0.0), tall=Bbox(x=0.0, y=0.0, w=0.0, h=0.0, C=0.0, p1=0.0, p2=0.0, p3=0.0, p4=0.0))
i,j:  143 89


IndexError: list index out of range

In [None]:
# need to write custom collate function to prevent 
# torch stacking different size input images
# for now just use batch_size=1
train_dataloader = DataLoader(ds, batch_size=1)

In [117]:
# Get gpu device for training if available otherwise use cpu
device = "mps" #if using mac
print(f"Using {device} device")

# darknet conv function adapted from Darknet53 implementation
# https://github.com/developer0hye/PyTorch-Darknet53/blob/master/model.py

def dark_conv(c_in, c_out, kernel, stride, padding):
    """
    Construct a convolutional block function 
    given the input dimensions
    c_in: channels in
    c_out: channels out
    kernel: filter / convolution size (is square)
    stride: convolution stride 
    padding: convolution padding 

    Based on DarkNet19 configuration
    """
    return nn.Sequential(
            nn.Conv2d(c_in, c_out, kernel, stride, padding, bias=False),
            nn.BatchNorm2d(c_out),
            nn.LeakyReLU(0.1))
    
def fully_connected(split_size, num_boxes, num_classes=4):
    """
    Create our "fully connected layers"
    that make our activations
    """
    S,B,C = split_size, num_boxes, num_classes
    layers = nn.Sequential(
        nn.Flatten(),
        nn.Linear(1024, 496),
        nn.Dropout(0.0),
        nn.LeakyReLU(0.1),
        nn.Linear(496, S * S * (C + B * 5)),
    )
    return layers

# Our CNN model
class YOLOv1(nn.Module):

    def __init__(self, config, split_size, num_boxes, num_classes=4):
        super(YOLOv1, self).__init__()
        self.split_size = split_size
        self.num_boxes = num_boxes
        self.num_classes = num_classes
        self.darknet = []
        for l in config:
            if l=="M":
                self.darknet.append(nn.MaxPool2d(kernel_size=(2,2),stride=(2,2)))
            elif type(l)==tuple:
                convlay = dark_conv(l[0],l[1],l[2],l[3],l[4])
                self.darknet.append(convlay)
        # turn this list of layers into a sequential NN
        self.darknet = nn.Sequential(*self.darknet)
        # define our sequential nn
        self.fc = fully_connected(split_size, num_boxes, num_classes)

    def forward(self, x):
        x = self.darknet(x)
        output = self.fc(x)
        output = torch.flatten(output, start_dim=1)
        return output

#declare CNN model instance
model = YOLOv1(architecture_config, 2, 2).to(device)
print("Neural Network PyTorch Architecture:")
print(model)

#define loss function and optimizer
#MSE loss for now until we define actual YOLO loss
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

# let's pass some random data through as a test
# 10 instances of 500x448 RBG images
data = torch.rand(10,3,500,448).to(device)
out = model(data)
S=2
C=4
B=2
# can reshape into label format
print(out.reshape(10,S,S,C+B*5).shape)

Using mps device
Neural Network PyTorch Architecture:
YOLOv1(
  (darknet): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.1)
    )
    (1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (2): Sequential(
      (0): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.1)
    )
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.1)