In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
import nb_002
from nb_002c import *

import operator
from random import sample
from torch.utils.data.sampler import Sampler

In [None]:
DATA_PATH = Path('data')
PATH = DATA_PATH/'caltech101' # http://www.vision.caltech.edu/Image_Datasets/Caltech101/

# Caltech 101

## Create validation set

The first step will be to create a dataset from our files. We need to set aside a share of the files to be used as our validation set. We will do this randomly, holding out a given fraction of the files, in this case 0.2 (roughly 20%).

In [None]:
#export
class FilesDataset(Dataset):
    """Dataset of image files and their class labels.

    `fns` are the image file names and `labels` the matching class names.
    Each label is stored in `self.y` as its integer index into `self.classes`.
    """
    def __init__(self, fns, labels, classes=None):
        # Sort the inferred classes: the iteration order of a set of strings can change
        # from one interpreter run to the next, which would silently change the
        # class -> index mapping between sessions.
        if classes is None: classes = sorted(set(labels))
        self.classes = classes
        self.class2idx = {v:k for k,v in enumerate(classes)}
        self.fns = np.array(fns)
        self.y = [self.class2idx[o] for o in labels]

    def __len__(self): return len(self.fns)

    def __getitem__(self,i):
        "Return the `(image, class index)` pair for item `i`; the image is opened on access."
        return open_image(self.fns[i]),self.y[i]

    @classmethod
    def from_folder(cls, folder, classes=None, test_pct=0.):
        """Build a dataset from `folder`, which holds one sub-folder per class.

        If `classes` is None, the classes are the names returned by `find_classes(folder)`.
        When `test_pct` is 0 a single dataset is returned. Otherwise a
        `(train, test)` pair is returned, where each file is independently assigned to
        the test set with probability `test_pct` (so the split sizes are approximate and
        depend on NumPy's global random state).
        """
        # `o` rather than `cls` as the loop variable, to avoid shadowing the classmethod's `cls`.
        if classes is None: classes = [o.name for o in find_classes(folder)]

        fns,labels = [],[]
        for cl in classes:
            fnames = get_image_files(folder/cl)
            fns += fnames
            labels += [cl] * len(fnames)

        if test_pct==0.: return cls(fns, labels, classes=classes)

        fns,labels = np.array(fns),np.array(labels)
        is_test = np.random.uniform(size=(len(fns),)) < test_pct
        # Pass `classes` explicitly so both splits share the same class -> index mapping.
        return (cls(fns[~is_test], labels[~is_test], classes=classes),
                cls(fns[is_test], labels[is_test], classes=classes))

In [None]:
# NOTE(review): this hand-picked list is not passed to `from_folder` below, and `classes`
# is reassigned from `train_ds.classes` a few lines later, so it currently has no effect.
# Confirm whether `classes=classes` was intended in the `from_folder` call.
classes = ["airplanes", "Motorbikes", "BACKGROUND_Google", "Faces", "watch", "Leopards", "bonsai",
    "car_side", "ketch", "chandelier", "hawksbill", "grand_piano", "brain", "butterfly", "helicopter", "menorah",
    "trilobite", "starfish", "kangaroo", "sunflower", "ewer", "buddha", "scorpion", "revolver", "laptop", "ibis", "llama",
    "minaret", "umbrella", "electric_guitar", "crab", "crayfish",]

# Seed NumPy's global RNG, which `from_folder` uses to draw the random train/validation split.
np.random.seed(42)
train_ds,valid_ds = FilesDataset.from_folder(PATH, test_pct=0.2)

# Sample image reused by the demo cells below.
x = train_ds[1114][0]
classes = train_ds.classes
c = len(classes)

len(train_ds),len(valid_ds),c

## Rectangular affine fix

In [None]:
show_image(x, figsize=(6,3), hide_axis=False)
print(x.shape)

In [None]:
rot_m = np.array(rotate(40.)); rot_m

In [None]:
show_image(apply_affine([rot_m])(x), figsize=(6,3))

In [None]:
#export
def affine_grid(x, matrix, size=None):
    """Build the sampling grid for applying the 3x3 affine `matrix` to image `x`.

    `x` is indexed as (channels, height, width). `size` is the (channels, height, width)
    of the output; it defaults to the shape of `x`. The off-diagonal terms of the matrix
    are rescaled by the image's aspect ratio so that rotations are not sheared on
    rectangular images. Returns the grid produced by `F.affine_grid` for a batch of one.
    """
    h,w = x.shape[1:]
    if size is None: size=x.shape
    # Work on a copy: scaling the entries in place would modify the caller's matrix,
    # so reusing the same matrix for a second call would apply the correction twice.
    matrix = matrix.clone()
    matrix[0,1] *= h/w; matrix[1,0] *= w/h
    # `F.affine_grid` expects a batch of 2x3 matrices and an (N, C, H, W) output size.
    return F.affine_grid(matrix[None,:2], torch.Size((1,)+tuple(size)))

nb_002.affine_grid = affine_grid

In [None]:
show_image(apply_affine([rot_m])(x), figsize=(6,3))

## Crop with padding

Now we are going to add padding or crop automatically according to a desired final size. The best way to do this is to integrate both transforms into the same function. 

We will do the padding necessary to achieve a _size x size_ (square) image. If _size_ is greater than either the height or width dimension of our image, we know we will need to add padding. If _size_ is smaller than either _height_ or _width_ dimension of our image, we will have to crop. We might have to do one, the other, both or neither. In this example we are only adding padding since both our _height_ and _width_ are smaller than 300, our desired dimension for the new _height_ and _width_.

As is the case with our original function, we can add a *row_pct* or *col_pct* to our transform to focus on different parts of the image instead of the center which is our default.

Note: While experimenting take into account that this example image contains a thin black border in the original. This affects our transforms and can be seen when we use reflect padding.

In [None]:
#export
TfmType = IntEnum('TfmType', 'Start Affine Coord Pixel Lighting Crop')

@reg_transform
def crop_pad(x, size, padding_mode='reflect',
             row_pct:uniform = 0.5, col_pct:uniform = 0.5) -> TfmType.Crop:
    """Crop and/or pad `x` so that its last two dimensions equal `size`.

    `x` is indexed as (channels, rows, cols). `size` is a single int (square output) or a
    (rows, cols) pair. If `x` is smaller than the target along either axis it is first
    padded symmetrically using `padding_mode` (passed to `F.pad`); then a window of the
    target size is cut out. `row_pct`/`col_pct` choose the position of that window:
    0 is top/left, 0.5 (default) is centered, 1 is bottom/right.
    """
    size = listify(size,2)
    rows,cols = size
    if x.size(1)<rows or x.size(2)<cols:
        # Pad each side by half of the missing amount, rounded up; never a negative pad.
        row_pad = max((rows-x.size(1)+1)//2, 0)
        col_pad = max((cols-x.size(2)+1)//2, 0)
        x = F.pad(x[None], (col_pad,col_pad,row_pad,row_pad), mode=padding_mode)[0]
    # The `+1` makes every valid offset in [0, dim-target] reachable for pct in [0, 1).
    # Clamp so that pct == 1.0 yields the last valid offset instead of overshooting by
    # one, which would return a window one row/column too small.
    row = min(int((x.size(1)-rows+1)*row_pct), x.size(1)-rows)
    col = min(int((x.size(2)-cols+1)*col_pct), x.size(2)-cols)

    x = x[:, row:row+rows, col:col+cols]
    return x.contiguous() # without this, get NaN later - don't know why

In [None]:
show_image(crop_pad(x, 300, row_pct=0.,col_pct=0., padding_mode='constant'))

In [None]:
show_image(crop_pad(x, 150))

In [None]:
show_image(crop_pad(x, 150, row_pct=0.,col_pct=0.98, padding_mode='constant'))

In [None]:
tfm = crop_pad_tfm(row_pct=(0,1.), col_pct=(0,1.))

_,axes = plt.subplots(1,4, figsize=(12,3))
for ax in axes.flat:
    tfm.resolve()
    show_image(tfm(x, size=150), ax)

## Combine crop/resize

Next, we are going to combine our cropping and padding with the resize operation. In other words, we will get a picture, and crop/pad it in such a way that we get our desired size. It is similar to our previous transform only this time the final dimensions don't have to be square. This gives us more flexibility since our network architecture might take rectangular pictures as input.

First, we will get the target dimensions. For this we have built *get_crop_target*. This function takes three arguments: *target_px*, *target_aspect* and _mult_. *target_px* is our base dimension, *target_aspect* is the ratio of width to height, and _mult_ is the number that both output dimensions must be a multiple of. 

To understand this better, let's take our example where our values are *target_px*=220, *target_aspect*=2., _mult_=32 (default). In plain text we are telling our function: return the dimensions of an image whose area is approximately 220\*220, whose width is twice its height, and whose height and width are both multiples of 32.


In [None]:
#export
def round_multiple(x, mult):
    "Round `x` to the nearest multiple of `mult` (for positive `x`, exact halves round up)."
    n_mults = int(x/mult + 0.5)
    return n_mults*mult

def get_crop_target(target_px, target_aspect=1., mult=32):
    """Return `(rows, cols)` for an output of about `target_px` area with the given aspect.

    `target_px` is a single int (area is `target_px**2`) or a pair whose product is the
    area. `target_aspect` is cols/rows. Both results are rounded to a multiple of `mult`.
    """
    dims = listify(target_px, 2)
    area = dims[0]*dims[1]
    # rows * cols == area and cols == rows * aspect  =>  rows == sqrt(area / aspect)
    rows = math.sqrt(area/target_aspect)
    cols = rows*target_aspect
    return tuple(round_multiple(v, mult) for v in (rows, cols))

In [None]:
get_crop_target(220)

In [None]:
crop_target = get_crop_target(220, 2.);
target_r,target_c = crop_target
crop_target, target_r*target_c

In [None]:
_,r,c = x.shape; x.shape

We are now going to transform our image to our desired dimensions by using crop or padding. Before we crop or pad we will make an intermediate transform that will allow us to later get our output image with the desired dimensions. Let's call our initial dimensions h_i, w_i, our intermediate dimensions h_m, w_m and our output dimensions h_o, w_o.

Our objective will be to get our output image by cropping or padding but not both. To achieve this, we will first enlarge or reduce our original image. **get_resize_target will enlarge or reduce our input image (keeping the shape or h_i/w_i constant) until one of the dimensions is equal to the corresponding final output dimension (i.e. h_m=h_o or w_m=w_o)**. But how does it know which dimension to equate? We can figure this out intuitively. If we intend to crop, our intermediate image's area has to be larger than our output image (since we are going to crop out some pixels) and if we intend to pad, our intermediate image's area has to be smaller than our output image (since we will add some pixels). This means that the dimension we will choose to equate will depend on the relationship between the ratios h_i/h_o and w_i/w_o. If we want to **crop** we will want to equate the dimension with **the smallest ratio** since that would mean that (h_m, w_m) >= (h_o, w_o) which is exactly what we want (a larger area). Conversely if we want to **pad**, we will equate the dimension with **the largest ratio** since that will guarantee that (h_m, w_m) <= (h_o, w_o) (a smaller area).

As an example say we have our image with dimensions h_i = 192 and w_i = 128 and our target dimensions are h_o=160 w_o=320. That is, we have to turn a vertical rectangle into a horizontal rectangle. We can do this in two ways:

1. Padding the borders so we make our image wider
2. Cropping the top and bottom so we squash our image and make it wider

If we intend to crop, our intermediate dimensions will be (h_m, w_m) = (480, 320). If we intend to pad (h_m, w_m) = (160, 107). Note that 480/320 ≈ 160/107 ≈ 192/128.

In [None]:
r_ratio = r/target_r
c_ratio = c/target_c
# min -> crop; max -> pad
ratio = max(r_ratio,c_ratio)
r_ratio,c_ratio,ratio

In [None]:
r2,c2 = round(r/ratio),round(c/ratio); r2,c2

In [None]:
#export
def get_resize_target(img, crop_target, do_crop=False):
    """Return the `(channels, rows, cols)` to resize `img` to before cropping or padding.

    The aspect ratio of `img` is kept. With `do_crop` the smaller of the two
    input/target ratios is used, so the result is at least as large as `crop_target`;
    otherwise the larger ratio is used, so the result fits inside `crop_target`.
    Returns None when `crop_target` is None.
    """
    if crop_target is None: return None
    n_ch,src_r,src_c = img.shape
    dst_r,dst_c = crop_target
    pick = min if do_crop else max
    scale = pick(src_r/dst_r, src_c/dst_c)
    return n_ch,round(src_r/scale),round(src_c/scale)

In [None]:
get_resize_target(x, crop_target, False)

In [None]:
get_resize_target(x, crop_target, True)

In [None]:
#export
def is_listy(x)->bool:
    "Whether `x` is a `tuple` or a `list` (subclasses included)."
    return isinstance(x, tuple) or isinstance(x, list)

def _apply_affine(img, size=None, padding_mode='reflect', do_crop=False, aspect=None, mult=32,
                  mats=None, func=None, crop_func=None, **kwargs):
    """Resample `img` through an affine/coordinate transform, then optionally crop/pad it.

    size:         target size. A tuple/list is used as given; a scalar becomes a square
                  target, or the result of `get_crop_target(size, aspect, mult)` when
                  `aspect` is set.
    padding_mode: passed to `grid_sample`, and to `crop_func` ('zeros' is renamed first).
    do_crop:      forwarded to `get_resize_target` to choose the intermediate size.
    aspect, mult: only consulted when `size` is a scalar (see above).
    mats:         affine matrices, combined with `affines_mat` and applied to the grid.
    func:         optional function applied to the sampling grid and `img.size()`.
    crop_func:    optional function called on the resampled image with `size` and
                  `padding_mode` keyword arguments.
    kwargs:       forwarded to `grid_sample`.

    Returns `img` unchanged when there are no matrices, no `func` and no `size`.
    """
    if size is not None and not is_listy(size):
        size = listify(size,2) if aspect is None else get_crop_target(size, aspect, mult)
    # Nothing to do: skip the resampling entirely.
    if (not mats) and func is None and size is None: return img
    # None when `size` is None, in which case `affine_grid` falls back to the shape of `img`.
    resize_target = get_resize_target(img, size, do_crop=do_crop)
    # Start from the identity grid at the intermediate size, then transform the grid.
    c = affine_grid(img, torch.eye(3), size=resize_target)
    if func is not None: c = func(c, img.size())
    if mats:
        m = affines_mat(mats)
        c = affine_mult(c, img.new_tensor(m))
    res = grid_sample(img, c, padding_mode=padding_mode, **kwargs)
    # Rename the mode for `crop_func`; presumably because `F.pad` (used by `crop_pad`)
    # calls its zero-fill mode 'constant' rather than 'zeros'.
    if padding_mode=='zeros': padding_mode='constant'
    if crop_func is not None: res = crop_func(res, size=size, padding_mode=padding_mode)
    return res

def apply_affine(mats=None, func=None, crop_func=None):
    "Return `_apply_affine` with `mats`, `func` and `crop_func` pre-bound as keyword arguments."
    bound = dict(mats=mats, func=func, crop_func=crop_func)
    return partial(_apply_affine, **bound)

nb_002.apply_affine = apply_affine

In [None]:
img = apply_affine([rot_m])(x, size=crop_target, do_crop=False)
show_image(img, figsize=(6,3))
crop_target, img.shape

In [None]:
get_crop_target(160,2)

In [None]:
img = apply_affine([rot_m])(x, size=160, aspect=2, do_crop=True)
show_image(img, figsize=(6,3))
img.shape

In [None]:
img = apply_affine([rot_m], crop_func=crop_pad)(x, do_crop=False, size=crop_target)
show_image(img, figsize=(6,3))
img.shape

In [None]:
img = apply_affine([rot_m], crop_func=crop_pad)(x, do_crop=False, size=crop_target, padding_mode='zeros')
show_image(img, figsize=(6,3))
img.shape

In [None]:
img = apply_affine([rot_m], crop_func=crop_pad)(x, do_crop=True, size=crop_target)
show_image(img, figsize=(6,3))
img.shape

# Fit

Let's see how our transforms look for different values of zoom, rotate and crop_pad.

## Transform

In [None]:
#export
from nb_002 import _apply_tfm_funcs

def apply_tfms(tfms):
    """Resolve `tfms` and return a single callable that applies all of them.

    The transforms are grouped by their `tfm_type`. Affine transforms are called to get
    their matrices, and the coord and crop transforms are composed and handed to
    `apply_affine`. The result is `_apply_tfm_funcs` partially applied with the composed
    pixel, lighting, affine and start functions (in that positional order).
    """
    resolve_tfms(tfms)
    by_type = dict_groupby(listify(tfms), lambda t: t.tfm_type)
    # One bucket per `TfmType` member, in enum order; a missing type gives None.
    start,affine,coord,pixel,lighting,crop = (by_type.get(kind) for kind in TfmType)
    apply_light = apply_lighting(compose(lighting))
    affine_mats = [t() for t in listify(affine)]
    apply_geom = apply_affine(affine_mats, func=compose(coord), crop_func=compose(crop))
    return partial(_apply_tfm_funcs, compose(pixel), apply_light, apply_geom, compose(start))

nb_002.apply_tfms = apply_tfms
import nb_002b
nb_002b.apply_tfms = apply_tfms

In [None]:
tfms = [
    rotate_tfm(degrees=(-20,20.)),
    zoom_tfm(scale=(1.,1.95)),
]

_,axes = plt.subplots(2,2, figsize=(7,5))
for ax in axes.flat:
    show_image(apply_tfms(tfms)(x, do_crop=True, size=(60,100)), ax, hide_axis=False)

In [None]:
tfms = [
    rotate_tfm(degrees=(-20,20.)),
    zoom_tfm(scale=(1.,1.95), row_pct=(0,1), col_pct=(0,1)),
    crop_pad_tfm(row_pct=(0,1), col_pct=(0,1))
]

_,axes = plt.subplots(2,2, figsize=(6,6))
for ax in axes.flat:
    show_image(apply_tfms(tfms)(x, do_crop=False, size=100, padding_mode='zeros'), ax)

In [None]:
tfms = [crop_tfm(size=100)]
_,axes = plt.subplots(1,4, figsize=(9,3))
for ax in axes.flat: show_image(apply_tfms(tfms)(x, do_crop=True), ax)

In [None]:
tfms = [crop_tfm(size=100, row_pct=(0,1), col_pct=(0,1))]
_,axes = plt.subplots(1,4, figsize=(9,3))
for ax in axes.flat: show_image(apply_tfms(tfms)(x, do_crop=True), ax)

## Fit

Finally, with our choice of transforms and parameters we are going to fit our Darknet model and check our results. To fit our model we will need to resize our images to have the same size so we can feed them in batches to our model. We face the same decisions as before. 

In this case we chose to pad our images (since in \_apply_affine do_crop default is False). If we wanted to crop instead, we can easily add do_crop=True to train_tds. 

We also decided to make our images square, with dimension size x size. If we wanted a rectangle with width to height ratio *a* we could have added aspect=*a* to train_tds.

In [None]:
[Image.open(fn).size for fn in np.random.choice(train_ds.fns, 5)]

In [None]:
size = 150

In [None]:
train_tfms = [
    rotate_tfm(degrees=(-20,20.)),
    zoom_tfm(scale=(1.,1.5), row_pct=(0,1.), col_pct=(0,1.)),
    crop_pad_tfm(row_pct=(0,1.), col_pct=(0,1.))
]
valid_tfms = [
    zoom_tfm(),
    crop_pad_tfm()
]

In [None]:
_,axes = plt.subplots(1,4, figsize=(10,5))
for ax in axes.flat:
    show_image(apply_tfms(train_tfms)(x, do_crop=True, size=size), ax)

In [None]:
show_image(apply_tfms(valid_tfms)(x, do_crop=True, size=size))

In [None]:
bs = 128

In [None]:
valid_tds = TfmDataset(valid_ds, valid_tfms, size=150, padding_mode='zeros')
data = DataBunch(valid_tds, valid_tds, bs=bs, num_workers=0)
xb,yb = next(iter(data.train_dl))
b = xb.transpose(1,0).reshape(3,-1)
data_mean=b.mean(1).cpu()
data_std=b.std(1).cpu()
data_mean,data_std

In [None]:
show_image_batch(data.train_dl, train_ds.classes, 4)

In [None]:
valid_tds = TfmDataset(valid_ds, valid_tfms, size=150, padding_mode='zeros')
train_tds = TfmDataset(train_ds, train_tfms, size=150, padding_mode='zeros')

In [None]:
norm,denorm = normalize_funcs(data_mean,data_std)

In [None]:
data = DataBunch(train_tds, valid_tds, bs=bs, num_workers=12, tfms=norm)
len(data.train_dl),len(data.valid_dl)

In [None]:
model = Darknet([1, 2, 4, 4, 2], num_classes=c, nf=16)
learn = Learner(data, model)
opt_fn = partial(optim.SGD, momentum=0.9)

In [None]:
learn.fit(1, 0.1, opt_fn=opt_fn)

In [None]:
# learn.fit(1, 0.2, opt_fn=opt_fn)

In [None]:
# learn.fit(5, 0.4, opt_fn=opt_fn)

In [None]:
# learn.fit(5, 0.1, opt_fn=opt_fn)

# Fin