In [1]:
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as rng
import numpy.linalg as alg

import copy
import metric_learn #need to install first

from scipy.ndimage import imread, affine_transform
from scipy.spatial.distance import cdist, pdist
from skimage.measure import block_reduce
from skimage.transform import rescale, resize, downscale_local_mean
from functools import reduce
from PIL import Image

# Fellowship.AI Challenge

NOTE: The relevant code lives in the Omniglot repository. This notebook only contains short examples and snippets.

For the fellowship challenge, I chose the one-shot learning problem. 
My reason for choosing this challenge was that it involved image processing, which is a field that I am interested in but have limited experience with. 

To attack this problem, I attempted two learning methods.  

The first was metric learning using Large Margin Nearest Neighbors (LMNN). This idea came from the fact that the Omniglot repository contains an example that uses a distance metric other than plain Euclidean distance. LMNN learns a matrix that defines a Mahalanobis distance under which dissimilar data points are pushed far apart. I used the following metric-learning codebase as a reference: 

https://github.com/metric-learn/metric-learn

The idea behind LMNN is to learn a metric such that every data instance is surrounded by at least k members of the same class. However, with the small number of examples in the Omniglot dataset, I needed to augment the data. I decided to augment each example with random affine transforms, similar to Figure 5 in this paper: https://www.cs.cmu.edu/~rsalakhu/papers/oneshot1.pdf

In [4]:
def AffineTransImg(I):
    """Return a randomly affine-distorted, downscaled and binarized copy of an image.

    A random rotation, shear, per-axis scale and translation are sampled and
    composed into a single affine map, which is applied to ``I``. The result
    is reduced by 3x3 local averaging, thresholded at 0.5 and flattened.

    Input:
      I : 2-D image ndarray of floats
          (the 0.5 threshold assumes values in [0, 1] -- TODO confirm against f_load)

    Output:
      1-D float ndarray containing only 0.0 and 1.0: the flattened,
      downscaled, binarized image
    """
    # Rotation angle (radians), shear, scale and translation of the forward map
    theta = rng.uniform(-np.pi/36, np.pi/36)
    rhox = rng.uniform(-0.2,0.2)
    rhoy = rng.uniform(-0.2,0.2)
    sx = rng.uniform(0.8,1.2)
    sy = rng.uniform(0.8,1.2)
    tx = rng.uniform(-2,2)
    ty = rng.uniform(-2,2)

    c = np.cos(theta)
    s = np.sin(theta)
    Rot = np.array([[c,s],[-s,c]])
    Shr = np.array([[1,rhox],[rhoy,1]])
    Sca = np.array([[sx,0],[0,sy]])

    # Forward map on pixel coordinates: y = A.x + b
    A = reduce(np.dot, [Sca, Shr, Rot])
    b = np.array([tx, ty])

    try:
        Ainv = alg.inv(A)
    except np.linalg.LinAlgError as err:
        # A non-invertible draw leaves the image undistorted; anything else is a real error
        if 'Singular matrix' not in str(err):
            raise
    else:
        # affine_transform samples the input at (matrix.o + offset) for each
        # output index o, so it needs the inverse map: x = Ainv.y - Ainv.b
        I = affine_transform(I, Ainv, offset=-np.dot(Ainv, b))

    # 3x3 block averaging, then binarize around 0.5
    I = downscale_local_mean(I, (3, 3))
    I = (I >= 0.5).astype(float)
    return I.flatten()

The affine transform returns a flattened feature vector for each augmented image; these vectors were then compiled into a feature matrix. The learning process of LMNN is done using the metric-learn library:

In [6]:
def classification_run(folder,f_load,f_cost,ftype='cost'):
    """Compute the error rate for one run of one-shot classification using an LMNN metric.

    Input
      folder : contains images for a run of one-shot classification; the label
               file named by the module-level ``fname_label`` (defined outside
               this cell) is read from it
      f_load : itemA = f_load('file.png') should read in the image file and process it
      f_cost : f_cost(itemA,itemB,Minv) should compute similarity between two
               images, using output of f_load and the learned metric matrix
      ftype  : 'cost' if small values from f_cost mean more similar, or 'score'
               if large values are more similar

    Output
      perror : percent errors (0 to 100% error)

    Raises
      AssertionError : if ftype is neither 'cost' nor 'score'
      ValueError     : if the label file lists no test/train pairs

    Side effects
      The learned metric matrix is cached at folder+'Minv.npy'. No path
      separator is inserted, so unless ``folder`` ends with one the file is
      placed next to the folder rather than inside it.
    """
    assert ftype in ('cost', 'score')

    # get file names (blank lines in the label file are skipped)
    with open(folder+'/'+fname_label) as f:
        content = f.read().splitlines()
    pairs = [line.split() for line in content if line.strip()]
    if not pairs:
        raise ValueError('no test/train pairs found in ' + folder+'/'+fname_label)
    test_files  = [pair[0] for pair in pairs]
    train_files = [pair[1] for pair in pairs]
    # answers_files keeps the label-file order while test_files is sorted below;
    # the comparison at the end presumes the label file is already ordered by
    # test file name -- verify against the data.
    answers_files = copy.copy(train_files)
    test_files.sort()
    train_files.sort()
    ntrain = len(train_files)
    ntest = len(test_files)

    # load the images (and, if needed, extract features)
    train_items = [f_load(f) for f in train_files]
    test_items  = [f_load(f) for f in test_files ]

    # Augment with 5 affine transforms
    # Creates 6 total examples per training item
    nexample = 6
    feat_rows = []
    for item in train_items:
        # first row of each group: the undistorted image, downscaled and binarized
        I = rescale(item, 1.0 / 3.0, anti_aliasing=False)
        feat_rows.append(I.astype(bool).astype(float).flatten())
        # remaining rows: random affine distortions of the same image
        for _ in range(1, nexample):
            feat_rows.append(AffineTransImg(item))
    # The feature width follows from the images themselves instead of a fixed
    # constant; vstack raises ValueError if the rows disagree in length.
    feat_mtx = np.vstack(feat_rows)

    # gather the class numbers for each file
    classes = np.repeat(np.arange(1,ntrain+1),nexample)

    Y = classes
    X = feat_mtx

    # fit the data
    # Use already saved matrices to save time
    try:
        Minv = np.load(folder+'Minv.npy')
    except FileNotFoundError:
        print("Matrix file not available.\n")
        print("Fitting data...\n")
        # setting up LMNN
        # tried 14 classes because of training data size (want 14 nearest neighbors for each class)
        # lowered to k=5 because training takes very long
        lmnn = metric_learn.LMNN(k=5, min_iter=50, max_iter=1000, learn_rate=1e-6, regularization=1)
        lmnn.fit(X, Y)
        # Save Mahalanobis metric matrix as a file for later
        # NOTE(review): metric() is the accessor of the metric-learn release this
        # was written against -- confirm it exists in the installed version.
        Minv = lmnn.metric()
        np.save(folder+'Minv.npy', Minv)

    # compute cost matrix
    costM = np.zeros((ntest,ntrain),float)
    for i in range(ntest):
        for c in range(ntrain):
            costM[i,c] = f_cost(test_items[i],train_items[c],Minv)

    # pick the best-matching training item for every test item
    if ftype == 'cost':
        YHAT = np.argmin(costM,axis=1)
    else:
        YHAT = np.argmax(costM,axis=1)

    # compute the error rate
    correct = 0.0
    for i in range(ntest):
        if train_files[YHAT[i]] == answers_files[i]:
            correct += 1.0
    pcorrect = 100 * correct / ntest
    perror = 100 - pcorrect
    return perror

Most of the code from the example is used for loading files, etc. Running LMNN as I wrote it resulted in only 10-15% accuracy, which is not much better than random guessing. In order to improve metric learning, I would need to use much larger samples from the omniglot set and more data augmentation via random affine transformations such as rotation, scaling, shearing, etc.

I attempted to improve these results using deep learning. Several papers have had success on Omniglot with two CNNs running in a comparative manner. I used this one heavily for reference: http://openaccess.thecvf.com/content_cvpr_2018/papers_backup/Sung_Learning_to_Compare_CVPR_2018_paper.pdf

My network architecture was as follows.

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchvision
import torchvision.utils
import torchvision.datasets as dset
import torchvision.transforms as T

from torch.autograd import Variable

from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import Sampler

import os
import math
import random

In [8]:
class SiameseNetwork(nn.Module):
    """Two-branch comparison network with shared weights.

    Each input image goes through the same four convolutional blocks and one
    fully connected layer to produce an embedding. The element-wise absolute
    difference of the two embeddings is passed through a final fully
    connected layer with a sigmoid, giving values in (0, 1).

    Parameters
      in_channel  : number of channels in the input images
      channel_num : number of feature maps in every convolutional block
      hidden_num  : size of the embedding produced by ``fc1``
      output_size : number of outputs produced by ``fc2``

    ``fc1`` expects ``channel_num*2*2`` features, so the four 2x max-poolings
    must leave a 2x2 feature map (e.g. 32x32 inputs: 32 -> 16 -> 8 -> 4 -> 2).
    """

    def __init__(self, in_channel=1, channel_num=64, hidden_num=64, output_size=1):
        super().__init__()

        # Four identical conv blocks; each halves the spatial resolution
        self.layer1 = self._conv_block(in_channel, channel_num)
        self.layer2 = self._conv_block(channel_num, channel_num)
        self.layer3 = self._conv_block(channel_num, channel_num)
        self.layer4 = self._conv_block(channel_num, channel_num)
        # output is 2x2xchannel_num after conv section (for 32x32 inputs)

        # input is 2x2xchannel_num (256 by default), output is hidden layer num
        self.fc1 = nn.Sequential(
                    nn.Linear(channel_num*2*2,hidden_num),
                    nn.Sigmoid())

        # only do last FC layer after difference function
        # input is hidden layer, output is output_size (1 by default)
        self.fc2 = nn.Sequential(
                    nn.Linear(hidden_num,output_size),
                    nn.Sigmoid())

    @staticmethod
    def _conv_block(in_ch, out_ch):
        """Build one 3x3 conv -> batch norm -> ReLU -> 2x2 max-pool block."""
        return nn.Sequential(
                    nn.Conv2d(in_ch,out_ch,kernel_size=3,padding=1),
                    nn.BatchNorm2d(out_ch, momentum=1, affine=True),
                    nn.ReLU(),
                    nn.MaxPool2d(2))

    def forward_once(self, x):
        """Embed one batch of images; returns a (batch, hidden_num) tensor."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # collapse (channels, height, width) into one feature axis per sample
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        return out

    def forward(self,x1,x2):
        """Compare two batches of images; returns a (batch, output_size) tensor in (0, 1)."""
        out1 = self.forward_once(x1)
        out2 = self.forward_once(x2)
        out_diff = torch.abs(out1-out2)
        out3 = self.fc2(out_diff)
        return out3

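This network uses 4 convolutional blocks and accepts 2 images for comparison. Each image is passed through the same convolutional blocks and a fully connected layer; the final step takes the absolute difference of the two resulting feature vectors and feeds it through a last fully connected layer with a sigmoid activation.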

I used this architecture because I saw that comparative networks work well for Omniglot; however, I was unable to achieve good results in time. The Relation Network paper uses 1 million training episodes in order to sample the massive number of pairs that comes from pairwise matching of 1623x20 images.

In order to improve my work, I will have to find ways to avoid overfitting due to the massive number of classes in Omniglot. I also want to work on a more powerful machine, because all of my work was done locally. Unfortunately, this was not a good idea, because CNNs and other large learning methods require a lot of computing/GPU power.