<a href="https://colab.research.google.com/github/gmshroff/metaLearning2022/blob/main/code/utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Generation

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
import sklearn.datasets as skds
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import math
from numpy import sin, pi
from matplotlib import pyplot as plt
from torchvision.datasets import MNIST
import pathlib
from PIL import Image
from PIL.ImageOps import invert
from IPython import display
from time import sleep
import pickle
#hide_toggle('Imports')

In [None]:
def hide_toggle(x):
    """No-op placeholder: accepts one argument, ignores it, and returns None."""
    return None

The data-generation routines below return `MyDS` datasets; `MyDS` is a subclass of `torch.utils.data.Dataset`.

In [None]:
class MyDS(Dataset):
    """Map-style dataset that pairs a tensor of samples with a tensor of labels.

    ``X`` is converted with ``torch.Tensor`` and ``y`` with ``torch.LongTensor``;
    item ``idx`` is the tuple ``(samples[idx], labels[idx])``.
    """

    def __init__(self, X, y):
        """Convert the samples ``X`` and labels ``y`` to tensors and store them."""
        samples = torch.Tensor(X)
        labels = torch.LongTensor(y)
        self.samples, self.labels = samples, labels

    def __len__(self):
        """Return the number of samples (length of the first tensor axis)."""
        n_items = len(self.samples)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        sample = self.samples[idx]
        label = self.labels[idx]
        return (sample, label)
#hide_toggle('Class MyDS')

Generate data in $\mathbb{R}^n$ with $n$ features (all informative), a given number of classes, a prescribed class separation, and one cluster per class. Return the training dataset, the held-out test dataset, and a data loader over the training data with the prescribed batch size.

In [None]:
def euclideanDataset(n_samples=1000,n_classes=2,class_sep=2.0,n_features=5,batch_size=1):
    """Build a synthetic classification problem and split it into train / hold-out sets.

    Args:
        n_samples: Total number of points requested from ``make_classification``.
        n_classes: Number of classes.
        class_sep: Class-separation factor forwarded to ``make_classification``.
        n_features: Dimensionality of each point; every feature is informative.
        batch_size: Batch size of the returned training loader.

    Returns:
        ``(ds, ds_hold, dsloader)``: the training ``MyDS``, the held-out ``MyDS``
        and a shuffling ``DataLoader`` over the training set.
    """
    # No redundant or repeated columns, and a single cluster per class.
    generator_args = dict(
        n_samples=n_samples,
        n_classes=n_classes,
        class_sep=class_sep,
        n_features=n_features,
        n_informative=n_features,
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=1,
    )
    features, targets = skds.make_classification(**generator_args)

    # train_test_split is called with its default split proportions.
    feats_train, feats_hold, targets_train, targets_hold = train_test_split(features, targets)

    train_set = MyDS(feats_train, targets_train)
    hold_set = MyDS(feats_hold, targets_hold)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=batch_size, shuffle=True
    )
    return train_set, hold_set, train_loader
#hide_toggle('Function euclideanDataset')

Generate samples from a sine wave $a \sin\big(\pi (f t + p)\big)$ with amplitude $a$, frequency $f$ and phase $p$, sampled at times $t = i\,\delta t$.

In [None]:
class mysin():
    """Sine-wave generator evaluating ``ampl * sin(pi * (freq * t + phase))``.

    The instance keeps an internal clock ``t``. ``next`` returns the value at
    the current clock and then advances it by ``delta``; calling the instance
    evaluates the wave at an arbitrary time without moving the clock.

    Args:
        ampl: Amplitude multiplying the sine.
        freq: Frequency multiplying the time inside the sine.
        delta: Time step used by ``next`` and ``series``.
        phase: Phase added to ``freq * t`` before multiplying by pi.
    """
    def __init__(self,ampl = 1.0, freq=0.5, delta=0.1,phase=0.0):
        self.f,self.d,self.a,self.p = freq, delta, ampl, phase
        self.t = 0.0

    def reset(self,t=0.0):
        """Set the internal clock to ``t`` and return the wave value there."""
        # Previously the clock was always rewound to 0.0 and ``t`` was ignored.
        self.t = t
        return self.func()

    def next(self):
        """Return the value at the current clock, then advance it by ``delta``."""
        val = self.func()
        self.t += self.d
        return val

    def __call__(self,t):
        """Return the wave value at time ``t`` without changing the clock."""
        old_t = self.t
        self.t = t
        try:
            return self.func()
        finally:
            # Restore the clock even if the evaluation raises.
            self.t = old_t

    def func(self):
        """Evaluate the wave at the current internal clock."""
        return self.a * sin(pi*(self.f*self.t+self.p))

    def series(self,n=10):
        """Return an array of ``n`` values sampled at ``0, delta, 2*delta, ...``."""
        return np.array([self(t*self.d) for t in range(n)])

    def set_phase(self,phase=0.0):
        """Replace the phase used by subsequent evaluations."""
        self.p = phase

In [None]:
def sinmix(X,m):
    """Turn each row of ``X`` into a length-``m`` series by summing sine components.

    Consecutive column pairs ``(X[i, 2k], X[i, 2k+1])`` are read as the
    (frequency, amplitude) of one ``mysin`` wave with phase 1.5. Components
    with even ``k`` are added as they are; components with odd ``k`` are passed
    through ``tanh`` first. A trailing unpaired column is ignored.

    Returns:
        Array with one summed series per row of ``X``.
    """
    n_rows = X.shape[0]
    n_pairs = int(X.shape[1]/2)
    mixed = []
    for row in range(n_rows):
        total = np.zeros(m)
        for k in range(n_pairs):
            freq_col = 2*k
            wave = mysin(freq=X[row,freq_col],phase=1.5,ampl=X[row,freq_col+1]).series(m)
            if k % 2 == 1:
                # Odd-numbered components are squashed before being summed.
                wave = np.tanh(wave)
            total += wave
        mixed.append(total)
    return np.array(mixed)

A random time-series class is defined by a random frequency ($f$) and amplitude ($a$); a random instance of a class would additionally have a random phase ($p$) and length ($l$). We generate $n$ random clusters in 4D, with coordinates $\{f_1, a_1, f_2, a_2\}$. For starters we keep the phase and length constant and, for each 4D vector, sum its two components (the second is passed through $\tanh$ in `sinmix`) to obtain a time series of fixed length.

In [None]:
def sinDataset(n_samples=10,n_classes=10,length=10,batch_size=1):
    """Create a labelled dataset of fixed-length series built by ``sinmix``.

    Four-dimensional points are drawn with ``make_classification`` (class
    separation 4.0, one cluster per class), shifted to be strictly positive,
    and converted by ``sinmix`` into series of ``length`` samples.

    Returns:
        ``(ds, ds_hold, dsloader)``: the training ``MyDS``, the held-out ``MyDS``
        and a non-shuffling ``DataLoader`` over the training set.
    """
    dims = 4
    params, targets = skds.make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        class_sep=4.0,
        n_features=dims,
        n_informative=dims,
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=1,
    )
    # Shift so the global minimum becomes 0.1, making every entry positive.
    params = params-params.min()+0.1
    waves = sinmix(params, length)

    waves_train, waves_hold, targets_train, targets_hold = train_test_split(waves, targets)
    train_set = MyDS(waves_train, targets_train)
    hold_set = MyDS(waves_hold, targets_hold)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=batch_size, shuffle=False
    )
    return train_set, hold_set, train_loader

In [None]:
def mnist_data(batch_size=32,frac=1.0,download=False):
    """Load MNIST as flattened vectors scaled by 1/255, wrapped in ``MyDS`` datasets.

    Args:
        batch_size: Batch size of the returned training loader.
        frac: Fraction of the training set to keep, taken from the start.
        download: Forwarded to torchvision's ``MNIST`` (data root is ``./data``).

    Returns:
        ``(ds, ds_hold, dsloader)``: the (possibly truncated) training ``MyDS``,
        the full test ``MyDS`` and a shuffling ``DataLoader`` over the training set.
    """
    trainset = MNIST('./data', train=True, download=download)
    train_pixels = trainset.data.numpy()
    # np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
    # The row count is read from the data instead of being hard-coded.
    X_train = train_pixels.reshape(train_pixels.shape[0], -1).astype(np.float64) / 255
    n_train = int(X_train.shape[0]*frac)
    X_train = X_train[0:n_train]
    y_train = trainset.targets[0:n_train]

    testset = MNIST('./data', train=False, download=download)
    test_pixels = testset.data.numpy()
    X_test = test_pixels.reshape(test_pixels.shape[0], -1).astype(np.float64) / 255
    y_test = testset.targets

    ds = MyDS(X_train,y_train)
    ds_hold = MyDS(X_test,y_test)
    dsloader = torch.utils.data.DataLoader(dataset=ds,batch_size=batch_size,shuffle=True)
    return ds,ds_hold,dsloader
#hide_toggle('Function mnist_data')

Image data from NAR project

In [None]:
def load_shape_map(save_dir: str) -> dict:
    """Load every ``*.png`` directly inside ``save_dir`` as a 20x20 mode-"L" PIL image.

    Args:
        save_dir: Directory to scan (non-recursively) for PNG files.

    Returns:
        Dict mapping each file's stem (file name without the extension) to the
        converted, resized image. Empty when the directory holds no PNG files.
    """
    SHAPE_IMG_SIZE=(20,20)
    img_dict = {}
    # glob order depends on the filesystem; sorting keeps the key order (and
    # any class indices derived from it) reproducible across machines.
    for img_path in sorted(pathlib.Path(save_dir).glob("*.png")):
        # convert()/resize() return new images, so the source file can be
        # closed as soon as they have been produced.
        with Image.open(img_path) as img:
            img_dict[img_path.stem] = img.convert("L").resize(SHAPE_IMG_SIZE)
    return img_dict

In [None]:
def remove_redundant_shapes(images):
    """Remove the shape keys listed as redundant from ``images``, in place.

    Args:
        images: Dict keyed by shape name (e.g. ``'90circle'``).

    Returns:
        The same dict object, with every listed key that was present removed.
    """
    # A missing comma used to fuse '180delta' and '270delta' into the single
    # name '180delta270delta', so neither of those keys was ever removed.
    redundant_shapes = (
        '90circle','180circle','270circle',
        '90square','180square','270square',
        '0delta','90delta','180delta','270delta',
        '180theta','270theta',
        '180x','270x',
        '180z','270z',
    )
    for key in redundant_shapes:
        images.pop(key,None)
    return images

In [None]:
def image_data(batch_size=1,return_images=False,flatten=False):
    """Build a labelled dataset of deformed shape images from ``./images_rotate``.

    Each shape that remains after ``remove_redundant_shapes`` is expanded with
    ``deformed_images``; all variants of a shape share one integer label (the
    shape's position in the image dict). The variants are then split with
    ``train_test_split``.

    Args:
        batch_size: Batch size of the returned training loader.
        return_images: When equal to True, also return the PIL images, shape
            names, label mapping and transform parameters of both splits.
        flatten: When equal to True, each image array is flattened to 1-D.

    Returns:
        ``(ds, ds_hold, dsloader, mapping)`` by default, or
        ``(ds, ds_hold, dsloader, images_train, images_test, names_train,
        names_test, mapping, tr_txL, te_txL)`` when ``return_images`` is True.
        ``mapping`` maps each integer label to its shape name.
    """
    shapes = remove_redundant_shapes(load_shape_map('./images_rotate'))

    X, y = [], []
    images_def, names, txfs = [], [], []
    mapping = {}
    for label, shape_name in enumerate(list(shapes.keys())):
        variants, params = deformed_images(shapes[shape_name])
        n_variants = len(variants)
        images_def.extend(variants)
        for img in variants:
            pixels = np.asarray(img)
            X.append(pixels.flatten() if flatten==True else pixels)
        y.extend([label]*n_variants)
        names.extend([shape_name]*n_variants)
        txfs.extend(params)
        mapping[label] = shape_name

    # One call splits all five parallel lists with the same row assignment.
    parts = train_test_split(X,y,images_def,names,txfs)
    (X_train,X_test,y_train,y_test,images_train,images_test,
     names_train,names_test,tr_txL,te_txL) = parts

    ds = MyDS(X_train,y_train)
    ds_hold = MyDS(X_test,y_test)
    dsloader = torch.utils.data.DataLoader(dataset=ds,batch_size=batch_size,shuffle=False)
    if return_images==True:
        return (ds,ds_hold,dsloader,images_train,images_test,
                names_train,names_test,mapping,tr_txL,te_txL)
    return ds,ds_hold,dsloader,mapping

In [None]:
def deform_image(t,a,b,c,d):
    """Deform image ``t`` with an affine transform and return a 20x20 result.

    The image is inverted, then passed through four ``Image.AFFINE``
    transforms: a (-30, -30) offset onto an 80x80 canvas, the map with linear
    coefficients ``(a, b / c, d)`` whose offsets are built around z = 40, a
    (+30, +30) offset, and an identity transform that reduces the output to
    20x20. The returned image is still inverted.

    NOTE(review): the offsets presumably centre the 20x20 input on the larger
    canvas so the deformation acts about its centre -- confirm against the
    Pillow ``Image.transform`` coefficient convention.
    """
    z = 40
    canvas = (80,80)
    inverted = invert(t)
    # x -> a(x-z) + b(y-z) + z ; y -> c(x-z) + d(y-z) + z
    about_centre = (a,b,-a*z-b*z+z,c,d,-c*z-d*z+z)
    stages = (
        (canvas, (1, 0, -30, 0, 1,-30)),
        (canvas, about_centre),
        (canvas, (1,0,30,0,1,30)),
        ((20,20), (1,0,0,0,1,0)),
    )
    result = inverted
    for size, coeffs in stages:
        result = result.transform(size,Image.AFFINE,coeffs)
    return result

In [None]:
def deformed_images(t):
    """Return every deformation of ``t`` over a 21 x 21 grid of parameters.

    ``sx`` and ``sy`` each run over the integers -10..10; for every pair the
    image is produced by ``deform_image(t, 1, sx/10, sy/10, 1)``.

    Returns:
        ``(images, params)``: the deformed images and the matching ``(sx, sy)``
        integer pairs, in the same order (``sx`` outer, ``sy`` inner).
    """
    steps = range(-10,11,1)
    params = [(sx,sy) for sx in steps for sy in steps]
    images = [deform_image(t,1,sx/10,sy/10,1) for (sx,sy) in params]
    return images,params

Save image data

In [None]:
class ImageData():
    """Plain container that stores its constructor arguments as attributes.

    NOTE(review): the argument names match values returned by
    ``image_data(return_images=True)``; presumably this bundles them so they
    can be saved together -- confirm against the callers.
    """
    def __init__(self,train_ds,test_ds,dloader,images_train,images_test,names_train,names_test,mapping):
        """Keep each argument, unchanged, on the instance under the same name."""
        # Datasets for the two splits.
        self.train_ds=train_ds
        self.test_ds=test_ds
        # Per-split images and their names, stored as passed in.
        self.images_train=images_train
        self.images_test=images_test
        self.names_train=names_train
        self.names_test=names_test
        # Loader and mapping objects, stored as passed in.
        self.dloader=dloader
        self.mapping=mapping