# Dataset Generation

In [2]:
import torch
import numpy as np
from torch.utils.data import Dataset
import sklearn.datasets as skds
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import math
from numpy import sin, pi
# from matplotlib import pyplot as plt
from torchvision.datasets import MNIST

Data-generation routines return instances of `MyDS`, a subclass of `torch.utils.data.Dataset`.

In [3]:
class MyDS(Dataset):
    """Map-style dataset that pairs each sample with its label.

    ``X`` is converted with ``torch.Tensor`` and ``y`` with
    ``torch.LongTensor`` once, at construction time; indexing afterwards is a
    plain tensor lookup.
    """

    def __init__(self, X,y):
        """Store ``X`` as ``self.samples`` and ``y`` as ``self.labels``."""
        samples = torch.Tensor(X)
        labels = torch.LongTensor(y)
        self.samples, self.labels = samples, labels

    def __len__(self):
        """Return the length of the sample tensor (its first dimension)."""
        n_items = len(self.samples)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` tuple stored at position ``idx``."""
        sample = self.samples[idx]
        label = self.labels[idx]
        return (sample, label)

Generate data in $\mathbb{R}^n$ with $n$ features, a given number of classes, a prescribed class separation, and one cluster per class. Return the training and hold-out datasets, together with a data loader over the training data that uses the prescribed batch size.

In [4]:
def euclideanDataset(n_samples=1000,n_classes=2,class_sep=2.0,n_features=5,batch_size=1,
                     random_state=None):
    """Create a synthetic classification problem and split it into train / hold-out sets.

    All ``n_features`` features are informative (no redundant or repeated
    features) and every class consists of a single cluster.

    Parameters
    ----------
    n_samples : int
        Total number of points generated before splitting.
    n_classes : int
        Number of classes.
    class_sep : float
        Class separation passed to ``make_classification``.
    n_features : int
        Dimensionality of each sample.
    batch_size : int
        Batch size of the returned training data loader.
    random_state : int, RandomState instance or None
        Seed forwarded to both ``make_classification`` and
        ``train_test_split`` so the generated data and the split can be
        reproduced. ``None`` (the default) keeps the previous unseeded
        behaviour. The shuffling order of the data loader is not affected by
        this argument.

    Returns
    -------
    (ds, ds_hold, dsloader)
        Training ``MyDS``, hold-out ``MyDS`` and a shuffling ``DataLoader``
        over the training set.
    """
    X,y = skds.make_classification(n_samples=n_samples,n_classes=n_classes,
                                   class_sep=class_sep,n_features=n_features,
                                   n_informative=n_features,n_redundant=0,
                                   n_repeated=0,n_clusters_per_class=1,
                                   random_state=random_state)
    # Split proportions are scikit-learn's defaults.
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=random_state)
    ds = MyDS(X_train,y_train)
    ds_hold = MyDS(X_test,y_test)
    dsloader = torch.utils.data.DataLoader(dataset=ds,batch_size=batch_size,shuffle=True)
    return ds,ds_hold,dsloader

Generate samples from a sine wave $a \sin(\pi (f t + p))$ with amplitude $a$, frequency $f$ and phase $p$, sampled at times $t = i\,\delta t$.

In [5]:
class mysin():
    """Sine-wave generator ``ampl * sin(pi * (freq * t + phase))`` with an internal clock.

    The object keeps a current time ``self.t`` that ``next()`` advances in
    steps of ``delta``. The wave can also be evaluated at an arbitrary time by
    calling the instance, which leaves the internal clock untouched.

    Parameters
    ----------
    ampl : float
        Amplitude of the wave.
    freq : float
        Frequency multiplier applied to the time.
    delta : float
        Time step used by ``next()`` and ``series()``.
    phase : float
        Phase offset added inside the sine, before the multiplication by pi.
    """
    def __init__(self,ampl = 1.0, freq=0.5, delta=0.1,phase=0.0):
        self.f,self.d,self.a,self.p = freq, delta, ampl, phase
        self.t = 0.0
    def reset(self,t=0.0):
        """Set the internal clock to ``t`` and return the wave value there."""
        # The clock used to be forced to 0.0 here, silently ignoring ``t``.
        self.t = t
        return self.func()
    def next(self):
        """Return the value at the current time, then advance the clock by ``delta``."""
        val = self.func()
        self.t += self.d
        return val
    def __call__(self,t):
        """Evaluate the wave at time ``t`` without changing the internal clock."""
        old_t = self.t
        self.t = t
        try:
            return self.func()
        finally:
            # Restore the clock even if the evaluation raises.
            self.t = old_t
    def func(self):
        """Value of the wave at the current internal time."""
        return self.a * sin(pi*(self.f*self.t+self.p))
    def series(self,n=10):
        """Return an array with the wave sampled at ``t = 0, delta, ..., (n-1)*delta``."""
        return np.array([self(t*self.d) for t in range(n)])
    def set_phase(self,phase=0.0):
        """Replace the phase offset."""
        self.p = phase

In [6]:
def sinmix(X,m):
    """Turn every row of ``X`` into a time series of length ``m``.

    Columns of ``X`` are read as consecutive (frequency, amplitude) pairs; a
    trailing unpaired column is ignored. Each pair defines a ``mysin`` wave
    with phase 1.5. Waves from even-numbered pairs (0, 2, ...) are added
    as-is, waves from odd-numbered pairs are passed through ``tanh`` before
    being added.

    Returns an array with one summed series per row of ``X``.
    """
    n_pairs = X.shape[1] // 2
    mixed = []
    for row in range(X.shape[0]):
        total = np.zeros(m)
        for pair in range(n_pairs):
            col = 2 * pair
            wave = mysin(freq=X[row,col],phase=1.5,ampl=X[row,col+1]).series(m)
            if pair % 2 == 0:
                total += wave
            else:
                total += np.tanh(wave)
        mixed.append(total)
    return np.array(mixed)

A random time series is defined by a random frequency ($f$) and amplitude ($a$); a random instance of a class is defined by a random phase ($p$) and length ($l$). We will generate $n$ random clusters in 4D, with coordinates $\{f_1, a_1, f_2, a_2\}$. For starters we keep the phase and length constant and, for each 4D vector, sum the two sine waves (the second one passed through $\tanh$) to get a time series of fixed length.

In [7]:
def sinDataset(n_samples=10,n_classes=10,length=10,batch_size=1,random_state=None):
    """Create a dataset of sine-mixture time series labelled by cluster.

    Random 4-D points (one cluster per class) are drawn with
    ``make_classification``, shifted to be strictly positive, and each point
    is converted by ``sinmix`` into a time series of ``length`` samples.

    Parameters
    ----------
    n_samples : int
        Total number of series generated before splitting.
    n_classes : int
        Number of classes (clusters in the 4-D parameter space).
    length : int
        Number of time steps in every series.
    batch_size : int
        Batch size of the returned training data loader.
    random_state : int, RandomState instance or None
        Seed forwarded to both ``make_classification`` and
        ``train_test_split`` so the generated data and the split can be
        reproduced. ``None`` (the default) keeps the previous unseeded
        behaviour.

    Returns
    -------
    (ds, ds_hold, dsloader)
        Training ``MyDS``, hold-out ``MyDS`` and a non-shuffling
        ``DataLoader`` over the training set.
    """
    # Two (frequency, amplitude) pairs per sample, as consumed by sinmix.
    n_features = 4
    X,y = skds.make_classification(n_samples=n_samples,n_classes=n_classes,
                                   class_sep=4.0,n_features=n_features,
                                   n_informative=n_features,n_redundant=0,
                                   n_repeated=0,n_clusters_per_class=1,
                                   random_state=random_state)
    # Shift so the smallest entry becomes 0.1: every frequency and amplitude is > 0.
    X = X-X.min()+0.1
    S = sinmix(X,length)
    X_train,X_test,y_train,y_test = train_test_split(S,y,random_state=random_state)
    ds = MyDS(X_train,y_train)
    ds_hold = MyDS(X_test,y_test)
    dsloader = torch.utils.data.DataLoader(dataset=ds,batch_size=batch_size,shuffle=False)
    return ds,ds_hold,dsloader

MNIST Dataset

In [8]:
def mnist_data(batch_size=32,frac=1.0):
    """Load MNIST from ``./data`` as flattened images scaled by 1/255.

    The data must already be present on disk (``download=False``).

    Parameters
    ----------
    batch_size : int
        Batch size of the returned training data loader.
    frac : float
        Fraction of the training set to keep, taken from the start of the
        set. The test set is always used in full.

    Returns
    -------
    (ds, ds_hold, dsloader)
        Training ``MyDS``, test ``MyDS`` and a shuffling ``DataLoader`` over
        the training set.
    """
    trainset = MNIST('./data', train=True, download=False)
    train_pixels = trainset.data.numpy()
    # np.float64 replaces the np.float alias, which recent NumPy releases removed.
    # One row per image; the row count comes from the data instead of a literal.
    X_train = train_pixels.reshape(train_pixels.shape[0], -1).astype(np.float64) / 255
    n_train = int(X_train.shape[0]*frac)
    X_train = X_train[0:n_train]
    y_train = trainset.targets
    y_train = y_train[0:n_train]
    testset = MNIST('./data', train=False, download=False)
    test_pixels = testset.data.numpy()
    X_test = test_pixels.reshape(test_pixels.shape[0], -1).astype(np.float64) / 255
    y_test = testset.targets
    ds = MyDS(X_train,y_train)
    ds_hold = MyDS(X_test,y_test)
    dsloader = torch.utils.data.DataLoader(dataset=ds,batch_size=batch_size,shuffle=True)
    return ds,ds_hold,dsloader