# Dataset Generation
Generate four different datasets, which are saved to disk and used to train and evaluate each of the models. Each dataset is
generated with 2000 training points and 500 test points (the pinwheel generator rounds down to a multiple of its 3 arms, giving 1998 and 498).

In [None]:
import numpy as np
import matplotlib.pylab as plt
import os

In [None]:
# Shared NumPy random Generator used by every dataset below
# (no seed is passed, so each run draws fresh data)
rng = np.random.default_rng()

# Folder that receives the generated .npy files; created if it does not exist yet
data_dir = 'dataset'
os.makedirs(data_dir, exist_ok=True)

In [None]:
def plot_dataset(X_train, X_test):
    """Scatter-plot the first two columns of the train and test sets on one figure.

    Args:
        X_train: 2-D array; columns 0 and 1 are plotted with the label 'Train'.
        X_test: 2-D array; columns 0 and 1 are plotted with the label 'Test'.
    """
    # Train is drawn first, then test on top of it
    for points, split_label in ((X_train, 'Train'), (X_test, 'Test')):
        first_coord, second_coord = points[:, 0], points[:, 1]
        plt.scatter(first_coord, second_coord, alpha=0.7, marker='.', label=split_label)

    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    # Same scale on both axes so the shapes are not distorted
    plt.axis('equal')
    plt.legend()
    plt.show()

## Pinwheel

In [None]:
def generate_pinwheel(*, num: int, rng: np.random.Generator) -> np.ndarray:
    """Sample 2-D points lying on a three-arm pinwheel.

    Each arm holds ``num // 3`` points whose radius grows linearly from 0 to 1.
    Arm ``j`` sweeps the angle range ``[4*j, 4*(j+1)]`` (radians), perturbed by
    Gaussian noise with standard deviation 0.2.

    Args:
        num: Requested number of points. The result contains exactly ``num``
            points only when ``num`` is a multiple of 3; otherwise it is
            rounded down to the nearest multiple of 3.
        rng: NumPy random generator used for the angular noise.

    Returns:
        Array of shape ``(3 * (num // 3), 2)``, ordered arm by arm.
    """
    # https://github.com/Formulator/Spiral/blob/master/PyTorch_Spiral_DataLoader.ipynb
    n_dims = 2  # dimensionality
    n_arms = 3  # number of arms ("classes" in the reference implementation)
    per_arm = num // n_arms  # note: we will not generate exactly num datapoints unless n_arms divides num

    X = np.zeros((per_arm * n_arms, n_dims))
    # The radius profile is identical for every arm, so compute it once
    radius = np.linspace(0.0, 1, per_arm)
    for j in range(n_arms):
        theta = np.linspace(j * 4, (j + 1) * 4, per_arm) + rng.standard_normal(per_arm) * 0.2
        X[per_arm * j:per_arm * (j + 1)] = np.c_[radius * np.sin(theta), radius * np.cos(theta)]
    return X

In [None]:
# Generate dataset
X_train_pinwheel = generate_pinwheel(num=2000, rng=rng)
X_test_pinwheel = generate_pinwheel(num=500, rng=rng)

# Save and plot dataset
np.save(os.path.join(data_dir, 'X_train_pinwheel.npy'), X_train_pinwheel)
# The test file must hold the test split; writing the training array here
# would make the saved test set a copy of the training data.
np.save(os.path.join(data_dir, 'X_test_pinwheel.npy'), X_test_pinwheel)
plot_dataset(X_train_pinwheel, X_test_pinwheel)

## Spiral

In [None]:
def generate_spiral(*, num: int, rng: np.random.Generator) -> np.ndarray:
    """Sample noisy 2-D points along a spiral whose radius grows with the angle.

    Angles are drawn uniformly from ``[0, 6*pi)`` and the radius is
    ``0.05 * angle``. Gaussian noise with standard deviation 0.04 is added
    independently to both coordinates.

    Args:
        num: Number of points to generate.
        rng: NumPy random generator used for the angles and the noise.

    Returns:
        Array of shape ``(num, 2)``.
    """
    angles = rng.uniform(low=0.0, high=6 * np.pi, size=(num,))
    radii = angles * 0.05

    # Polar -> Cartesian, one point per row
    clean = np.column_stack((radii * np.cos(angles), radii * np.sin(angles)))
    return clean + 0.04 * rng.standard_normal(clean.shape)

In [None]:
# Draw the training split first, then the test split, from the shared generator
X_train_spiral, X_test_spiral = (
    generate_spiral(num=size, rng=rng) for size in (2000, 500)
)

# Write each split to its own .npy file under data_dir
np.save(os.path.join(data_dir, 'X_train_spiral.npy'), arr=X_train_spiral)
np.save(os.path.join(data_dir, 'X_test_spiral.npy'), arr=X_test_spiral)

# Visual check of both splits
plot_dataset(X_train=X_train_spiral, X_test=X_test_spiral)

## Checkerboard

In [None]:
def generate_checkerboard(*, num: int, rng: np.random.Generator) -> np.ndarray:
    """Sample 2-D points from a checkerboard pattern.

    The first coordinate is uniform on ``[-2, 2)``. The second is a uniform
    value on ``[0, 1)`` shifted down by 0 or 2 (chosen at random) and up by the
    parity of the unit column the first coordinate falls in. Both coordinates
    are then scaled by 2.

    Args:
        num: Number of points to generate.
        rng: NumPy random generator used for all draws.

    Returns:
        Array of shape ``(num, 2)``.
    """
    # https://github.com/malbergo/stochastic-interpolants/blob/main/notebooks/checker-mirror.ipynb
    horizontal = rng.uniform(size=num) * 4 - 2

    within_cell = rng.uniform(size=num)
    row_pick = rng.choice([0, 1], size=(num,))
    # Odd columns are shifted up by one cell to produce the alternating pattern
    column_parity = np.floor(horizontal) % 2
    vertical = within_cell - row_pick * 2 + column_parity

    return np.column_stack((horizontal, vertical)) * 2

In [None]:
# Generate dataset
X_train_checkerboard = generate_checkerboard(num=2000, rng=rng)
X_test_checkerboard = generate_checkerboard(num=500, rng=rng)

# Save and plot dataset
np.save(os.path.join(data_dir, 'X_train_checkerboard.npy'), X_train_checkerboard)
np.save(os.path.join(data_dir, 'X_test_checkerboard.npy'), X_test_checkerboard)
plot_dataset(X_train_checkerboard, X_test_checkerboard)

## Gaussian mixtures

In [None]:
def generate_gaussian_mixtures(*, num: int, rng: np.random.Generator) -> np.ndarray:
    """Sample 2-D points from a ring of isotropic Gaussian blobs.

    The component centres sit on a circle of radius 1.2 at the first 7 of 8
    evenly spaced angles between 0 and 2*pi (the last angle, 2*pi, is dropped).
    Each point picks one centre at random and adds Gaussian noise with
    standard deviation 0.12 to both coordinates.

    Args:
        num: Number of points to generate.
        rng: NumPy random generator used for the component choice and the noise.

    Returns:
        Array of shape ``(num, 2)``.
    """
    centre_angles = np.linspace(0, 2 * np.pi, 8)[:-1]

    # Assign every sample to one of the centres
    component = rng.choice(np.arange(centre_angles.size), size=num)
    chosen = centre_angles[component]

    centres = np.column_stack((1.2 * np.cos(chosen), 1.2 * np.sin(chosen)))
    jitter = rng.standard_normal(size=centres.shape) * 0.12
    return centres + jitter

In [None]:
# Draw the training split first, then the test split, from the shared generator
X_train_gaussian_mixture, X_test_gaussian_mixture = (
    generate_gaussian_mixtures(num=size, rng=rng) for size in (2000, 500)
)

# Write each split to its own .npy file under data_dir
np.save(os.path.join(data_dir, 'X_train_gaussian_mixture.npy'), arr=X_train_gaussian_mixture)
np.save(os.path.join(data_dir, 'X_test_gaussian_mixture.npy'), arr=X_test_gaussian_mixture)

# Visual check of both splits
plot_dataset(X_train=X_train_gaussian_mixture, X_test=X_test_gaussian_mixture)