In [2]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import imutils
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the MNIST train and test CSV files into two DataFrames
# (paths are relative to the notebook's working directory).
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./assets/DATA/mnist_train.csv",
        "./assets/DATA/mnist_test.csv",
    )
)

# Vanilla Dataset Creation

## We now want the symmetry of our data to be dominated by the SE(2) group. To do this, we discretise the group elements to rotations by multiples of $\frac{\pi}{6}$ and produce new samples by rotating the original samples. Moreover, since 6 and 9 become identical under a rotation by $\pi$, we keep only the digits 0, 1, 2, 3, 4, 5, 6, 7, 8 for this analysis.

### Removing the digit 9 from our dataset

In [9]:
# Pool the train and test frames so the split below is drawn from every sample.
df_data = pd.concat([data_train, data_test])

# Seed for the shuffle and the split so that Restart & Run All reproduces the
# same train/test partition.
SPLIT_SEED = 42

# Drop the digit 9 by filtering on the label itself. The previous approach
# (sort by label, keep the first 63000 rows) relied on a hard-coded row count
# that only works for one exact dataset; for the standard 70,000-sample MNIST
# there are 63,042 non-9 rows, so it also discarded some 8s.
data_filtered = (
    df_data[df_data["label"] != 9]
    .sample(frac=1, random_state=SPLIT_SEED)  # shuffle the rows
    .reset_index(drop=True)
)

# Hold out 10% of the filtered samples as the test set.
train_dataset, test_dataset = train_test_split(
    data_filtered, test_size=0.1, random_state=SPLIT_SEED
)

In [10]:
# The first column is taken as the target; the remaining columns are the
# flattened pixels, reshaped here into a stack of 28x28 images.
X_train = train_dataset.iloc[:, 1:].to_numpy(copy=True).reshape(-1, 28, 28)
Y_train = train_dataset.iloc[:, 0].to_numpy(copy=True)

X_test = test_dataset.iloc[:, 1:].to_numpy(copy=True).reshape(-1, 28, 28)
Y_test = test_dataset.iloc[:, 0].to_numpy(copy=True)

### Creating the circular mask
#### Just to be sure that every image is zero outside the inscribed circle before we rotate it

In [11]:
def create_circular_mask(h, w, center=None, radius=None):
    """Build a boolean disc-shaped mask for an ``h`` x ``w`` image.

    Parameters
    ----------
    h, w : int
        Height (rows) and width (columns) of the mask.
    center : sequence of two numbers, optional
        ``(x, y)`` position of the disc centre, i.e. (column, row).
        Defaults to ``(int(w/2), int(h/2))``.
    radius : number, optional
        Disc radius in pixels. Defaults to the distance from the centre to
        the nearest image edge.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(h, w)``; ``True`` where the Euclidean
        distance to ``center`` is at most ``radius``.
    """
    if center is None:
        # Fall back to the middle of the image.
        center = (int(w/2), int(h/2))

    if radius is None:
        # Largest disc around `center` that still fits inside the image.
        radius = min(center[0], center[1], w-center[0], h-center[1])

    # Open grids: `rows` has shape (h, 1), `cols` has shape (1, w); they
    # broadcast to the full (h, w) distance map below.
    rows, cols = np.ogrid[:h, :w]
    distance = np.sqrt((cols - center[0])**2 + (rows - center[1])**2)

    return distance <= radius

In [12]:
# 28x28 boolean disc of radius 14 centred on pixel (14, 14).
mask = create_circular_mask(h=28, w=28, center=(14, 14), radius=14)

In [13]:
# Zero every pixel outside the circular mask (the 28x28 mask broadcasts over
# the leading sample axis) and store the images as 8-bit unsigned integers.
X_train = np.where(mask, X_train, 0).astype(np.uint8)
X_test = np.where(mask, X_test, 0).astype(np.uint8)

### Creating the rotated versions

In [14]:
# Rotation angles in degrees: every multiple of 30 in [0, 360), i.e. 12 angles.
ROTATION_ANGLES_DEG = np.arange(0, 360, 30)


def rotate_augment(images, labels, angles):
    """Return rotated copies of every image for every angle.

    The output is ordered angle-major: all images rotated by ``angles[0]``
    come first, then all images rotated by ``angles[1]``, and so on. Each
    rotated image keeps the label of the image it was produced from.

    Parameters
    ----------
    images : sequence of 2-D arrays
        Images to rotate; iterated along the first axis.
    labels : sequence
        One label per image, in the same order as ``images``.
    angles : iterable of numbers
        Rotation angles passed to ``imutils.rotate``.

    Returns
    -------
    tuple[list, list]
        ``(rotated_images, rotated_labels)``, each of length
        ``len(angles) * len(images)``.
    """
    rotated_images = []
    rotated_labels = []
    for angle in tqdm(angles):
        for image, label in zip(images, labels):
            rotated_images.append(imutils.rotate(image, angle))
            rotated_labels.append(label)
    return rotated_images, rotated_labels


# One shared helper replaces the two copy-pasted loops for train and test.
X_train_augmented, Y_train_augmented = rotate_augment(X_train, Y_train, ROTATION_ANGLES_DEG)
X_test_augmented, Y_test_augmented = rotate_augment(X_test, Y_test, ROTATION_ANGLES_DEG)

100%|██████████| 12/12 [00:06<00:00,  1.92it/s]
100%|██████████| 12/12 [00:00<00:00, 17.13it/s]


In [15]:
# Bundle the augmented train/test images and labels under fixed keys so they
# can be serialised as a single object.
Dataset = dict(
    zip(
        ("X_train", "Y_train", "X_test", "Y_test"),
        (X_train_augmented, Y_train_augmented, X_test_augmented, Y_test_augmented),
    )
)

In [17]:
# Serialise the augmented dataset dictionary to disk with pickle.
with open('./assets/DATA/augmented_dataset.data', 'wb') as dataset_file:
    pickle.dump(Dataset, file=dataset_file)

# Distilled Dataset Creation

## For distilling our dataset we will first train an autoencoder and then use the reconstruction loss to decide on the cutoff