In [None]:
import pandas as pd
import os, json, re
import numpy as np
import pandas as pd
import PIL
from PIL import Image
import scipy.io as sio
from matplotlib import cm
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# import mne

# COIL-20

COIL-20 is a set of 1440 greyscale images of 20 objects, each photographed under 72 different rotations spanning 360 degrees. Each image is 128x128 pixels,
which we treat as a single 16384-dimensional vector for the purposes
of computing distances between images.

In [None]:
# Show what is currently present in the local data directory.
!ls data

In [None]:
# Create data/coil (and data/ itself if it is missing).
# NOTE(review): no other cell visible in this notebook reads or writes data/coil;
# the COIL-20 cells use data/ and data/COIL-20 - confirm this directory is needed.
!mkdir -p data/coil

In [None]:
import os
import urllib.request

# URL of the Coil-20 dataset
dataset_url = "http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip"

# Directory to save the downloaded dataset (created if missing).
download_dir = "data"
os.makedirs(download_dir, exist_ok=True)

# Path to save the downloaded ZIP file
zip_file_path = os.path.join(download_dir, "coil-20.zip")

# Download only when the archive is not already on disk, so that
# "Restart & Run All" does not fetch the dataset again.
if os.path.exists(zip_file_path):
    print("Dataset archive already present, skipping download.")
else:
    # Download to a temporary name and rename afterwards: an interrupted
    # transfer then never leaves a truncated file at zip_file_path.
    partial_path = zip_file_path + ".part"
    urllib.request.urlretrieve(dataset_url, partial_path)
    os.replace(partial_path, zip_file_path)
    print("Dataset downloaded successfully!")

# The archive is extracted with zipfile in the next cell.


In [None]:
import zipfile
import os

# Path to the downloaded ZIP file
zip_file_path = "data/coil-20.zip"

# Directory where the archive contents are extracted.
extracted_dir = "data/COIL-20"

# os.makedirs(..., exist_ok=True) both creates the directory and tolerates
# re-runs, so no separate shell `mkdir` (which errors on re-run) is needed.
os.makedirs(extracted_dir, exist_ok=True)

# Extract every member of the archive into extracted_dir.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir)

print("ZIP file extracted successfully!")


In [None]:
dirname = "data/COIL-20/coil-20-proc/"

# Image files are named "obj<objId>__<imgId>.png". Anything else found in the
# directory is skipped instead of crashing the filename parsing below.
name_pattern = re.compile(r"obj([0-9]+)__([0-9]+)\.png")
parsed_names = []
for name in os.listdir(dirname):
    matched = name_pattern.fullmatch(name)
    if matched is not None:
        parsed_names.append((int(matched.group(1)), int(matched.group(2)), name))

# os.listdir returns entries in arbitrary order. Sorting by (object id, image id)
# makes the saved arrays reproducible and keeps each object's rotations in order.
parsed_names.sort()
filenames = [name for _, _, name in parsed_names]

labels = []
data = []
for file in tqdm(filenames):
    img = Image.open(dirname + file)
    objId, imgId = file.split('__')
    imgId = int(imgId[:-4])   # strip the ".png" suffix
    objId = int(objId[3:])    # strip the "obj" prefix
    data.append(np.array(img))
    labels.append(objId)
data = np.asarray(data)
labels = np.asarray(labels)

In [None]:
# Show the first 72 images (a 9x8 grid) of one object.
objId = 1
# Select the object's images once instead of re-applying the mask per subplot.
obj_images = data[labels == objId]

n_rows, n_cols = 9, 8
# figsize is (width, height): width follows the column count, height the row count.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(2*n_cols, 2*n_rows))

for i, ax in enumerate(axes.flatten()):
    # The images are greyscale, so use a grey colormap rather than the default one.
    ax.imshow(obj_images[i], cmap="gray")
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

In [None]:
# Demonstration: display the image most recently opened by the loading loop
# (`img` is the loop variable left over from that cell).
img.show()

In [None]:
# Sanity check: one label per loaded image.
labels.shape

In [None]:
# Create the output directory from Python: unlike a bare `mkdir`, this is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs('data/COIL-20/prepared', exist_ok=True)
# Persist the full image stack and the per-image object ids.
np.save('data/COIL-20/prepared/data.npy', data)
np.save('data/COIL-20/prepared/labels.npy', labels)

In [None]:
# Inspect the object ids parsed from the filenames.
labels

In [None]:
# Boolean mask selecting the images of objects 1, 2 and 3.
ids = np.isin(labels, (1, 2, 3))

In [None]:
# Apply the three-object mask to the images and to their labels.
cdata, clabels = data[ids], labels[ids]

In [None]:
# Shape of the three-object subset.
cdata.shape

In [None]:
# Shape of the full image stack, for comparison with the subset above.
data.shape

In [None]:
# Persist the three-object subset next to the full arrays.
# The data/COIL-20/prepared directory must already exist.
np.save('data/COIL-20/prepared/data_3obj.npy', cdata)
np.save('data/COIL-20/prepared/labels_3obj.npy', clabels)

# COIL-100

COIL-100 is a set of 7200 colour images of 100 objects, each photographed under 72 different rotations spanning 360 degrees. Each image consists of three 128x128 intensity matrices (one per colour channel), which we treat as a single 49152-dimensional vector for the purposes of computing distances between images.

In [None]:
import os
import urllib.request
import zipfile

# URL of the Coil-100 dataset
dataset_url = "http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip"

# Directory to save the downloaded dataset. os.makedirs creates missing
# parents and tolerates re-runs, so no shell `mkdir` is needed.
download_dir = "data/COIL-100"
os.makedirs(download_dir, exist_ok=True)

# Path to save the downloaded ZIP file
zip_file_path = os.path.join(download_dir, "coil-100.zip")

# Download only when the archive is not already on disk.
if os.path.exists(zip_file_path):
    print("Dataset archive already present, skipping download.")
else:
    # Download to a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated archive at zip_file_path.
    partial_path = zip_file_path + ".part"
    urllib.request.urlretrieve(dataset_url, partial_path)
    os.replace(partial_path, zip_file_path)
    print("Dataset downloaded successfully!")

# Unzip the downloaded ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(download_dir)

print("Dataset unzipped successfully!")

In [None]:
# Check what the archive extraction produced.
!ls data/COIL-100/

In [None]:
# Rename the extracted folder to data/COIL-100/images.
# Done in Python and guarded so the cell is idempotent: a shell `mv` re-run
# after the target exists would move coil-100 *inside* images/ instead.
extracted_images_dir = "data/COIL-100/coil-100"
images_dir = "data/COIL-100/images"
if os.path.isdir(extracted_images_dir) and not os.path.exists(images_dir):
    os.rename(extracted_images_dir, images_dir)

In [None]:
# Directory holding the COIL-100 images and the names of its entries.
# The trailing slash matters: later cells build paths as `dirname + file`.
dirname = "data/COIL-100/images/"
filenames = os.listdir(dirname)

In [None]:
# Image files are named "obj<objId>__<imgId>.png"; every other entry is skipped.
# Raw string with explicit digit classes and a full-string match: the previous
# pattern's "[]+__[0-9]" was parsed as one character class and only matched
# the intended names by accident.
name_pattern = re.compile(r"obj([0-9]+)__([0-9]+)\.png")

labels = []
data = []
for file in tqdm(filenames):
    parsed = name_pattern.fullmatch(file)
    if parsed is None:
        continue
    img = Image.open(dirname + file)
    objId = int(parsed.group(1))
    imgId = int(parsed.group(2))
    data.append(np.array(img))
    # Keep both ids so the per-object rotation order can be recovered later.
    labels.append([objId, imgId])
data = np.asarray(data)
labels = np.asarray(labels)

In [None]:
# Display the image most recently opened by the loading loop above.
img.show()

In [None]:
# Create the output directory from Python: unlike a bare `mkdir`, this is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs('data/COIL-100/prepared', exist_ok=True)
# Persist the image stack and the [objId, imgId] label pairs.
np.save('data/COIL-100/prepared/data.npy', data)
np.save('data/COIL-100/prepared/labels.npy', labels)

# MNIST

In [None]:
import os
import urllib.request
import gzip
import shutil

# URLs for the MNIST training images and training labels (gzip-compressed).
images_url = "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
labels_url = "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"

# Directory to save the downloaded dataset
download_dir = "data/MNIST"

# Create the download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

def download_and_extract(url, file_path):
    """Download a gzip file and decompress it.

    The compressed file is stored at ``file_path + '.gz'`` and the decompressed
    bytes at ``file_path``. The ``.gz`` file is kept on purpose: later cells
    read the compressed file directly.

    If the ``.gz`` file already exists it is reused instead of being downloaded
    again, so re-running the notebook does not re-fetch the data.

    Args:
        url: Location of the gzip file.
        file_path: Destination path of the decompressed file (without ``.gz``).
    """
    gz_path = file_path + '.gz'
    if not os.path.exists(gz_path):
        # Download to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated file that would be reused later.
        partial_path = gz_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, gz_path)
    with gzip.open(gz_path, 'rb') as f_in, open(file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Download and extract the training images.
# download_and_extract stores the compressed copy at images_file_path + '.gz'.
images_file_path = os.path.join(download_dir, "train-images-idx3-ubyte")
download_and_extract(images_url, images_file_path)

# Download and extract the training labels (compressed copy kept alongside).
labels_file_path = os.path.join(download_dir, "train-labels-idx1-ubyte")
download_and_extract(labels_url, labels_file_path)

print("MNIST dataset downloaded and extracted successfully!")


In [None]:
# Verify that the compressed and decompressed training files are in place.
!ls data/MNIST

In [None]:
import gzip
image_size = 28
num_images = 60000

# Read the training images. `with` closes the gzip handle when done.
with gzip.open('data/MNIST/train-images-idx3-ubyte.gz','r') as f:
    f.read(16)  # skip the 16 header bytes that precede the pixel data
    buf = f.read(image_size * image_size * num_images)
data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
data = data.reshape(num_images, image_size, image_size)

# Read the training labels in a single call instead of one byte at a time.
with gzip.open('data/MNIST/train-labels-idx1-ubyte.gz','r') as f:
    f.read(8)  # skip the 8 header bytes that precede the labels
    buf = f.read(num_images)
labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
# Fail loudly on a truncated label file rather than saving a short array.
assert labels.shape == (num_images,), "label file is shorter than expected"

In [None]:
# Create the output directory from Python: unlike a bare `mkdir`, this is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs('data/MNIST/prepared/', exist_ok=True)
# Persist the training images and labels.
np.save('data/MNIST/prepared/train_data.npy', data)
np.save('data/MNIST/prepared/train_labels.npy', labels)

In [None]:
# Demonstration: render the first image. The array was loaded as float32, so it
# is cast back to uint8 before being handed to PIL.
Image.fromarray(data[0].astype(np.uint8))

In [None]:
import os
import urllib.request
import gzip
import shutil

# URLs for the MNIST test ("t10k") images and labels (gzip-compressed).
test_images_url = "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
test_labels_url = "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"

# Directory to save the downloaded dataset
download_dir = "data/MNIST"

# Create the download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

def download_and_extract(url, file_path):
    """Download a gzip file and decompress it.

    The compressed file is stored at ``file_path + '.gz'`` and the decompressed
    bytes at ``file_path``. The ``.gz`` file is kept on purpose: later cells
    read the compressed file directly.

    If the ``.gz`` file already exists it is reused instead of being downloaded
    again, so re-running the notebook does not re-fetch the data.

    NOTE(review): this repeats the helper defined in the MNIST training cell;
    consider defining it once near the top of the notebook.

    Args:
        url: Location of the gzip file.
        file_path: Destination path of the decompressed file (without ``.gz``).
    """
    gz_path = file_path + '.gz'
    if not os.path.exists(gz_path):
        # Download to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated file that would be reused later.
        partial_path = gz_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, gz_path)
    with gzip.open(gz_path, 'rb') as f_in, open(file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Download and extract the test images. The t10k file is stored locally under
# a "test-" name; the reading cell below opens it by that name.
images_file_path = os.path.join(download_dir, "test-images-idx3-ubyte")
download_and_extract(test_images_url, images_file_path)

# Download and extract the test labels (also stored under a "test-" name).
labels_file_path = os.path.join(download_dir, "test-labels-idx1-ubyte")
download_and_extract(test_labels_url, labels_file_path)

print("MNIST dataset downloaded and extracted successfully!")

In [None]:
# Verify that the test files now sit next to the training files.
!ls data/MNIST

In [None]:
import gzip
image_size = 28
num_images = 10000

# Read the test images. `with` closes the gzip handle when done.
with gzip.open('data/MNIST/test-images-idx3-ubyte.gz','r') as f:
    f.read(16)  # skip the 16 header bytes that precede the pixel data
    buf = f.read(image_size * image_size * num_images)
data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
data = data.reshape(num_images, image_size, image_size)

# Read the test labels in a single call instead of one byte at a time.
with gzip.open('data/MNIST/test-labels-idx1-ubyte.gz','r') as f:
    f.read(8)  # skip the 8 header bytes that precede the labels
    buf = f.read(num_images)
labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
# Fail loudly on a truncated label file rather than saving a short array.
assert labels.shape == (num_images,), "label file is shorter than expected"

In [None]:
# Persist the test images and labels.
# data/MNIST/prepared must already exist (it is created when the training
# arrays are saved).
np.save('data/MNIST/prepared/test_data.npy', data)
np.save('data/MNIST/prepared/test_labels.npy', labels)

# F-MNIST

In [None]:
import os
import urllib.request
import gzip
import shutil


download_dir = "data/F-MNIST"

def download_and_extract(url, download_dir):
    """Download a gzip file into ``download_dir`` and decompress it there.

    The local file name is the last path component of ``url``; the decompressed
    file has the same name without its final extension. The compressed file is
    kept on purpose: later cells read the ``.gz`` files directly.

    If the compressed file already exists it is reused instead of being
    downloaded again, so re-running the notebook does not re-fetch the data.

    NOTE(review): this redefines ``download_and_extract`` with a different
    second parameter than the MNIST cells (a directory, not a file path).
    Running the MNIST cells after this one would call the wrong helper.

    Args:
        url: Location of the gzip file.
        download_dir: Directory receiving both files; created if missing.
    """
    os.makedirs(download_dir, exist_ok=True)

    filename = os.path.basename(url)
    gzip_file_path = os.path.join(download_dir, filename)
    binary_file_path = os.path.splitext(gzip_file_path)[0]

    fetched = not os.path.exists(gzip_file_path)
    if fetched:
        # Download to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated file that would be reused later.
        partial_path = gzip_file_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, gzip_file_path)

    with gzip.open(gzip_file_path, 'rb') as f_in, open(binary_file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    action = "downloaded" if fetched else "found locally"
    print(f"{filename} {action} and extracted successfully!")

# F-MNIST archive locations: test split first, then the training split.
test_images_url = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz"
test_labels_url = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz"
train_images_url = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz"
train_labels_url = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz"

# Fetch and decompress all four files into download_dir, in the order
# test images, test labels, train images, train labels.
for url in (test_images_url, test_labels_url, train_images_url, train_labels_url):
    download_and_extract(url, download_dir)

In [None]:
import gzip
image_size = 28
num_images = 60000

# Read the training images. `with` closes the gzip handle when done.
with gzip.open('data/F-MNIST/train-images-idx3-ubyte.gz','r') as f:
    f.read(16)  # skip the 16 header bytes that precede the pixel data
    buf = f.read(image_size * image_size * num_images)
data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
data = data.reshape(num_images, image_size, image_size)

# Read the training labels in a single call instead of one byte at a time.
with gzip.open('data/F-MNIST/train-labels-idx1-ubyte.gz','r') as f:
    f.read(8)  # skip the 8 header bytes that precede the labels
    buf = f.read(num_images)
labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
# Fail loudly on a truncated label file rather than saving a short array.
assert labels.shape == (num_images,), "label file is shorter than expected"

In [None]:
# Demonstration: render the first image. The array was loaded as float32, so it
# is cast back to uint8 before being handed to PIL.
Image.fromarray(data[0].astype(np.uint8))

In [None]:
# Create the output directory from Python: unlike a bare `mkdir`, this is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs('data/F-MNIST/prepared', exist_ok=True)
# Persist the training images and labels.
np.save('data/F-MNIST/prepared/train_data.npy', data)
np.save('data/F-MNIST/prepared/train_labels.npy', labels)

In [None]:
image_size = 28
num_images = 10000

# Read the test images. `with` closes the gzip handle when done.
with gzip.open('data/F-MNIST/t10k-images-idx3-ubyte.gz','r') as f:
    f.read(16)  # skip the 16 header bytes that precede the pixel data
    buf = f.read(image_size * image_size * num_images)
data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
data = data.reshape(num_images, image_size, image_size)

# Read the test labels in a single call instead of one byte at a time.
with gzip.open('data/F-MNIST/t10k-labels-idx1-ubyte.gz','r') as f:
    f.read(8)  # skip the 8 header bytes that precede the labels
    buf = f.read(num_images)
labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
# Fail loudly on a truncated label file rather than saving a short array.
assert labels.shape == (num_images,), "label file is shorter than expected"

In [None]:
# Persist the test images and labels.
# data/F-MNIST/prepared must already exist (it is created when the training
# arrays are saved).
np.save('data/F-MNIST/prepared/test_data.npy', data)
np.save('data/F-MNIST/prepared/test_labels.npy', labels)

# CIFAR-10

In [None]:
import os
import pickle

In [None]:
import os
import urllib.request
import tarfile

# URL for the CIFAR-10 dataset
cifar10_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

# Directory to save the downloaded dataset (created if missing).
download_dir = "data/CIFAR-10"
os.makedirs(download_dir, exist_ok=True)

# Path to save the downloaded tar.gz file
tar_file_path = os.path.join(download_dir, "cifar-10-python.tar.gz")

# Download only when the archive is not already on disk. The archive is kept
# after extraction so re-runs can reuse it.
if not os.path.exists(tar_file_path):
    # Download to a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated archive at tar_file_path.
    partial_path = tar_file_path + ".part"
    urllib.request.urlretrieve(cifar10_url, partial_path)
    os.replace(partial_path, tar_file_path)

# Extract the tar.gz file
with tarfile.open(tar_file_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # The archive comes from the network: use the 'data' extraction filter
        # where this Python version supports it, so members cannot escape
        # download_dir or create special files.
        tar.extractall(download_dir, filter='data')
    else:
        tar.extractall(download_dir)

print("CIFAR-10 dataset downloaded and extracted successfully!")


In [None]:
# List the batch files produced by extracting the archive.
!ls data/CIFAR-10/cifar-10-batches-py

In [None]:
def unpickle(file):
    """Load one pickled file and return the unpickled object.

    The file is opened in binary mode and loaded with ``encoding='bytes'``;
    callers in this notebook index the result with byte keys such as ``b'data'``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle, encoding='bytes')


# Directory containing the extracted CIFAR-10 batch files (trailing slash
# required: paths are built as `dirname + file`).
dirname = 'data/CIFAR-10/cifar-10-batches-py/'

In [None]:
# Inspect the entries of the batch directory.
os.listdir(dirname)

In [None]:
# Training batches are the entries whose name contains 'data_batch'; sorting
# fixes the order in which they are concatenated later.
train_files = sorted(name for name in os.listdir(dirname) if 'data_batch' in name)

In [None]:
# Load every training batch once, then collect the per-batch data arrays and
# flatten the per-batch label lists into a single list.
batches = [unpickle(dirname + file) for file in train_files]
data = [batch[b'data'] for batch in batches]
labels = [label for batch in batches for label in batch[b'labels']]

In [None]:
# Shape of the training set once the per-batch arrays are stacked.
np.concatenate(data).shape

In [None]:
# Create the output directory from Python: unlike a bare `mkdir`, this is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs('data/CIFAR-10/prepared', exist_ok=True)
# Persist the training labels and the stacked training batches.
np.save('data/CIFAR-10/prepared/train_labels.npy', np.array(labels))
np.save('data/CIFAR-10/prepared/train_data.npy', np.concatenate(data, axis=0))

In [None]:
# Load the single test batch; its data is used as stored and its labels are
# converted to an array.
loaded = unpickle(dirname+'test_batch')
test_data = loaded[b'data']
test_labels = np.array(loaded[b'labels'])

In [None]:
# Persist the test split.
# data/CIFAR-10/prepared must already exist (it is created when the training
# arrays are saved).
np.save('data/CIFAR-10/prepared/test_labels.npy', test_labels)
np.save('data/CIFAR-10/prepared/test_data.npy', test_data)

## Spheres

In [None]:
from src.spheres import create_sphere_dataset

In [None]:
!pip install tadasets

In [None]:
import numpy as np 
#import tadasets 
import matplotlib
import matplotlib.pyplot as plt
from src.custom_shapes import dsphere 

from IPython import embed
# Sampling configuration.
plot = True
n_samples = 1000   # points per small sphere
d = 100            # sphere dimension passed to dsphere
n_spheres = 11     # ten small spheres plus one large enclosing sphere
r = 5              # radius of the small spheres

# Scale of the random shifts, rescaled by sqrt(d) so that the big sphere stays
# around the inner spheres. Note that np.random.normal takes this value as its
# `scale` argument, i.e. as a standard deviation.
variance = 20 / np.sqrt(d)

# One random shift vector per sphere (only the first n_spheres-1 rows are used).
shift_matrix = np.random.normal(0, variance, [n_spheres, d + 1])

# Small spheres: sample each one and translate it by its own shift vector.
spheres = []
n_datapoints = 0
for i in range(n_spheres - 1):
    sphere = dsphere(n=n_samples, d=d, r=r)[0]
    spheres.append(sphere + shift_matrix[i, :])
    n_datapoints += n_samples

# Additional big surrounding sphere: ten times the samples, five times the
# radius, and no shift applied.
n_samples_big = 10 * n_samples
big = dsphere(n=n_samples_big, d=d, r=r * 5)[0]
spheres.append(big)
n_datapoints += n_samples_big

In [None]:
# 3D scatter of the small spheres (one colour each) plus the big sphere.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection="3d")
colors = matplotlib.cm.rainbow(np.linspace(0, 1, n_spheres-1))

# Small spheres, shown through coordinates 77, 78 and 79.
for data, color in zip(spheres[:-1], colors):
    ax.scatter(*data[:, 77:80].T, c=[color], s=5)

# The last (big) sphere is drawn in translucent gray.
# NOTE(review): it is shown through coordinates 0, 1 and 2, not the 77-79
# used for the small spheres - confirm the mismatch is intentional.
gray_color = (0.5, 0.5, 0.5)
alpha = 0.1
last_sphere = spheres[-1]
ax.scatter(*last_sphere[:, :3].T, c=[gray_color], s=5, alpha=alpha)

# Camera: 20 degrees elevation, 45 degrees azimuth.
ax.view_init(elev=20, azim=45)
plt.show()


In [None]:
# Stack all spheres into one array and label every point with the index of the
# sphere it came from (labels are stored as floats, as np.zeros produces them).
dataset = np.concatenate(spheres, axis=0)
labels = np.zeros(n_datapoints)
label_index = 0
for index, data in enumerate(spheres):
    n_sphere_samples = len(data)
    block_end = label_index + n_sphere_samples
    labels[label_index:block_end] = index
    label_index = block_end

In [None]:
# 3D scatter of the small spheres only, through coordinates 0, 1 and 2.
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection="3d")
# NOTE(review): the palette has n_spheres entries but only n_spheres-1 spheres
# are drawn, so colours differ from the previous figure - confirm intended.
colors = matplotlib.cm.rainbow(np.linspace(0, 1, n_spheres))
for data, color in zip(spheres[:-1], colors):
    ax.scatter(*data[:, :3].T, c=[color], s=5)
plt.show()

In [None]:
# train test split: hold out a fixed number of points from every sphere.
n_test_per_class = 250

ids = []
for l in np.unique(labels):
    # Look up the rows of this class from the labels themselves, instead of
    # assuming each class starts at i*1000 and has at least 1000 points.
    class_ids = np.flatnonzero(labels == l)
    # NOTE: sampling uses the global NumPy random state, which this notebook
    # does not seed, so the split differs between runs.
    idx = np.random.choice(class_ids, replace=False, size=n_test_per_class)
    ids.append(idx)
test_ids = np.concatenate(ids)
test_data, test_labels = dataset[test_ids], labels[test_ids]

# Everything not held out is training data. setdiff1d returns sorted integer
# indices, so the training order does not depend on set iteration order.
train_ids = np.setdiff1d(np.arange(len(dataset)), test_ids)
train_data, train_labels = dataset[train_ids], labels[train_ids]

In [None]:
# Create the output directory for the Spheres arrays (-p: parents are created
# and an existing directory is not an error).
!mkdir -p data/Spheres/prepared

In [None]:
# Persist the Spheres train/test split.
np.save('data/Spheres/prepared/train_data.npy', train_data)
np.save('data/Spheres/prepared/train_labels.npy', train_labels)
np.save('data/Spheres/prepared/test_data.npy', test_data)
np.save('data/Spheres/prepared/test_labels.npy', test_labels)

## Swiss Roll

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_swiss_roll

In [None]:
# Generate the data (random_state is fixed, so the sample is reproducible)
n_samples = 20000  # Number of samples
noise = 0.2  # Noise level
X, color = make_swiss_roll(n_samples=n_samples, noise=noise, random_state=42)

# X is a (n_samples, 3) array, containing the 3D points
# color is a (n_samples,) array holding each point's position along the roll;
# it is used to colour the plot and is later saved as the labels

In [None]:
# Plot the roll in 3D, coloured by each point's position along the roll
fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral,s=100)
ax.set_title("Swiss Roll")
#ax.view_init(elev=0, azim=90)  # Adjust azim angle to 90 degrees
plt.show()

In [None]:
# Splitting the data and labels into training and testing sets (70% / 30%,
# fixed random_state). The "labels" here are the continuous `color` values,
# not class ids.
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(X, color, test_size=0.3, random_state=42)

In [None]:
# Name of the dataset folder under data/; IPython substitutes {dataset_name}
# into the shell command below.
dataset_name = 'Swiss'
!mkdir -p data/{dataset_name}/prepared

In [None]:
# Save the splits to .npy files under data/<dataset_name>/prepared
np.save(f'data/{dataset_name}/prepared/train_data.npy', train_data)
np.save(f'data/{dataset_name}/prepared/train_labels.npy', train_labels)
np.save(f'data/{dataset_name}/prepared/test_data.npy', test_data)
np.save(f'data/{dataset_name}/prepared/test_labels.npy', test_labels)