### Setup

In [1]:
import os
import torchvision 

# Root folder under which every exported dataset gets its own sub-folder.
DATA_ROOT = "data"
os.makedirs(DATA_ROOT, exist_ok = True)

# Per-dataset output folders. The constant names are referenced by the cells
# below, so their spelling is kept exactly as-is.
FASHIONNIST_PATH = os.path.join(DATA_ROOT, "fashion_mnist")
CIFAR10_PATH = os.path.join(DATA_ROOT, "cifar10")
CIFAR100_PATH = os.path.join(DATA_ROOT, "cifar100")
NEWSGROUPS_PATH = os.path.join(DATA_ROOT, "newsgroups")

# Shared torchvision converter object made available to the image cells.
to_pil = torchvision.transforms.ToPILImage()

### FASHION MNIST

In [None]:
# Export Fashion-MNIST as one PNG per sample, named x{index}_y{label}.png,
# under FASHIONNIST_PATH/train and FASHIONNIST_PATH/test.

# Create folder
os.makedirs(FASHIONNIST_PATH, exist_ok = True)

# Download dataset
os.makedirs("fmnist_train", exist_ok = True)
os.makedirs("fmnist_test", exist_ok = True)
train_dataset = torchvision.datasets.FashionMNIST(root = "fmnist_train", train = True, download = True)
test_dataset = torchvision.datasets.FashionMNIST(root = "fmnist_test", train = False, download = True)

# No `transform` is passed above, so indexing a dataset yields (PIL image, int label).
# ToPILImage only accepts tensors / ndarrays and raises TypeError when handed a
# PIL image, so the image is written to disk directly.
for split, dataset in (("train", train_dataset), ("test", test_dataset)):
    split_dir = os.path.join(FASHIONNIST_PATH, split)
    os.makedirs(split_dir, exist_ok = True)
    for i in range(len(dataset)):
        img, label = dataset[i]
        img.save(os.path.join(split_dir, f"x{i}_y{label}.png"))

### CIFAR 10

In [None]:
# Export CIFAR-10 as one PNG per sample, named x{index}_y{label}.png,
# under CIFAR10_PATH/train and CIFAR10_PATH/test.

# Create folder
os.makedirs(CIFAR10_PATH, exist_ok = True)

# Download dataset
os.makedirs("c10_train", exist_ok = True)
os.makedirs("c10_test", exist_ok = True)
train_dataset = torchvision.datasets.CIFAR10(root = "c10_train", train = True, download = True)
test_dataset = torchvision.datasets.CIFAR10(root = "c10_test", train = False, download = True)

# No `transform` is passed above, so indexing a dataset yields (PIL image, int label).
# ToPILImage only accepts tensors / ndarrays and raises TypeError when handed a
# PIL image, so the image is written to disk directly.
for split, dataset in (("train", train_dataset), ("test", test_dataset)):
    split_dir = os.path.join(CIFAR10_PATH, split)
    os.makedirs(split_dir, exist_ok = True)
    for i in range(len(dataset)):
        img, label = dataset[i]
        img.save(os.path.join(split_dir, f"x{i}_y{label}.png"))

### CIFAR 100

In [None]:
# Export CIFAR-100 as one PNG per sample, named x{index}_y{label}.png,
# under CIFAR100_PATH/train and CIFAR100_PATH/test.

# Create folder
os.makedirs(CIFAR100_PATH, exist_ok = True)

# Download dataset
# CIFAR-100 gets its own download roots instead of sharing the CIFAR-10 ones.
os.makedirs("c100_train", exist_ok = True)
os.makedirs("c100_test", exist_ok = True)
train_dataset = torchvision.datasets.CIFAR100(root = "c100_train", train = True, download = True)
test_dataset = torchvision.datasets.CIFAR100(root = "c100_test", train = False, download = True)

# No `transform` is passed above, so indexing a dataset yields (PIL image, int label).
# ToPILImage only accepts tensors / ndarrays and raises TypeError when handed a
# PIL image, so the image is written to disk directly.
for split, dataset in (("train", train_dataset), ("test", test_dataset)):
    split_dir = os.path.join(CIFAR100_PATH, split)
    os.makedirs(split_dir, exist_ok = True)
    for i in range(len(dataset)):
        img, label = dataset[i]
        img.save(os.path.join(split_dir, f"x{i}_y{label}.png"))

### NEWSGROUPS

In [2]:
import re
import torch
import torch.nn as nn 
import zipfile 
import urllib.request 
import numpy as np 
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Output folders for the exported samples (makedirs also creates NEWSGROUPS_PATH itself)
for split in ("train", "test"):
    os.makedirs(os.path.join(NEWSGROUPS_PATH, split), exist_ok = True)

# Fetch the whole corpus with the 'headers', 'footers' and 'quotes' parts removed
data = fetch_20newsgroups(subset = 'all', remove = ('headers', 'footers', 'quotes'))
texts = data['data']
labels = data['target']

# Preprocessing
def preprocess(text):
    """Turn a raw document into a list of lowercase tokens without stop words.

    The text is lowercased, every character other than a-z and whitespace is
    deleted (not replaced by a space), the remainder is split on whitespace,
    and tokens contained in ENGLISH_STOP_WORDS are dropped.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept = []
    for word in letters_only.split():
        if word not in ENGLISH_STOP_WORDS:
            kept.append(word)
    return kept

def encode(tokens):
    """Map tokens to integer ids via the module-level `word2idx`.

    Tokens missing from the vocabulary are mapped to the id of '<UNK>'.
    """
    ids = []
    for token in tokens:
        ids.append(word2idx.get(token, word2idx['<UNK>']))
    return ids

# Tokenise every document once; the token lists feed both the vocabulary and the encoding.
tokenized_texts = [preprocess(doc) for doc in texts]

# Vocabulary: ids 0 and 1 are reserved, every other token is numbered in order
# of first appearance.
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx = 2
for tokens in tokenized_texts:
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = idx
            idx += 1

# dtype is pinned to int64: a document that is empty after preprocessing would
# otherwise become a float32 tensor (torch.tensor([]) uses the default float
# dtype), and the padded batch has to be an integer tensor to index nn.Embedding.
texts_ids = [torch.tensor(encode(tokens), dtype = torch.long) for tokens in tokenized_texts]

# Right-pad every sequence with the '<PAD>' id (0) up to the longest document:
# the result has shape (n_documents, max_len).
X_pad = pad_sequence(texts_ids, batch_first = True, padding_value = 0)

# Download Glove
# The archive is unpacked into `glove_dir`, so that is where the 50-d vectors
# are looked up first. A copy placed directly in NEWSGROUPS_PATH is also
# accepted, so a manually placed file keeps working.
glove_file = 'glove.6B.50d.txt'
glove_dir = os.path.join(NEWSGROUPS_PATH, "glove")
glove_zip_path = os.path.join(NEWSGROUPS_PATH, 'glove.6B.zip')
glove_txt_path = os.path.join(glove_dir, glove_file)
glove_txt_fallback = os.path.join(NEWSGROUPS_PATH, glove_file)
os.makedirs(glove_dir, exist_ok = True)

if os.path.exists(glove_txt_path):
    print("GloVe already downloaded.")
elif os.path.exists(glove_txt_fallback):
    glove_txt_path = glove_txt_fallback
    print("GloVe already downloaded.")
else:
    # Reuse an archive from an earlier run; is_zipfile is False for a missing
    # file and for one without a zip end-of-archive record (e.g. a cut-off download).
    if not zipfile.is_zipfile(glove_zip_path):
        print("Downloading GloVe embeddings...")
        url = 'https://nlp.stanford.edu/data/glove.6B.zip'
        urllib.request.urlretrieve(url, glove_zip_path)
    print("Extracting...")
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        # Only the 50-d vectors are used, so the other members are not unpacked.
        zip_ref.extract(glove_file, glove_dir)
    print("GloVe download and extraction complete.")

# Embedding
# Parse the GloVe text file: the first field of each line is the word, the
# remaining fields are the components of its vector.
embedding_index = {}
with open(glove_txt_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        embedding_index[word] = vector

embedding_dim = 50
vocab_size = len(word2idx)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Seeded generator so the vectors drawn for words missing from GloVe (and
# therefore the exported files) are identical from run to run.
rng = np.random.default_rng(0)

for word, i in word2idx.items():
    if word == '<PAD>':
        # Keep the padding row at zero. `padding_idx` below does not zero the
        # row of a pretrained matrix, so a random vector here would be written
        # into every padded position of every document.
        continue
    vector = embedding_index.get(word)
    if vector is None:
        vector = rng.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix[i] = vector

embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)
embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=False, padding_idx=0)

# The layer is only used as a lookup table here, so no gradients are tracked.
# NOTE(review): the result holds n_documents * max_len * embedding_dim floats;
# with long documents this can be very large - confirm it fits in memory.
with torch.no_grad():
    X_embedding = embedding_layer(X_pad)

# (n_documents, max_len, embedding_dim) -> (n_documents, embedding_dim, max_len)
X_embedding = torch.swapaxes(X_embedding, 1, 2).numpy()

# Split
# For every class, the first `test_samples_per_class` documents (in corpus
# order) go to the test set and all remaining ones to the training set.
test_samples_per_class = 100
unique_labels = np.unique(labels)

# Positions of the documents of each class, one array per label
positions_per_class = [np.where(labels == label)[0] for label in unique_labels]

test_indices = np.array([
    position
    for positions in positions_per_class
    for position in positions[:test_samples_per_class]
])
train_indices = np.array([
    position
    for positions in positions_per_class
    for position in positions[test_samples_per_class:]
])

X_train, y_train = X_embedding[train_indices], labels[train_indices]
X_test, y_test = X_embedding[test_indices], labels[test_indices]

# Save files
# One text file per sample, named x{position within the split}_y{label}.txt.
# `total` is passed explicitly because enumerate/zip hide the length from tqdm,
# which would otherwise show only a counter without a bar or an ETA.
for split, X_split, y_split in (("train", X_train, y_train), ("test", X_test, y_test)):
    split_dir = os.path.join(NEWSGROUPS_PATH, split)
    for i, (x, y) in tqdm(enumerate(zip(X_split, y_split)), total = len(X_split), desc = split):
        fname = f"x{i}_y{y}.txt"
        np.savetxt(os.path.join(split_dir, fname), x)

GloVe already downloaded.
