In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import torchvision as tv
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

# Use the GPU only when CUDA is actually usable. `is_available` has to be
# *called*: the bare function object is always truthy, which would select
# 'cuda:0' even on a CPU-only machine.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# os.cpu_count() returns None when the count cannot be determined; fall back
# to 0 (load in the main process) so DataLoader always receives an int.
num_workers = os.cpu_count() or 0
print(num_workers)

cuda:0
6


In [2]:
# Provide a fixed seed for every RNG the notebook relies on: NumPy drives the
# row permutation used for the train/test split, PyTorch drives the
# DataLoader shuffling.
seed = 2**12
np.random.seed(seed)
torch.manual_seed(seed);  # trailing ';' keeps the returned Generator out of the cell output

In [3]:
# Notebook-wide configuration: mini-batch size and the folder (relative to
# the working directory) that holds the raw dataset files.
batch_size = 128
root = os.path.join('.', 'dataset_root')
print(f'Path directory: {root}')

Path directory: ./dataset_root


In [4]:
def scale_normalize(data, xmax, xmin):
    """Min-max scale the columns of `data` using the supplied bounds.

    Computes ``(data - xmin) / (xmax - xmin)`` column-wise. Values inside
    ``[xmin, xmax]`` map to ``[0, 1]``; values outside the bounds (e.g. test
    rows scaled with training bounds) fall outside that interval.

    Parameters
    ----------
    data : np.ndarray
        Array whose second dimension indexes the features.
    xmax, xmin : np.ndarray
        Per-feature upper and lower bounds, one entry per column of `data`.

    Returns
    -------
    np.ndarray
        The scaled copy of `data`; the input is not modified.

    Raises
    ------
    AssertionError
        If an argument is not a numpy array or the bound lengths do not
        match the number of columns.
    """
    # isinstance (rather than an exact type comparison) also admits ndarray
    # subclasses.
    assert (isinstance(data, np.ndarray) and
        isinstance(xmax, np.ndarray) and
        isinstance(xmin, np.ndarray)), \
        'data, xmax and xmin must be numpy arrays'

    assert data.shape[1] == len(xmax) and data.shape[1] == len(xmin), \
        'xmax and xmin need one entry per column of data'

    span = xmax - xmin
    # A constant column has zero range; dividing by it would yield NaN/inf.
    # Dividing by 1 instead leaves such a column at (data - xmin), i.e. 0
    # for the values that defined the bounds.
    span[span == 0] = 1

    return (data - xmin) / span

In [5]:
class NumeralDataset(torch.utils.data.Dataset):
    """In-memory dataset that pairs a tensor of samples with a tensor of labels.

    Indexing returns the ``(sample, label)`` pair at that position, indexing
    both tensors along their first dimension.
    """

    def __init__(self, data, label):
        """Store the sample and label tensors.

        Parameters
        ----------
        data : torch.Tensor
            Samples, indexed along the first dimension.
        label : torch.Tensor
            Labels, one per sample.

        Raises
        ------
        AssertionError
            If either argument is not a tensor or their lengths differ.
        """
        assert isinstance(data, torch.Tensor) \
            and isinstance(label, torch.Tensor), \
            'data and label must both be torch.Tensor'
        # Catch a sample/label mismatch up front instead of failing with an
        # index error part-way through iteration.
        assert len(data) == len(label), \
            f'data has {len(data)} rows but label has {len(label)}'

        self.data = data
        self.label = label

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at `index`."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

## Banknote Authentication

- Input: 4
- Output: {0, 1}
- Classes: Binary classification. 
- Samples: 1372
- 80:20 split 
- Train: 1098
- Test: 274
- Rescaling to [0, 1]

https://archive.ics.uci.edu/ml/datasets/banknote+authentication  
https://www.kaggle.com/ritesaluja/bank-note-authentication-uci-data#BankNote_Authentication.csv

In [6]:
# Full path of the raw banknote file inside the dataset root.
# (Despite the `_dir` suffix this names a file, not a directory.)
banknote_filename = 'data_banknote_authentication.txt'
banknote_dir = os.path.join(root, banknote_filename)
print(banknote_dir)

./dataset_root/data_banknote_authentication.txt


In [7]:
# Load the raw file. It is read without a header row, so the column names
# are supplied here; everything is parsed as float32.
banknote_frame = pd.read_csv(
    banknote_dir,
    header=None,
    names=['variance', 'skewness', 'curtosis', 'entropy', 'class'],
    dtype=np.float32
)
print(banknote_frame.shape)
n = len(banknote_frame.index)
m = banknote_frame.shape[1] - 1  # every column except the trailing 'class' label
print(f'{n} samples, {m} attributes')

# Shuffle the rows with the (seeded) NumPy RNG before splitting.
shuffle_idx = np.random.permutation(n)
assert len(shuffle_idx) == banknote_frame.shape[0]
banknote_frame = banknote_frame.iloc[shuffle_idx]

# 80:20 split
n_train = int(np.round(n * .8))
n_test = n - n_train
assert n_train + n_test == n
print(f'Train: {n_train}, Test: {n_test}')

# only apply scaling to [0, 1]
# The bounds come from the training rows only and are reused for the test
# rows, so no test-set statistics enter the preprocessing.
banknote_data_train = banknote_frame.iloc[:n_train, :m].values
x_max = np.max(banknote_data_train, axis=0)
x_min = np.min(banknote_data_train, axis=0)
banknote_data_train = scale_normalize(banknote_data_train, x_max, x_min)
assert banknote_data_train.shape == (n_train, m)

# Slice the test rows from n_train onward: a negative slice such as
# [-n_test:] would select *every* row if n_test were 0.
banknote_data_test = banknote_frame.iloc[n_train:, :m].values
banknote_data_test = scale_normalize(banknote_data_test, x_max, x_min)
assert banknote_data_test.shape == (n_test, m)
print(banknote_data_train.shape, banknote_data_test.shape)

# Labels are the last column, split at the same row boundary as the features.
banknote_label_train = banknote_frame.iloc[:n_train, -1].values
assert banknote_label_train.shape == (n_train,)
banknote_label_test = banknote_frame.iloc[n_train:, -1].values
assert banknote_label_test.shape == (n_test,)
print(banknote_label_train.shape, banknote_label_test.shape)

(1372, 5)
1372 samples, 4 attributes
Train: 1098, Test: 274
(1098, 4) (274, 4)
(1098,) (274,)


In [8]:
# Wrap the NumPy splits as float32 tensors. torch.tensor(..., dtype=...)
# states the dtype explicitly instead of relying on the legacy torch.Tensor
# constructor's default.
dataset_banknote_train = NumeralDataset(
    torch.tensor(banknote_data_train, dtype=torch.float32),
    torch.tensor(banknote_label_train, dtype=torch.float32))
dataset_banknote_test = NumeralDataset(
    torch.tensor(banknote_data_test, dtype=torch.float32),
    torch.tensor(banknote_label_test, dtype=torch.float32))

dataloader_banknote_train = DataLoader(
    dataset_banknote_train,
    batch_size,
    shuffle=True,
    num_workers=num_workers)

# Evaluation does not benefit from shuffling; a fixed order keeps the test
# batches aligned with banknote_label_test and reproducible across runs.
dataloader_banknote_test = DataLoader(
    dataset_banknote_test,
    batch_size,
    shuffle=False,
    num_workers=num_workers)

# check batch size
# The first training batch is full as long as the training set holds at
# least batch_size rows.
sample, label = next(iter(dataloader_banknote_train))
assert sample.size() == (batch_size, m) and label.size() == (batch_size,)

## Iris Data Set

https://archive.ics.uci.edu/ml/datasets/Iris  
https://www.kaggle.com/uciml/iris

In [9]:
# Placeholders for the Iris frame and its train/test splits; nothing is
# loaded in this cell.
iris_frame = None
iris_data_train = iris_data_test = None
iris_label_train = iris_label_test = None

In [10]:
# Placeholders for the Iris datasets and data loaders; nothing is built in
# this cell.
dataset_iris_train = dataset_iris_test = None
dataloader_iris_train = dataloader_iris_test = None

## Breast Cancer Wisconsin (Diagnostic) Data Set

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)  
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [11]:
# Placeholders for the breast-cancer frame and its train/test splits;
# nothing is loaded in this cell.
bc_frame = None
bc_data_train = bc_data_test = None
bc_label_train = bc_label_test = None

In [12]:
# Placeholders for the breast-cancer datasets and data loaders; nothing is
# built in this cell.
dataset_bc_train = dataset_bc_test = None
dataloader_bc_train = dataloader_bc_test = None

## Seeds of Wheat Data Set

https://archive.ics.uci.edu/ml/datasets/seeds  
https://www.kaggle.com/dongeorge/seed-from-uci

In [13]:
# Placeholders for the wheat-seeds frame and its train/test splits; nothing
# is loaded in this cell.
seed_frame = None
seed_data_train = seed_data_test = None
seed_label_train = seed_label_test = None

In [14]:
# Placeholders for the wheat-seeds datasets and data loaders; nothing is
# built in this cell.
dataset_seed_train = dataset_seed_test = None
dataloader_seed_train = dataloader_seed_test = None

## HTRU2 Data Set

- PREDICTING A PULSAR STAR

https://archive.ics.uci.edu/ml/datasets/HTRU2  
https://www.kaggle.com/pavanraj159/predicting-a-pulsar-star

In [15]:
# Placeholders for the HTRU2 frame and its train/test splits; nothing is
# loaded in this cell.
htru2_frame = None
htru2_data_train = htru2_data_test = None
htru2_label_train = htru2_label_test = None

In [16]:
# Placeholders for the HTRU2 datasets and data loaders; nothing is built in
# this cell.
dataset_htru2_train = dataset_htru2_test = None
dataloader_htru2_train = dataloader_htru2_test = None