In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import torchvision as tv
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

# Use the first CUDA GPU when one is actually usable, otherwise the CPU.
# `is_available` has to be *called*: the bare function object is always
# truthy, which would select 'cuda:0' even on CPU-only machines.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# os.cpu_count() may return None when the count cannot be determined;
# fall back to 0, which makes DataLoader load data in the main process.
num_workers = os.cpu_count() or 0
print(num_workers)

cuda:0
6


In [2]:
# provide fixed seed
seed = 2**12
# NumPy's global RNG drives the row permutations used to shuffle the frames.
np.random.seed(seed)
# torch's global RNG drives DataLoader(shuffle=True); seed it as well so the
# batch order is reproducible too.
torch.manual_seed(seed)

In [3]:
# Mini-batch size shared by the DataLoaders built in this notebook.
batch_size = 128

# Folder holding the raw dataset files, relative to the working directory.
root = os.path.join('.', 'dataset_root')
print(f'Path directory: {root}')

Path directory: ./dataset_root


In [4]:
def split_dataframe(frame, train_ratio=.8):
    """Split a frame into train/test feature and label arrays.

    The last column is treated as the integer label; every preceding column
    is a feature. Rows are taken in their current order (the leading rows
    become the training set, the trailing rows the test set), so shuffle the
    frame beforehand if a random split is wanted.

    Args:
        frame: pandas DataFrame whose last column holds integer labels.
        train_ratio: float strictly between 0 and 1, the fraction of rows
            assigned to the training set (rounded to the nearest row count).

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of NumPy arrays with
        shapes ``(n_train, m)``, ``(n_train,)``, ``(n_test, m)``,
        ``(n_test,)``.

    Raises:
        AssertionError: if the argument types, the ratio range, the label
            dtype or the resulting shapes are not as expected.
    """
    assert isinstance(frame, pd.DataFrame)
    assert isinstance(train_ratio, float) and 0. < train_ratio < 1.

    n, m = frame.shape
    m = m - 1  # not include output column
    # expecting last column (y) has integer values
    last_col = frame.columns[m]
    assert frame[last_col].dtype in (np.int8, np.int16, np.int32, np.int64), \
        f'{frame[last_col].dtype}'

    n_train = int(np.round(n * train_ratio))
    n_test = n - n_train

    x_train = frame.iloc[:n_train, :m].values
    y_train = frame.iloc[:n_train, -1].values
    # Slice from n_train onwards instead of using `-n_test:`; when rounding
    # leaves n_test == 0, `-0:` would select *every* row instead of none.
    x_test = frame.iloc[n_train:, :m].values
    y_test = frame.iloc[n_train:, -1].values

    # checking shapes
    assert x_train.shape == (n_train, m)
    assert y_train.shape == (n_train,)
    assert x_test.shape == (n_test, m)
    assert y_test.shape == (n_test,)

    return x_train, y_train, x_test, y_test

In [5]:
def scale_normalize(data, xmax, xmin):
    """Min-max scale each column of ``data`` using the given bounds.

    Computes ``(data - xmin) / (xmax - xmin)`` column-wise. Values inside
    ``[xmin, xmax]`` map to ``[0, 1]``; values outside the bounds map outside
    that interval. A column whose bounds coincide (``xmax == xmin``) is mapped
    to ``data - xmin`` (0 for values equal to the bound) instead of NaN/inf.

    Args:
        data: 2-D NumPy array of shape ``(rows, columns)``.
        xmax: 1-D NumPy array with one upper bound per column.
        xmin: 1-D NumPy array with one lower bound per column.

    Returns:
        NumPy array with the same shape as ``data``.

    Raises:
        AssertionError: if an argument is not a NumPy array or the number of
            bounds does not match the number of columns.
    """
    assert (isinstance(data, np.ndarray) and
            isinstance(xmax, np.ndarray) and
            isinstance(xmin, np.ndarray))
    assert data.shape[1] == len(xmax) and data.shape[1] == len(xmin)

    span = xmax - xmin
    # Avoid 0/0 for constant columns: divide by 1 there (same dtype as span
    # so the result dtype is unchanged).
    span = np.where(span == 0, span.dtype.type(1), span)
    return (data - xmin) / span

In [6]:
class NumeralDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing a tensor of samples with a tensor of labels.

    Item ``i`` is the tuple ``(data[i], label[i])``; the dataset length is
    ``len(data)``.
    """

    def __init__(self, data, label):
        """Store the sample and label tensors.

        Args:
            data: torch.Tensor indexed along its first dimension by sample.
            label: torch.Tensor with one entry per sample in ``data``.

        Raises:
            AssertionError: if either argument is not a torch.Tensor or the
                two tensors differ in length.
        """
        assert isinstance(data, torch.Tensor) \
            and isinstance(label, torch.Tensor)
        # A length mismatch would otherwise only show up later as an
        # IndexError raised from inside a DataLoader worker.
        assert len(data) == len(label), \
            f'{len(data)} samples but {len(label)} labels'

        self.data = data
        self.label = label

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

## Banknote Authentication

- Input: 4
- Output: {0, 1}
- Classes: Binary classification. 
- Samples: 1372
- 80:20 split 
- Train: 1098
- Test: 274
- Rescaling to [0, 1]

https://archive.ics.uci.edu/ml/datasets/banknote+authentication  
https://www.kaggle.com/ritesaluja/bank-note-authentication-uci-data#BankNote_Authentication.csv

In [7]:
# Path of the banknote data file inside the dataset folder (`root`).
# Despite the `_dir` suffix this is a file path, not a directory.
banknote_dir = os.path.join(root, 'data_banknote_authentication.txt')
print(banknote_dir)

./dataset_root/data_banknote_authentication.txt


In [8]:
# Load the header-less file; every column (class included) is read as float32.
banknote_frame = pd.read_csv(
    banknote_dir,
    names=['variance', 'skewness', 'curtosis', 'entropy', 'class'],
    header=None,
    dtype=np.float32)
print(banknote_frame.shape)

# n = number of rows, m = number of columns excluding the trailing class column
n, m = banknote_frame.shape
m -= 1
print(f'{n} samples, {m} attributes')

# use small integer for categorical data
banknote_frame['class'] = banknote_frame['class'].astype(np.int8)

# Reorder the rows with a random permutation drawn from NumPy's global RNG.
shuffle_idx = np.random.permutation(n)
assert shuffle_idx.shape[0] == len(banknote_frame)
banknote_frame = banknote_frame.iloc[shuffle_idx]
banknote_frame.head()

(1372, 5)
1372 samples, 4 attributes


Unnamed: 0,variance,skewness,curtosis,entropy,class
579,1.1588,8.9331,-2.0807,-1.1272,0
202,-0.78689,9.5663,-3.7867,-7.5034,0
950,-2.0891,-0.48422,1.704,1.7435,1
1196,-2.0149,3.6874,-1.9385,-3.8918,1
740,-2.4473,12.6247,0.73573,-7.6612,0


In [9]:
# Per-attribute extrema over every row of the frame; used as scaling bounds.
# NOTE(review): the bounds include the rows that end up in the test split --
# confirm that this is intended.
x = banknote_frame.values[:, :m]
x_min = x.min(axis=0)
x_max = x.max(axis=0)
print('max', x_max, '\nmin', x_min)

# Split first, then rescale both feature arrays with the same bounds.
(banknote_x_train, banknote_y_train,
 banknote_x_test, banknote_y_test) = split_dataframe(banknote_frame)
banknote_x_train, banknote_x_test = (
    scale_normalize(part, x_max, x_min)
    for part in (banknote_x_train, banknote_x_test))

print('train', banknote_x_train.shape)
print('test', banknote_x_test.shape)

max [ 6.8248 12.9516 17.9274  2.4495] 
min [ -7.0421 -13.7731  -5.2861  -8.5482]
train (1098, 4)
test (274, 4)


In [10]:
# Wrap the NumPy splits as tensors and expose them through PyTorch DataLoaders.
dataset_banknote_train = NumeralDataset(
    data=torch.Tensor(banknote_x_train),
    label=torch.Tensor(banknote_y_train))
dataset_banknote_test = NumeralDataset(
    data=torch.Tensor(banknote_x_test),
    label=torch.Tensor(banknote_y_test))

# NOTE(review): the test loader shuffles as well -- confirm that is intended.
dataloader_banknote_train = DataLoader(
    dataset=dataset_banknote_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

dataloader_banknote_test = DataLoader(
    dataset=dataset_banknote_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

# Sanity check: the first training batch is full-sized, with one row of m
# attributes and one label per sample.
sample, label = next(iter(dataloader_banknote_train))
assert sample.size() == (batch_size, m) and label.size() == (batch_size,)

## Iris Data Set

- Input: 4
- Output: 3
- Samples: 150
- Classes: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
- Split: 80:20
- Train: 120
- Test: 30
- Rescaling to [0, 1]
- Data URL: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data

https://archive.ics.uci.edu/ml/datasets/Iris  
https://www.kaggle.com/uciml/iris

In [12]:
# Load the header-less Iris file: four float32 measurements and a string class.
iris_dir = os.path.join(root, 'iris.data')
iris_frame = pd.read_csv(
    iris_dir,
    header=None,
    names=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class'],
    dtype={'SepalLength': np.float32,
           'SepalWidth': np.float32,
           'PetalLength': np.float32,
           'PetalWidth': np.float32,
           # builtin `str`: the `np.str` alias was deprecated in NumPy 1.20
           # and removed in 1.24
           'Class': str})
print(iris_frame.shape)
n = len(iris_frame.index)
m = iris_frame.shape[1] - 1
print(f'{n} samples, {m} attributes')

# reset seed
np.random.seed(seed)
shuffle_idx = np.random.permutation(n)
assert len(shuffle_idx) == iris_frame.shape[0]
iris_frame = iris_frame.iloc[shuffle_idx]

# convert categorical data to integer codes
iris_frame['Class'] = iris_frame['Class'].astype('category')
iris_classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# Code i must mean iris_classes[i]; fail loudly if the categories found in the
# file do not line up with the hard-coded class list.
assert list(iris_frame['Class'].cat.categories) == iris_classes, \
    f'{list(iris_frame["Class"].cat.categories)}'
iris_frame['Class'] = iris_frame['Class'].cat.codes
iris_frame.head()

(150, 5)
150 samples, 4 attributes


Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Class
65,6.7,3.1,4.4,1.4,1
80,5.5,2.4,3.8,1.1,1
12,4.8,3.0,1.4,0.1,0
131,7.9,3.8,6.4,2.0,2
6,4.6,3.4,1.4,0.3,0


In [None]:
# 80:20 split
n_train = int(np.round(n * .8))
n_test = n - n_train
assert n_train + n_test == n
print(f'Train: {n_train}, Test: {n_test}')

# The leading n_train rows form the training set and the remaining rows the
# test set. The last column holds the class code, all others are attributes.
# TODO: the attributes are not rescaled in this cell.
iris_data_train = iris_frame.iloc[:n_train, :-1].values
iris_data_test = iris_frame.iloc[n_train:, :-1].values
iris_label_train = iris_frame.iloc[:n_train, -1].values
iris_label_test = iris_frame.iloc[n_train:, -1].values

# checking shapes
assert iris_data_train.shape[0] == iris_label_train.shape[0] == n_train
assert iris_data_test.shape[0] == iris_label_test.shape[0] == n_test

In [None]:
# Placeholders: the Iris datasets and loaders are not built in this cell yet.
dataset_iris_train = dataset_iris_test = None
dataloader_iris_train = dataloader_iris_test = None

## Breast Cancer Wisconsin (Diagnostic) Data Set

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)  
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
# Placeholders: the breast-cancer frame and its splits are not loaded yet.
bc_frame = None
bc_data_train = bc_data_test = None
bc_label_train = bc_label_test = None

In [None]:
# Placeholders: the breast-cancer datasets and loaders are not built yet.
dataset_bc_train = dataset_bc_test = None
dataloader_bc_train = dataloader_bc_test = None

## Seeds of Wheat Data Set

https://archive.ics.uci.edu/ml/datasets/seeds  
https://www.kaggle.com/dongeorge/seed-from-uci

In [None]:
# Placeholders: the wheat-seeds frame and its splits are not loaded yet.
seed_frame = None
seed_data_train = seed_data_test = None
seed_label_train = seed_label_test = None

In [None]:
# Placeholders: the wheat-seeds datasets and loaders are not built yet.
dataset_seed_train = dataset_seed_test = None
dataloader_seed_train = dataloader_seed_test = None

## HTRU2 Data Set

- PREDICTING A PULSAR STAR

https://archive.ics.uci.edu/ml/datasets/HTRU2  
https://www.kaggle.com/pavanraj159/predicting-a-pulsar-star

In [None]:
# Placeholders: the HTRU2 frame and its splits are not loaded yet.
htru2_frame = None
htru2_data_train = htru2_data_test = None
htru2_label_train = htru2_label_test = None

In [None]:
# Placeholders: the HTRU2 datasets and loaders are not built yet.
dataset_htru2_train = dataset_htru2_test = None
dataloader_htru2_train = dataloader_htru2_test = None