In [1]:
import os
import numpy as np
import pandas as pd
from scipy.io import arff
import torch
from torch.utils.data import DataLoader

# Use the first CUDA device when CUDA is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Number of DataLoader worker processes: one per CPU reported by the OS.
# os.cpu_count() may return None when the count cannot be determined; fall back
# to 0 (load in the main process) because DataLoader needs an integer.
num_workers = os.cpu_count() or 0
print(num_workers)

cuda:0
6


In [2]:
# Fixed random seed (2 to the power of 12) so that runs are repeatable
seed = 1 << 12

In [3]:
# Mini-batch size used when building the DataLoaders
batch_size = 128

# Dataset directory, expressed relative to the current working directory
root = os.path.join(os.curdir, 'dataset_root')
print(f'Path directory: {root}')

Path directory: ./dataset_root


In [4]:
def split_dataframe(frame, train_ratio=.8):
    """Split a frame into leading-train / trailing-test NumPy arrays.

    The last column is treated as the integer output ``y``; all preceding
    columns are the inputs ``x``. Rows are taken in their current order (no
    shuffling happens here): the first ``round(n * train_ratio)`` rows form
    the training split and the remaining rows form the test split.

    Parameters
    ----------
    frame : pd.DataFrame
        Table whose last column holds integer labels.
    train_ratio : float
        Fraction of rows assigned to the training split; must lie strictly
        between 0 and 1.

    Returns
    -------
    tuple of np.ndarray
        ``(x_train, y_train, x_test, y_test)`` with shapes
        ``(n_train, m)``, ``(n_train,)``, ``(n_test, m)``, ``(n_test,)``.

    Raises
    ------
    AssertionError
        If the argument types/ranges are wrong, if the last column is not one
        of int8/int16/int32/int64, or if a resulting shape is unexpected.
    """
    assert isinstance(frame, pd.DataFrame)
    assert isinstance(train_ratio, float) and 0. < train_ratio < 1.

    n, m = frame.shape
    m = m - 1  # not include output column
    # expecting last column (y) has integer values
    y_dtype = frame.iloc[:, m].dtype
    assert y_dtype in (np.int8, np.int16, np.int32, np.int64), f'{y_dtype}'

    n_train = int(np.round(n * train_ratio))
    n_test = n - n_train

    # Slice the test rows from n_train onwards rather than with `-n_test:`;
    # when rounding leaves n_test == 0, `-0:` would select *every* row.
    x_train = frame.iloc[:n_train, :m].values
    y_train = frame.iloc[:n_train, m].values
    x_test = frame.iloc[n_train:, :m].values
    y_test = frame.iloc[n_train:, m].values

    # checking shapes
    assert x_train.shape == (n_train, m)
    assert y_train.shape == (n_train,)
    assert x_test.shape == (n_test, m)
    assert y_test.shape == (n_test,)

    return x_train, y_train, x_test, y_test

In [5]:
def get_max_min(data):
    """Return the maximum and minimum of ``data`` taken along axis 0.

    ``data`` must be exactly an ``np.ndarray`` (subclasses are rejected by the
    type check). The result is the pair ``(maxima, minima)``, in that order.
    """
    assert type(data) is np.ndarray

    return data.max(axis=0), data.min(axis=0)

In [6]:
def scale_normalize(data, xmax, xmin):
    """Min-max rescale every column of ``data`` with the given extrema.

    Computes ``(data - xmin) / (xmax - xmin)`` column by column. A column whose
    ``xmax`` equals ``xmin`` has zero range; instead of dividing by zero (which
    yields NaN/inf) its range is treated as 1, so such a column maps to
    ``data - xmin``.

    Parameters
    ----------
    data : np.ndarray
        2-D array; columns are rescaled independently.
    xmax, xmin : np.ndarray
        Per-column maximum and minimum, one entry per column of ``data``.

    Returns
    -------
    np.ndarray
        The rescaled array, same shape as ``data``.

    Raises
    ------
    AssertionError
        If an argument is not exactly an ``np.ndarray`` or the lengths of
        ``xmax`` / ``xmin`` do not match ``data.shape[1]``.
    """
    assert (type(data) == np.ndarray and
        type(xmax) == np.ndarray and
        type(xmin) == np.ndarray)
    assert data.shape[1] == len(xmax) and data.shape[1] == len(xmin)

    # `xmax - xmin` is a fresh array, so patching it does not touch the inputs
    span = xmax - xmin
    span[span == 0] = 1  # guard constant columns against division by zero
    return (data - xmin) / span

In [7]:
class NumeralDataset(torch.utils.data.Dataset):
    """Dataset pairing a tensor of samples with a tensor of labels.

    Indexing returns ``(data[index], label[index])``. The reported length is
    the length of ``data``; ``label`` is not length-checked against it.
    """

    def __init__(self, data, label):
        # both inputs must already be tensors; nothing is converted here
        assert isinstance(data, torch.Tensor)
        assert isinstance(label, torch.Tensor)

        self.data, self.label = data, label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        target = self.label[index]
        return sample, target

## Banknote Authentication

- Input: 4
- Output: {0, 1}
- Classes: Binary classification. 
- Samples: 1372
- 80:20 split 
- Train: 1098
- Test: 274
- Rescaling to [0, 1]
- Data URL: https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt

https://archive.ics.uci.edu/ml/datasets/banknote+authentication  
https://www.kaggle.com/ritesaluja/bank-note-authentication-uci-data#BankNote_Authentication.csv

In [8]:
# load data
banknote_dir = os.path.join(root, 'data_banknote_authentication.txt')
print(banknote_dir)

# The file has no header row; every column (label included) is parsed as float32
banknote_frame = pd.read_csv(
    banknote_dir,
    names=['variance', 'skewness', 'curtosis', 'entropy', 'class'],
    header=None,
    dtype=np.float32)
print(banknote_frame.shape)

n, m = banknote_frame.shape
m -= 1  # the last column is the output, not an attribute
print(f'{n} samples, {m} attributes')

# use small integer for categorical data
banknote_frame = banknote_frame.astype({'class': 'int8'})

# reset seed so the row permutation below is repeatable
np.random.seed(seed)
shuffle_idx = np.random.permutation(n)
assert shuffle_idx.shape[0] == len(banknote_frame)
banknote_frame = banknote_frame.iloc[shuffle_idx]
banknote_frame.head()

./dataset_root/data_banknote_authentication.txt
(1372, 5)
1372 samples, 4 attributes


Unnamed: 0,variance,skewness,curtosis,entropy,class
579,1.1588,8.9331,-2.0807,-1.1272,0
202,-0.78689,9.5663,-3.7867,-7.5034,0
950,-2.0891,-0.48422,1.704,1.7435,1
1196,-2.0149,3.6874,-1.9385,-3.8918,1
740,-2.4473,12.6247,0.73573,-7.6612,0


In [9]:
# rescale, train-test split
# column-wise extrema of the attribute columns, taken over the whole frame
x_max, x_min = get_max_min(banknote_frame.values[:, :m])
print('max', x_max, '\nmin', x_min)

(banknote_x_train, banknote_y_train,
 banknote_x_test, banknote_y_test) = split_dataframe(banknote_frame)

# both splits are rescaled with the same extrema
banknote_x_train, banknote_x_test = (
    scale_normalize(part, x_max, x_min)
    for part in (banknote_x_train, banknote_x_test))

print('train', banknote_x_train.shape)
print('test', banknote_x_test.shape)

max [ 6.8248 12.9516 17.9274  2.4495] 
min [ -7.0421 -13.7731  -5.2861  -8.5482]
train (1098, 4)
test (274, 4)


In [10]:
# create DataLoader for pytorch
# wrap the NumPy splits as tensors inside NumeralDataset objects
dataset_banknote_train = NumeralDataset(
    data=torch.Tensor(banknote_x_train),
    label=torch.Tensor(banknote_y_train))
dataset_banknote_test = NumeralDataset(
    data=torch.Tensor(banknote_x_test),
    label=torch.Tensor(banknote_y_test))

# both loaders shuffle and share batch_size / num_workers
dataloader_banknote_train = DataLoader(
    dataset=dataset_banknote_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)
dataloader_banknote_test = DataLoader(
    dataset=dataset_banknote_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

# check batch size: draw one training batch and verify both shapes
sample, label = next(iter(dataloader_banknote_train))
assert sample.size() == (batch_size, m)
assert label.size() == (batch_size,)

## Iris Data Set

- Input: 4
- Output: 3
- Sample: 150
- Classes: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
- Split: 80:20
- Train: 120
- Test: 30
- Rescaling to [0, 1]
- Data URL: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data

https://archive.ics.uci.edu/ml/datasets/Iris  
https://www.kaggle.com/uciml/iris

In [11]:
# load data
iris_dir = os.path.join(root, 'iris.data')
print(iris_dir)

# The file has no header row. The four measurements are read as float32 and
# the class name as a string. The builtin `str` is used because the `np.str`
# alias was deprecated in NumPy 1.20 and removed in 1.24 (it was the same type).
iris_frame = pd.read_csv(
    iris_dir,
    header=None,
    names=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class'],
    dtype={'SepalLength': np.float32,
           'SepalWidth': np.float32,
           'PetalLength': np.float32,
           'PetalWidth': np.float32,
           'Class': str})
print(iris_frame.shape)
n = len(iris_frame.index)
m = iris_frame.shape[1] - 1  # the last column is the output, not an attribute
print(f'{n} samples, {m} attributes')

# reset seed so the row permutation below is repeatable
np.random.seed(seed)
shuffle_idx = np.random.permutation(n)
assert len(shuffle_idx) == iris_frame.shape[0]
iris_frame = iris_frame.iloc[shuffle_idx]

# convert categorical data to integer codes
iris_frame['Class'] = iris_frame['Class'].astype('category')
# class names listed for reference; the codes come from the category dtype
iris_classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
iris_frame['Class'] = iris_frame['Class'].cat.codes
iris_frame.head()

./dataset_root/iris.data
(150, 5)
150 samples, 4 attributes


Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Class
65,6.7,3.1,4.4,1.4,1
80,5.5,2.4,3.8,1.1,1
12,4.8,3.0,1.4,0.1,0
131,7.9,3.8,6.4,2.0,2
6,4.6,3.4,1.4,0.3,0


In [12]:
# rescale, train-test split
# column-wise extrema of the attribute columns, taken over the whole frame
x_max, x_min = get_max_min(iris_frame.values[:, :m])
print('max', x_max, '\nmin', x_min)

# 80:20 split (split_dataframe's default ratio)
(iris_x_train, iris_y_train,
 iris_x_test, iris_y_test) = split_dataframe(iris_frame)

# both splits are rescaled with the same extrema
iris_x_train, iris_x_test = (
    scale_normalize(part, x_max, x_min)
    for part in (iris_x_train, iris_x_test))

print('train', iris_x_train.shape)
print('test', iris_x_test.shape)

max [7.9 4.4 6.9 2.5] 
min [4.3 2.  1.  0.1]
train (120, 4)
test (30, 4)


In [13]:
# create DataLoader for pytorch
# Build the datasets from the *iris* splits. (They were previously built from
# the banknote arrays by mistake; that went unnoticed because both datasets
# have the same number of attributes.)
dataset_iris_train = NumeralDataset(
    torch.Tensor(iris_x_train),
    torch.Tensor(iris_y_train))
dataset_iris_test = NumeralDataset(
    torch.Tensor(iris_x_test),
    torch.Tensor(iris_y_test))

dataloader_iris_train = DataLoader(
    dataset_iris_train,
    batch_size,
    shuffle=True,
    num_workers=num_workers)
dataloader_iris_test = DataLoader(
    dataset_iris_test,
    batch_size,
    shuffle=True,
    num_workers=num_workers)

# check batch size
# The training split may hold fewer samples than batch_size, in which case the
# first (and only) batch contains the whole split.
sample, label = next(iter(dataloader_iris_train))
expected_batch = min(batch_size, len(dataset_iris_train))
assert sample.size() == (expected_batch, m)
assert label.size() == (expected_batch,)

## Breast Cancer Wisconsin (Diagnostic) Data Set

- Input: 30 (31 - 1); the `id` column is used for indexing
- Output: {0, 1}
- Classes: Binary classification (M = malignant = 1, B = benign = 0)
- Samples: 569
- 80:20 split 
- Train: 455
- Test: 114
- Rescaling to [0, 1]
- Data URL: The CSV file was downloaded from kaggle


https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)  
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [14]:
# load data
bc_dir = os.path.join(root, 'BreastCancerWisconsin.csv')
print(bc_dir)

# the CSV already has a header row; its first column (id) becomes the index
bc_frame = pd.read_csv(bc_dir, index_col=0)

# map categorical outputs to integer codes
classes = ['B', 'M']
bc_frame['diagnosis'] = bc_frame['diagnosis'].astype('category').cat.codes

# remove empty column: drop every column whose name starts with 'Unnamed'
bc_frame = bc_frame.drop(
    columns=bc_frame.columns[bc_frame.columns.str.contains('^Unnamed')])

# move output column to the end of table
col_names = [name for name in bc_frame.columns if name != 'diagnosis']
col_names.append('diagnosis')
bc_frame = bc_frame[col_names]
print(col_names)

n, m = bc_frame.shape
m -= 1  # the trailing 'diagnosis' column is the output, not an attribute
print(f'\n{n} samples, {m} attributes')

bc_frame.head()

./dataset_root/BreastCancerWisconsin.csv
['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'diagnosis']

569 samples, 30 attributes


Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [15]:
# shuffle indices
indices = bc_frame.index.values

# reset seed so the row permutation below is repeatable
np.random.seed(seed)
shuffle_pos = np.random.permutation(len(indices))
# ids in their shuffled order (kept for reference)
shuffle_idx = indices[shuffle_pos]
assert len(shuffle_idx) == bc_frame.shape[0], f'{len(shuffle_idx)}'
# Reorder by position rather than by id label: the row order is the same when
# ids are unique, and rows are not duplicated if an id ever repeats. This also
# matches how the other datasets in this notebook are shuffled.
bc_frame = bc_frame.iloc[shuffle_pos]
bc_frame.head()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91805,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,0.1678,0.07126,...,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049,0
89344,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,...,20.45,92.0,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385,0
90291,14.6,23.29,93.97,664.7,0.08682,0.06636,0.0839,0.05271,0.1627,0.05416,...,31.71,102.2,758.2,0.1312,0.1581,0.2675,0.1359,0.2477,0.06836,1
922296,13.21,28.06,84.88,538.4,0.08671,0.06877,0.02987,0.03275,0.1628,0.05781,...,37.17,92.48,629.6,0.1072,0.1381,0.1062,0.07958,0.2473,0.06443,0
894335,12.43,17.0,78.6,477.3,0.07557,0.03454,0.01342,0.01699,0.1472,0.05561,...,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932,0


In [16]:
# rescale, train-test split
# column-wise extrema of the attribute columns, taken over the whole frame
x_max, x_min = get_max_min(bc_frame.values[:, :m])
print('max', x_max, '\nmin', x_min)

# 80:20 split (split_dataframe's default ratio)
(bc_x_train, bc_y_train,
 bc_x_test, bc_y_test) = split_dataframe(bc_frame)

# both splits are rescaled with the same extrema
bc_x_train, bc_x_test = (
    scale_normalize(part, x_max, x_min)
    for part in (bc_x_train, bc_x_test))

print('\ntrain', bc_x_train.shape)
print('test', bc_x_test.shape)

max [2.811e+01 3.928e+01 1.885e+02 2.501e+03 1.634e-01 3.454e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 3.113e-02 1.354e-01 3.960e-01 5.279e-02 7.895e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.910e-01
 6.638e-01 2.075e-01] 
min [6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.602e-01 7.570e-01 6.802e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]

train (455, 30)
test (114, 30)


In [17]:
# create DataLoader for pytorch
# wrap the NumPy splits as tensors inside NumeralDataset objects
dataset_bc_train = NumeralDataset(
    data=torch.Tensor(bc_x_train),
    label=torch.Tensor(bc_y_train))
dataset_bc_test = NumeralDataset(
    data=torch.Tensor(bc_x_test),
    label=torch.Tensor(bc_y_test))

# both loaders shuffle and share batch_size / num_workers
dataloader_bc_train = DataLoader(
    dataset=dataset_bc_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)
dataloader_bc_test = DataLoader(
    dataset=dataset_bc_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

# check batch size: draw one training batch and verify both shapes
sample, label = next(iter(dataloader_bc_train))
assert sample.size() == (batch_size, m)
assert label.size() == (batch_size,)

## Seeds of Wheat Data Set

- Input: 7
- Output: {0, 1, 2}
- Classes: 3
- Samples: 210
- 80:20 split 
- Train: 168
- Test: 42
- Rescaling to [0, 1]
- Data URL: https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt

https://archive.ics.uci.edu/ml/datasets/seeds  
https://www.kaggle.com/dongeorge/seed-from-uci

In [18]:
# load data
seeds_dir = os.path.join(root, 'seeds_dataset.txt')
print(seeds_dir)

col_names = ['area', 'perimeter', 'compactness', 'kernel length',
    'kernel width', 'asymmetry coefficient', 'kernel groove length', 'class']
# The file has no header row and its fields are separated by runs of
# whitespace. The separator is written as a raw string: '\s' in a normal
# string literal is an invalid escape sequence and triggers a warning.
seeds_frame = pd.read_csv(
    seeds_dir,
    header=None,
    names=col_names,
    sep=r'\s+')

print(seeds_frame.shape)
n, m = seeds_frame.shape
m = m - 1  # not include output
print(f'{n} samples, {m} attributes')

# shuffle indices
np.random.seed(seed)  # reset seed
shuffle_idx = np.random.permutation(n)
assert len(shuffle_idx) == seeds_frame.shape[0]
seeds_frame = seeds_frame.iloc[shuffle_idx]

# convert categorical data to integer codes
seeds_frame['class'] = seeds_frame['class'].astype('category')
# map [1, 2, 3] to [0, 1, 2]
seeds_frame['class'] = seeds_frame['class'].cat.codes

seeds_frame.head()

./dataset_root/seeds_dataset.txt
(210, 8)
210 samples, 7 attributes


Unnamed: 0,area,perimeter,compactness,kernel length,kernel width,asymmetry coefficient,kernel groove length,class
188,11.23,12.82,0.8594,5.089,2.821,7.524,4.957,2
86,18.88,16.26,0.8969,6.084,3.764,1.649,6.109,1
175,10.8,12.57,0.859,4.981,2.821,4.773,5.063,2
69,12.73,13.75,0.8458,5.412,2.882,3.533,5.067,0
79,17.12,15.55,0.8892,5.85,3.566,2.858,5.746,1


In [19]:
# rescale, train-test split
# column-wise extrema of the attribute columns, taken over the whole frame
x_max, x_min = get_max_min(seeds_frame.values[:, :m])
print('max', x_max, '\nmin', x_min)

(seeds_x_train, seeds_y_train,
 seeds_x_test, seeds_y_test) = split_dataframe(seeds_frame)

# both splits are rescaled with the same extrema
seeds_x_train, seeds_x_test = (
    scale_normalize(part, x_max, x_min)
    for part in (seeds_x_train, seeds_x_test))

print('train', seeds_x_train.shape)
print('test', seeds_x_test.shape)

max [21.18   17.25    0.9183  6.675   4.033   8.456   6.55  ] 
min [10.59   12.41    0.8081  4.899   2.63    0.7651  4.519 ]
train (168, 7)
test (42, 7)


In [20]:
# create DataLoader for pytorch
# wrap the NumPy splits as tensors inside NumeralDataset objects
dataset_seeds_train = NumeralDataset(
    data=torch.Tensor(seeds_x_train),
    label=torch.Tensor(seeds_y_train))
dataset_seeds_test = NumeralDataset(
    data=torch.Tensor(seeds_x_test),
    label=torch.Tensor(seeds_y_test))

# both loaders shuffle and share batch_size / num_workers
dataloader_seeds_train = DataLoader(
    dataset=dataset_seeds_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)
dataloader_seeds_test = DataLoader(
    dataset=dataset_seeds_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

# check batch size: draw one training batch and verify both shapes
sample, label = next(iter(dataloader_seeds_train))
assert sample.size() == (batch_size, m)
assert label.size() == (batch_size,)

## HTRU2 Data Set

- Predicting a pulsar star
- Input: 8
- Output: {0, 1}
- Classes: Binary classification
- Samples: 17898
- 80:20 split 
- Train: 14318
- Test: 3580
- Rescaling to [0, 1]
- Data URL: https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip


https://archive.ics.uci.edu/ml/datasets/HTRU2  
https://www.kaggle.com/pavanraj159/predicting-a-pulsar-star

In [21]:
# load data
htru2_dir = os.path.join(root, 'HTRU2', 'HTRU_2.arff')
print(htru2_dir)

# only the first item of loadarff's result is used to build the frame
data = arff.loadarff(htru2_dir)
htru2_frame = pd.DataFrame(data[0])
print(htru2_frame.shape)

n, m = htru2_frame.shape
m -= 1  # not include output
print(f'{n} samples, {m} attributes')

# shuffle indices
np.random.seed(seed)  # reset seed
shuffle_idx = np.random.permutation(n)
assert shuffle_idx.shape[0] == len(htru2_frame)
htru2_frame = htru2_frame.iloc[shuffle_idx]

# convert categorical data to integer codes
htru2_frame['class'] = htru2_frame['class'].astype('category').cat.codes

htru2_frame.head()

./dataset_root/HTRU2/HTRU_2.arff
(17898, 9)
17898 samples, 8 attributes


Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis,class
10745,136.265625,42.35794,-0.119417,0.448959,3.948161,23.408878,6.699311,48.727637,0
1331,19.414062,46.191099,3.705014,13.046057,96.028428,61.410014,0.610825,-0.194555,1
4191,96.890625,61.796318,0.975022,0.178443,32.746656,71.679577,2.010513,2.512631,1
6100,97.703125,41.930813,0.642869,1.391719,2.168896,15.313117,10.311409,126.942323,0
1505,116.960938,49.397685,0.200477,-0.034041,2.434783,22.13258,9.67982,95.19062,0


In [22]:
# rescale, train-test split
# column-wise extrema of the attribute columns, taken over the whole frame
x_max, x_min = get_max_min(htru2_frame.values[:, :m])
print('max', x_max, '\nmin', x_min)

(htru2_x_train, htru2_y_train,
 htru2_x_test, htru2_y_test) = split_dataframe(htru2_frame)

# both splits are rescaled with the same extrema
htru2_x_train, htru2_x_test = (
    scale_normalize(part, x_max, x_min)
    for part in (htru2_x_train, htru2_x_test))

print('train', htru2_x_train.shape)
print('test', htru2_x_test.shape)

max [ 192.6171875    98.77891067    8.06952205   68.10162173  223.3921405
  110.6422106    34.53984419 1191.000837  ] 
min [ 5.8125     24.77204176 -1.87601118 -1.79188598  0.2132107   7.37043217
 -3.13926961 -1.9769756 ]
train (14318, 8)
test (3580, 8)


In [23]:
# create DataLoader for pytorch
# wrap the NumPy splits as tensors inside NumeralDataset objects
dataset_htru2_train = NumeralDataset(
    data=torch.Tensor(htru2_x_train),
    label=torch.Tensor(htru2_y_train))
dataset_htru2_test = NumeralDataset(
    data=torch.Tensor(htru2_x_test),
    label=torch.Tensor(htru2_y_test))

# both loaders shuffle and share batch_size / num_workers
dataloader_htru2_train = DataLoader(
    dataset=dataset_htru2_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)
dataloader_htru2_test = DataLoader(
    dataset=dataset_htru2_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers)

# check batch size: draw one training batch and verify both shapes
sample, label = next(iter(dataloader_htru2_train))
assert sample.size() == (batch_size, m)
assert label.size() == (batch_size,)