# Code for loading UCI datasets
This notebook uses the preprocessed data downloaded from: https://github.com/bioinf-jku/SNNs

In [7]:
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
# Root folder holding the datasets, and the single dataset inspected first.
path, dataset_name = './', 'breast-cancer'
# The dataset's files sit in a sub-folder named after the dataset itself.
dataset_path = f"{path}{dataset_name}/"

In [3]:
# Load the feature matrix and the label vector. The path strings are handed
# straight to np.loadtxt so that NumPy opens *and closes* each file itself;
# wrapping them in a bare open(...) would leave the handles unclosed.
x = np.loadtxt(dataset_path + dataset_name + '_py.dat', delimiter=",")
y = np.loadtxt(dataset_path + 'labels_py.dat', delimiter=",")

print(x.shape)
print(y.shape)

# The fold and validation-fold assignments shipped with the data are not
# useful to us; they are still loaded so the variables remain available.
folds = np.loadtxt(dataset_path + 'folds_py.dat', delimiter=",")
validation_folds = np.loadtxt(dataset_path + 'validation_folds_py.dat', delimiter=",")

(286, 9)
(286,)


In [4]:
# Class count = largest label + 1. This relies on the labels being the
# consecutive values 0, 1, ..., classes - 1, which seems to be the case here.
classes = 1 + np.max(y)

# Show the largest label, the smallest label and the class count, one per line.
print(np.max(y), np.min(y), classes, sep='\n')

1.0
0.0
2.0


In [5]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Display the training labels returned by train_test_split.
y_train

array([0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0.,
       0., 0., 1., 1., 0.

# Work out how to load all datasets

In [26]:
# Collect the candidate dataset folders found directly under `directory`.
# An entry is kept only if its name contains no '.' AND it really is a
# directory: the dot test alone would also accept extension-less files
# (e.g. a LICENSE file), which would later fail to load as a dataset.
directory = '.'
sub_directories = sorted(
    d for d in os.listdir(directory)
    if '.' not in d and os.path.isdir(os.path.join(directory, d))
)

In [24]:
def get_num_datapoints_classes(dataset_name):
    """Return the number of datapoints and the number of classes of a dataset.

    Only the label file ``./<dataset_name>/labels_py.dat`` is read; the feature
    matrix is not needed for either count, so it is not loaded.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset, which is also the name of its folder under ``./``.

    Returns
    -------
    num_datapoints : int
        Number of entries in the label file.
    num_classes : int
        Largest label plus one. This assumes the labels are the consecutive
        values 0, 1, ..., num_classes - 1.
    """
    labels_path = './' + dataset_name + '/labels_py.dat'

    # Pass the path (not an open file object) so NumPy closes the file itself.
    # ndmin=1 keeps a one-row label file from collapsing into a 0-d array,
    # for which y.shape[0] would raise an IndexError.
    y = np.loadtxt(labels_path, delimiter=",", ndmin=1)

    num_datapoints = y.shape[0]
    # Labels are parsed as floats; a class count is an integer.
    num_classes = int(np.max(y)) + 1

    return num_datapoints, num_classes

    

In [28]:
# Datasets with at least this many datapoints are treated as "large".
LARGE_DATASET_MIN_DATAPOINTS = 1000

small_datasets = []
large_datasets = []

# The loop variable is deliberately not called `dataset_name`: a for-loop
# target stays bound after the loop ends, so reusing that name would overwrite
# the `dataset_name` chosen in the configuration cell at the top of the notebook.
for name in sub_directories:
    num_datapoints, num_classes = get_num_datapoints_classes(name)
    if num_datapoints >= LARGE_DATASET_MIN_DATAPOINTS:
        large_datasets.append(name)
    else:
        small_datasets.append(name)

abalone
acute-inflammation
acute-nephritis
adult
annealing
arrhythmia
audiology-std
balance-scale
balloons
bank
blood
breast-cancer
breast-cancer-wisc
breast-cancer-wisc-diag
breast-cancer-wisc-prog
breast-tissue
car
cardiotocography-10clases
cardiotocography-3clases
chess-krvk
chess-krvkp
congressional-voting
conn-bench-sonar-mines-rocks
conn-bench-vowel-deterding
connect-4
contrac
credit-approval
cylinder-bands
dermatology
echocardiogram
ecoli
energy-y1
energy-y2
fertility
flags
glass
haberman-survival
hayes-roth
heart-cleveland
heart-hungarian
heart-switzerland
heart-va
hepatitis
hill-valley
horse-colic
ilpd-indian-liver
image-segmentation
ionosphere
iris
led-display
lenses
letter
libras
low-res-spect
lung-cancer
lymphography
magic
mammographic
miniboone
molec-biol-promoter
molec-biol-splice
monks-1
monks-2
monks-3
mushroom
musk-1
musk-2
nursery
oocytes-merluccius-nucleus-4d
oocytes-merluccius-states-2f
oocytes-trisopterus-nucleus-2f
oocytes-trisopterus-states-5b
optical
ozone
page-

In [30]:
# Display the names of the datasets with fewer than 1000 datapoints.
small_datasets

['acute-inflammation',
 'acute-nephritis',
 'annealing',
 'arrhythmia',
 'audiology-std',
 'balance-scale',
 'balloons',
 'blood',
 'breast-cancer',
 'breast-cancer-wisc',
 'breast-cancer-wisc-diag',
 'breast-cancer-wisc-prog',
 'breast-tissue',
 'congressional-voting',
 'conn-bench-sonar-mines-rocks',
 'conn-bench-vowel-deterding',
 'credit-approval',
 'cylinder-bands',
 'dermatology',
 'echocardiogram',
 'ecoli',
 'energy-y1',
 'energy-y2',
 'fertility',
 'flags',
 'glass',
 'haberman-survival',
 'hayes-roth',
 'heart-cleveland',
 'heart-hungarian',
 'heart-switzerland',
 'heart-va',
 'hepatitis',
 'horse-colic',
 'ilpd-indian-liver',
 'ionosphere',
 'iris',
 'lenses',
 'libras',
 'low-res-spect',
 'lung-cancer',
 'lymphography',
 'mammographic',
 'molec-biol-promoter',
 'monks-1',
 'monks-2',
 'monks-3',
 'musk-1',
 'oocytes-trisopterus-nucleus-2f',
 'oocytes-trisopterus-states-5b',
 'parkinsons',
 'pima',
 'pittsburg-bridges-MATERIAL',
 'pittsburg-bridges-REL-L',
 'pittsburg-bridge