# 種々のベンチマークデータ

個別のノートブックを用意していないデータセットを整える段取りを記録している。

__目次__

- <a href="#iris">Irisデータ</a>

___

<a id="iris"></a>
## Irisデータ

20世紀前半からある古くて小さなデータセットである。線形モデルでは完全には識別できないことが知られているため、多種の学習機のプロトタイプを手軽に試すデータセットとして重宝されてきた。学習課題は、花びらの長さなどの指標からアヤメの花の品種を識別することである。

In [4]:
import os
import csv
import numpy as np

In [5]:
! cat data/iris/iris_training.csv | head -n 5
! cat data/iris/iris_test.csv | head -n 5

120,4,setosa,versicolor,virginica
6.4,2.8,5.6,2.2,2
5.0,2.3,3.3,1.0,1
4.9,2.5,4.5,1.7,2
4.9,3.1,1.5,0.1,0
30,4,setosa,versicolor,virginica
5.9,3.0,4.2,1.5,1
6.9,3.1,5.4,2.1,2
5.1,3.3,1.7,0.5,0
6.0,3.4,4.5,1.6,1


In [6]:
# Every sample carries exactly one label column.
NUM_LABELS = 1
# The CSV header names three classes: setosa, versicolor and virginica.
NUM_CLASSES = 3

上の出力の先頭行から、サンプル数、特徴量の数、各ラベルの名称がわかった。次に、PythonでCSVファイルを開いて読み込んでみよう。

In [7]:
# Load the training split from CSV into NumPy arrays.
#
# Produces the notebook-level names NUM_TRAIN, d, LABEL_DICT, data_X and
# data_y. Raises ValueError when the file is empty or when the number of data
# rows disagrees with the count declared in the header row.
toread = os.path.join("data", "iris", "iris_training.csv")

with open(toread, newline="") as f_table:

    f_reader = csv.reader(f_table, delimiter=",")

    # The first row is a header: sample count, feature count, then one name
    # per class.
    header = next(f_reader, None)
    if header is None:
        raise ValueError("{}: file is empty".format(toread))

    NUM_TRAIN = int(header[0])
    d = int(header[1])
    LABEL_DICT = {k: str(header[2 + k]) for k in range(NUM_CLASSES)}
    data_X = np.zeros((NUM_TRAIN, d), dtype=np.float32)
    data_y = np.zeros((NUM_TRAIN, 1), dtype=np.uint8)

    # Every remaining row is one sample: the feature values, then the label.
    num_read = 0
    for row in f_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it.
            continue
        if num_read >= NUM_TRAIN:
            raise ValueError(
                "{}: more data rows than the {} declared in the header".format(
                    toread, NUM_TRAIN))
        data_X[num_read, :] = np.array(row[:-1], dtype=data_X.dtype)
        data_y[num_read, :] = np.array(row[-1], dtype=data_y.dtype)
        num_read += 1

# A truncated file would otherwise leave the trailing samples silently
# zero-filled, because the arrays were pre-allocated with np.zeros.
if num_read != NUM_TRAIN:
    raise ValueError(
        "{}: header declares {} rows but {} were read".format(
            toread, NUM_TRAIN, num_read))

訓練データを読み込んだので、次はこれを検証データとともに一つの階層型ファイルにまとめる。そのために、__PyTables__というパッケージを利用する。

In [8]:
import tables

In [9]:
# Open a connection to a new file on disk (mode "w") that will hold both
# the training and the testing data.
myh5 = tables.open_file("data/iris/data.h5", mode="w", title="Iris data")
print(myh5)  # Nothing but the root group exists at this point.

data/iris/data.h5 (File) 'Iris data'
Last modif.: 'Fri Jul 27 11:06:54 2018'
Object Tree: 
/ (RootGroup) 'Iris data'



In [10]:
# Add one group under the root for each data split, training first.
for group_name, group_title in (("train", "Training data"),
                                ("test", "Testing data")):
    myh5.create_group(myh5.root, group_name, group_title)
print(myh5)

data/iris/data.h5 (File) 'Iris data'
Last modif.: 'Fri Jul 27 11:06:54 2018'
Object Tree: 
/ (RootGroup) 'Iris data'
/test (Group) 'Testing data'
/train (Group) 'Training data'



In [11]:
# Both splits get the same pair of arrays, created in the same order:
# "labels" (uint8, NUM_LABELS columns) and then "inputs" (float32, d columns).
# The leading 0 in each shape means the arrays start with no rows.
for split_group in (myh5.root.train, myh5.root.test):
    myh5.create_earray(split_group,
                       name="labels",
                       atom=tables.UInt8Atom(),
                       shape=(0, NUM_LABELS),
                       title="Label values")
    myh5.create_earray(split_group,
                       name="inputs",
                       atom=tables.Float32Atom(),
                       shape=(0, d),
                       title="Input images")

print(myh5)

data/iris/data.h5 (File) 'Iris data'
Last modif.: 'Fri Jul 27 11:06:54 2018'
Object Tree: 
/ (RootGroup) 'Iris data'
/test (Group) 'Testing data'
/test/inputs (EArray(0, 4)) 'Input images'
/test/labels (EArray(0, 1)) 'Label values'
/train (Group) 'Training data'
/train/inputs (EArray(0, 4)) 'Input images'
/train/labels (EArray(0, 1)) 'Label values'



In [12]:
# Write the training split into the file. EArray.append takes a block with
# any number of rows along the enlargeable first axis, so each array is
# stored with a single call instead of one append per sample.
myh5.root.train.inputs.append(data_X)
myh5.root.train.labels.append(data_y)

print(myh5)

data/iris/data.h5 (File) 'Iris data'
Last modif.: 'Fri Jul 27 11:06:54 2018'
Object Tree: 
/ (RootGroup) 'Iris data'
/test (Group) 'Testing data'
/test/inputs (EArray(0, 4)) 'Input images'
/test/labels (EArray(0, 1)) 'Label values'
/train (Group) 'Training data'
/train/inputs (EArray(120, 4)) 'Input images'
/train/labels (EArray(120, 1)) 'Label values'



検証データに対して、同じ一連の操作を行う。

In [13]:
# Load the testing split from CSV into NumPy arrays.
#
# Produces the notebook-level names NUM_TEST, d, LABEL_DICT, data_X and
# data_y (the last four overwrite the values left by the training split).
# Raises ValueError when the file is empty or when the number of data rows
# disagrees with the count declared in the header row.
toread = os.path.join("data", "iris", "iris_test.csv")

with open(toread, newline="") as f_table:

    f_reader = csv.reader(f_table, delimiter=",")

    # The first row is a header: sample count, feature count, then one name
    # per class.
    header = next(f_reader, None)
    if header is None:
        raise ValueError("{}: file is empty".format(toread))

    NUM_TEST = int(header[0])
    d = int(header[1])
    LABEL_DICT = {k: str(header[2 + k]) for k in range(NUM_CLASSES)}
    data_X = np.zeros((NUM_TEST, d), dtype=np.float32)
    data_y = np.zeros((NUM_TEST, 1), dtype=np.uint8)

    # Every remaining row is one sample: the feature values, then the label.
    num_read = 0
    for row in f_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it.
            continue
        if num_read >= NUM_TEST:
            raise ValueError(
                "{}: more data rows than the {} declared in the header".format(
                    toread, NUM_TEST))
        data_X[num_read, :] = np.array(row[:-1], dtype=data_X.dtype)
        data_y[num_read, :] = np.array(row[-1], dtype=data_y.dtype)
        num_read += 1

# A truncated file would otherwise leave the trailing samples silently
# zero-filled, because the arrays were pre-allocated with np.zeros.
if num_read != NUM_TEST:
    raise ValueError(
        "{}: header declares {} rows but {} were read".format(
            toread, NUM_TEST, num_read))

In [14]:
# Write the testing split into the file. EArray.append takes a block with
# any number of rows along the enlargeable first axis, so each array is
# stored with a single call instead of one append per sample.
myh5.root.test.inputs.append(data_X)
myh5.root.test.labels.append(data_y)

print(myh5)

data/iris/data.h5 (File) 'Iris data'
Last modif.: 'Fri Jul 27 11:06:54 2018'
Object Tree: 
/ (RootGroup) 'Iris data'
/test (Group) 'Testing data'
/test/inputs (EArray(30, 4)) 'Input images'
/test/labels (EArray(30, 1)) 'Label values'
/train (Group) 'Training data'
/train/inputs (EArray(120, 4)) 'Input images'
/train/labels (EArray(120, 1)) 'Label values'



In [15]:
# Both splits have been written; close the file handle opened earlier.
myh5.close()

___