# Create your own dataset:
  * override two methods: \_\_len\_\_ and \_\_getitem\_\_

## 1.1 A minimal working example 

In [3]:
# A minimal working example

from torch.utils.data import Dataset

class NumberDataset(Dataset):
    """Toy map-style dataset whose samples are the integers 1 through 1000."""

    def __init__(self):
        # Materialise the range as a list so that indexing and slicing behave
        # exactly like an ordinary Python sequence.
        self.samples = [*range(1, 1001)]

    def __len__(self):
        """Return the number of stored samples."""
        sample_count = len(self.samples)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample(s) selected by ``idx`` (an int or a slice)."""
        picked = self.samples[idx]
        return picked
    
    
if __name__ == '__main__':
    # Demo: exercise len(), integer indexing and slicing on the toy dataset.
    dataset = NumberDataset()
    print(len(dataset))      # number of samples
    print(dataset[100])      # a single sample, looked up by position
    print(dataset[122:361])  # a slice is forwarded to the underlying list

1000
101
[123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 

## 1.2 Extending the dataset


In [7]:
class NumberDataset_ex(Dataset):
    """Toy map-style dataset over the integers in ``range(low, high)``.

    Args:
        low: First integer in the dataset (inclusive).
        high: End of the range (exclusive).
    """

    def __init__(self, low, high):
        # Keep a concrete list so that slices come back as plain lists.
        self.samples = [*range(low, high)]

    def __len__(self):
        """Return the number of stored samples."""
        sample_count = len(self.samples)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample(s) selected by ``idx`` (an int or a slice)."""
        picked = self.samples[idx]
        return picked
    

if __name__ == '__main__':
    # Demo: the same dataset, now parameterised by its bounds (high is exclusive).
    dataset = NumberDataset_ex(10,20)
    print(len(dataset))   # number of samples
    print(dataset[1])     # second sample
    print(dataset[2:10])  # slice of the underlying list

10
11
[12, 13, 14, 15, 16, 17, 18, 19]


## 2. The Elder Scrolls dataset

sorted by race & gender



In [10]:
import os
from torch.utils.data import Dataset

class TESNamesDataset(Dataset):
    """Dataset of names stored on disk as ``<data_root>/<race>/<gender>``.

    Every line of every gender file becomes one ``(race, gender, name)``
    sample, where ``race`` is the sub-directory name and ``gender`` is the
    file name.

    Args:
        data_root: Directory containing one sub-directory per race.
    """

    def __init__(self, data_root):
        self.samples = []

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # index -> sample mapping reproducible across platforms and runs.
        for race in sorted(os.listdir(data_root)):
            race_folder = os.path.join(data_root, race)
            if not os.path.isdir(race_folder):
                # Stray files next to the race folders are not part of the
                # data; skip them instead of failing in the inner listdir().
                continue

            for gender in sorted(os.listdir(race_folder)):
                gender_filepath = os.path.join(race_folder, gender)

                with open(gender_filepath, 'r') as gender_file:
                    self.samples.extend(
                        (race, gender, name)
                        for name in gender_file.read().splitlines()
                    )

    def __len__(self):
        """Return the total number of names across all races and genders."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(race, gender, name)`` tuple(s) selected by ``idx``."""
        return self.samples[idx]
    

if __name__ == '__main__':
    # Demo: load the names from a local copy of the data and inspect one sample.
    dataset = TESNamesDataset('D:\\tes-names')
    print(len(dataset))   # total number of (race, gender, name) samples
    print(dataset[420])   # one sample tuple

19491
('Altmer', 'Female', 'Hanyarie')


In [11]:
# Slicing returns a plain list of (race, gender, name) tuples.
print(dataset[10:20])

[('Altmer', 'Female', 'Alanwe'), ('Altmer', 'Female', 'Alanya'), ('Altmer', 'Female', 'Alcalime'), ('Altmer', 'Female', 'Alcardawe'), ('Altmer', 'Female', 'Alcildilwe'), ('Altmer', 'Female', 'Alcorana'), ('Altmer', 'Female', 'Aldamaire'), ('Altmer', 'Female', 'Aldanya'), ('Altmer', 'Female', 'Aldarenya'), ('Altmer', 'Female', 'Aldewe')]


## 3. Flowing data with the DataLoader
While the Dataset class is a nice way of containing data systematically, it seems that in a training loop, we will need to index or slice the dataset's samples list. This is no better than what we would do for a typical list or NumPy matrix. Rather than going down that route, PyTorch supplies another utility class called the DataLoader, which acts as a data feeder for a Dataset object. The parallel I see here is the data generator flow function in Keras, if you are familiar with that. The DataLoader takes a Dataset object (and, therefore, any subclass extending it) and several other optional parameters (listed in the PyTorch DataLoader docs). Among the parameters, we have the option of shuffling the data, determining the batch size and the number of workers to load data in parallel. Here is a simple example of flowing through the TESNamesDataset in an enumerate loop.

In [13]:
from torch.utils.data import DataLoader

# Wrap the dataset in a loader that yields shuffled batches of 5 samples,
# loading in the main process (num_workers = 0).
dataloader = DataLoader(dataset, batch_size = 5, shuffle = True, num_workers = 0)

# Each batch groups the samples field by field (races, genders, names), as the
# printed output below shows.
for i, batch in enumerate(dataloader):
    print(i, batch)

0 [('Bosmer', 'Nord', 'Argonian', 'Khajiit', 'Altmer'), ('Male', 'Male', 'Male', 'Female', 'Male'), ('Sandaenion', 'Nielsold', 'Dal-Eekwa', 'Sashima', 'Itermerel')]
1 [('Altmer', 'Khajiit', 'Nord', 'Breton', 'Redguard'), ('Male', 'Male', 'Male', 'Male', 'Female'), ('Fiirofalion', "J'mhad", 'Pactur', 'Harbin', 'Namasah')]
2 [('Orc', 'Nord', 'Khajiit', 'Dunmer', 'Khajiit'), ('Female', 'Male', 'Female', 'Male', 'Male'), ('Mornamph', 'Jon', 'Nakhtar', 'Malur', 'Omgh')]
3 [('Bosmer', 'Argonian', 'Khajiit', 'Argonian', 'Bosmer'), ('Male', 'Male', 'Male', 'Female', 'Male'), ('Lorchon', 'Right-Foot-Rock', 'Amsien', 'Seed-Neeus', 'Athragar')]
4 [('Breton', 'Dunmer', 'Nord', 'Khajiit', 'Nord'), ('Male', 'Female', 'Male', 'Male', 'Male'), ('Gondyn', 'Mirusu', 'Jakaral', 'Turan', 'Hrolvar')]
5 [('Dunmer', 'Orc', 'Redguard', 'Bosmer', 'Orc'), ('Male', 'Female', 'Female', 'Male', 'Male'), ('Nisimar', 'Dulug', 'Falorah', 'Rithrannir', 'Ghaz')]
6 [('Breton', 'Orc', 'Orc', 'Bosmer', 'Nord'), ('Male', '

1105 [('Orc', 'Khajiit', 'Bosmer', 'Dunmer', 'Khajiit'), ('Male', 'Male', 'Female', 'Female', 'Male'), ('Kurz', "Ab'dul", 'Uurwaerion', 'Nivene', "J'zuraar")]
1106 [('Redguard', 'Nord', 'Altmer', 'Breton', 'Altmer'), ('Male', 'Male', 'Female', 'Female', 'Female'), ('Namvar', 'Sig', 'Valsirenn', 'Chrystyvyra', 'Hanerye')]
1107 [('Bosmer', 'Bosmer', 'Altmer', 'Breton', 'Khajiit'), ('Female', 'Female', 'Male', 'Male', 'Male'), ('Aradraen', 'Iirdilin', 'Norarubel', 'Philien', 'Sinurabi')]
1108 [('Dunmer', 'Nord', 'Redguard', 'Redguard', 'Nord'), ('Male', 'Male', 'Female', 'Male', 'Female'), ('Muriil', 'Littrel', 'Jahhaira', 'Walik', 'Tolgredda')]
1109 [('Altmer', 'Orc', 'Khajiit', 'Redguard', 'Breton'), ('Female', 'Female', 'Male', 'Male', 'Female'), ('Penardil', 'Batul', 'Makmargo', 'Isleif', 'Olivie')]
1110 [('Khajiit', 'Nord', 'Nord', 'Altmer', 'Imperial'), ('Female', 'Male', 'Female', 'Male', 'Male'), ('Zalapu', 'Jofstrom', 'Nikolvara', 'Filpormo', 'Crispinus')]
1111 [('Bosmer', 'Orc',

1897 [('Breton', 'Imperial', 'Khajiit', 'Dunmer', 'Dunmer'), ('Male', 'Female', 'Female', 'Female', 'Female'), ('Frubert', 'Ariela', 'Thanala', 'Dralosa', 'Ramavel')]
1898 [('Redguard', 'Khajiit', 'Breton', 'Nord', 'Dunmer'), ('Female', 'Female', 'Male', 'Male', 'Female'), ("Dh'emka", 'Dawnwalker', 'Stephenn', 'Gar', 'Adansa')]
1899 [('Redguard', 'Argonian', 'Altmer', 'Bosmer', 'Breton'), ('Male', 'Male', 'Male', 'Female', 'Female'), ('Khudriran', 'Sings-in-Shadows', 'Nonvarel', 'Namradis', 'Amarie')]
1900 [('Redguard', 'Nord', 'Nord', 'Altmer', 'Redguard'), ('Male', 'Male', 'Female', 'Male', 'Female'), ('Irwad', 'Bren', 'Vaereid', 'Henaril', 'Alida')]
1901 [('Orc', 'Imperial', 'Imperial', 'Redguard', 'Khajiit'), ('Male', 'Female', 'Female', 'Female', 'Female'), ('Borzighu', 'Aemilia', 'Placidia', 'Rashihi', 'Elanita')]
1902 [('Nord', 'Redguard', 'Nord', 'Altmer', 'Argonian'), ('Male', 'Female', 'Male', 'Male', 'Female'), ('Sahun', 'Anireh', 'Orstag', 'Linwirmion', 'Spills-No-Drinks')]

2854 [('Altmer', 'Bosmer', 'Altmer', 'Dunmer', 'Argonian'), ('Male', 'Male', 'Male', 'Female', 'Male'), ('Vanando', 'Monynen', 'Salmo', 'Bivessa', 'Chilwir')]
2855 [('Altmer', 'Nord', 'Breton', 'Orc', 'Khajiit'), ('Male', 'Male', 'Male', 'Female', 'Male'), ('Erannin', 'Logvaar', 'Benjamund', 'Logru', "J'Hanir")]
2856 [('Altmer', 'Nord', 'Breton', 'Altmer', 'Breton'), ('Male', 'Female', 'Male', 'Female', 'Female'), ('Norfando', 'Lydia', 'Adalard', 'Nesaranwe', 'Vannyn')]
2857 [('Argonian', 'Redguard', 'Bosmer', 'Breton', 'Redguard'), ('Male', 'Male', 'Male', 'Female', 'Male'), ('Soft-Scale', 'Ahad', 'Orchelor', 'Valaunce', 'Anjan')]
2858 [('Bosmer', 'Redguard', 'Altmer', 'Nord', 'Bosmer'), ('Male', 'Female', 'Male', 'Male', 'Female'), ('Mongor', 'Seren', 'Solilian', 'Gjudir', 'Kirstar')]
2859 [('Altmer', 'Dunmer', 'Dunmer', 'Argonian', 'Redguard'), ('Male', 'Male', 'Male', 'Female', 'Female'), ('Ganra', 'Elms', 'Baladar', 'Nikomeda', 'Hatmarah')]
2860 [('Orc', 'Dunmer', 'Redguard', 'Orc

3864 [('Breton', 'Redguard', 'Bosmer', 'Bosmer', 'Orc'), ('Male', 'Female', 'Male', 'Male', 'Female'), ('Wynn', 'Samayna', 'Artan', 'Aerolor', 'Urshra')]
3865 [('Redguard', 'Nord', 'Argonian', 'Nord', 'Bosmer'), ('Male', 'Female', 'Male', 'Male', 'Female'), ('Dinok', 'Larula', 'Diocletian', 'Llewellyn', 'Dondreth')]
3866 [('Argonian', 'Bosmer', 'Nord', 'Argonian', 'Orc'), ('Male', 'Male', 'Male', 'Female', 'Female'), ('Grinds-Nails', 'Glonnir', 'Orgnar', 'Keesa', 'Gogul')]
3867 [('Breton', 'Altmer', 'Khajiit', 'Altmer', 'Dunmer'), ('Female', 'Male', 'Male', 'Male', 'Male'), ('Charmela', 'Landorganil', 'Kaushjhargo', 'Engulion', 'Gethan')]
3868 [('Dunmer', 'Altmer', 'Altmer', 'Dunmer', 'Imperial'), ('Male', 'Male', 'Male', 'Male', 'Male'), ('Garer', 'Talonir', 'Minducil', 'Rararyn', 'Musius')]
3869 [('Bosmer', 'Nord', 'Dunmer', 'Altmer', 'Nord'), ('Male', 'Female', 'Female', 'Male', 'Female'), ('Paliman', 'Hrordis', 'Mehra', 'Kelkemmedalf', 'Elfriede')]
3870 [('Imperial', 'Khajiit', 'Kh

Hang on, that is not what it looked like when we sliced our dataset earlier! What’s going on here? Well, as it turns out, the DataLoader loads the data in a systematic way such that we stack data vertically instead of horizontally. This is particularly useful for flowing batches of tensors, as tensors stack vertically (i.e. in the first dimension) to form batches. The DataLoader also handled the shuffling of data for you, so there's no need to shuffle matrices or keep track of indices when feeding data.

## 4. Flowing tensors and other types 

In [19]:
import torch
class NumbersDataset_tensor(Dataset):
    """Dataset over ``range(low, high)`` whose items mix ints and tensors.

    Each item is a tuple ``(n, successors, noisy)``: the integer itself, a
    float tensor holding its next four successors, and the same tensor with
    standard-normal noise added.
    """

    def __init__(self, low, high):
        self.samples = [*range(low, high)]

    def __len__(self):
        """Return the number of integers in the dataset."""
        sample_count = len(self.samples)
        return sample_count

    def __getitem__(self, idx):
        """Build the ``(n, successors, noisy)`` tuple for position ``idx``."""
        base = self.samples[idx]
        # [0., 1., 2., 3.] shifted so that it starts right after ``base``.
        offsets = torch.arange(4, dtype=torch.float32)
        successors = offsets + base + 1
        # Fresh noise is drawn on every access, so repeated lookups differ.
        noisy = torch.randn(4) + successors
        return base, successors, noisy
    
if __name__ == '__main__':
    # Demo: pull a single shuffled batch of 5 items to see how the loader
    # combines the int and tensor fields of each sample.
    dataset = NumbersDataset_tensor(100,120)
    dataloader = DataLoader(dataset,batch_size =5, num_workers=0,shuffle =True )
    print(next(iter(dataloader)))  # first batch only

[tensor([117, 100, 104, 103, 105]), tensor([[118., 119., 120., 121.],
        [101., 102., 103., 104.],
        [105., 106., 107., 108.],
        [104., 105., 106., 107.],
        [106., 107., 108., 109.]]), tensor([[119.3076, 119.6959, 119.6208, 120.5671],
        [102.0798, 102.4947, 102.6132, 102.5563],
        [104.3082, 106.3842, 106.1872, 108.5922],
        [104.5020, 105.9822, 105.6308, 107.9713],
        [105.4833, 105.2405, 107.1628, 108.8143]])]


In general, the loader will try stacking batches of 1-dimensional tensors into 2-dimensional tensors, batches of 2-dimensional tensors into 3-dimensional tensors, and so on. At this point, I implore you to realize the life-changing impact this has on traditional data handling in other machine learning libraries and how clean the solution looks. It is quite incredible! If you are not sharing my sentiments, well, at least you now know one other method that you can have in your toolbox.

## 5. Completing the TES dataset code

Let’s get back to the TES names dataset. It seems like the initialization function is a little dirty (at least by my standards), and there should really be a way to make the code look better. Remember that I said the PyTorch API is Pythonic? Well, there is no stopping you from declaring other utility functions in your dataset or even making internal functions for initialization. To clean up the TES names dataset code, we will update the TESNamesDataset code to achieve the following:
* update the constructor to include a character set,
* create an internal function to initialize the dataset,
* create a utility function that converts nominal variables into one-hot tensors,
* create a utility function that converts a sample into a set of three one-hot tensors representing the race, gender, and name.


In [27]:
import os
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
import torch

class TESNamesDataset(Dataset):
    """Names dataset that serves each sample as three one-hot tensors.

    The data is read from ``<data_root>/<race>/<gender>`` files, one name per
    line. Races and genders are encoded from the directory and file names
    found on disk; name characters are encoded against ``charset``.

    Args:
        data_root: Directory containing one sub-directory per race.
        charset: Iterable of every character that may appear in a name.
    """

    def __init__(self, data_root, charset):
        self.data_root = data_root
        self.charset = charset
        self.samples = []
        self.race_codec = LabelEncoder()
        self.gender_codec = LabelEncoder()
        self.char_codec = LabelEncoder()
        self._init_dataset()

    def __len__(self):
        """Return the total number of names across all races and genders."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(race, gender, name)`` at ``idx`` as one-hot tensors."""
        race, gender, name = self.samples[idx]
        return self.one_hot_sample(race, gender, name)

    def _init_dataset(self):
        """Read all samples from disk and fit the three label codecs."""
        races = set()
        genders = set()

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # index -> sample mapping reproducible across platforms and runs.
        for race in sorted(os.listdir(self.data_root)):
            race_folder = os.path.join(self.data_root, race)
            if not os.path.isdir(race_folder):
                # Stray files next to the race folders are not part of the
                # data and must not become race labels.
                continue
            races.add(race)

            for gender in sorted(os.listdir(race_folder)):
                gender_filepath = os.path.join(race_folder, gender)
                genders.add(gender)

                with open(gender_filepath, 'r') as gender_file:
                    self.samples.extend(
                        (race, gender, name)
                        for name in gender_file.read().splitlines()
                    )

        self.race_codec.fit(list(races))
        self.gender_codec.fit(list(genders))
        self.char_codec.fit(list(self.charset))

    def to_one_hot(self, codec, values):
        """Encode ``values`` with ``codec`` and return one one-hot row per value.

        Args:
            codec: Fitted encoder exposing ``transform`` and ``classes_``.
            values: Sequence of labels to encode.

        Returns:
            Float tensor of shape ``(len(values), len(codec.classes_))``.
        """
        # Cast explicitly to long: for an empty ``values`` the encoder may hand
        # back an empty array with a non-integer dtype, which is not a valid
        # tensor index. With the cast, an empty name yields a (0, n) tensor.
        value_idxs = torch.as_tensor(codec.transform(values), dtype=torch.long)
        return torch.eye(len(codec.classes_))[value_idxs]

    def one_hot_sample(self, race, gender, name):
        """Convert one sample into ``(t_race, t_gender, t_name)`` tensors.

        ``t_race`` and ``t_gender`` each have a single row; ``t_name`` has one
        row per character, so its first dimension varies with the name length.
        """
        t_race = self.to_one_hot(self.race_codec, [race])
        t_gender = self.to_one_hot(self.gender_codec, [gender])
        t_name = self.to_one_hot(self.char_codec, list(name))
        return t_race, t_gender, t_name
    
if __name__ == '__main__':
    import string

    # Demo: build the one-hot dataset from a local copy of the data.
    data_root = 'D:\\tes-names'
    # Characters allowed in a name: ASCII letters plus hyphen, apostrophe, space.
    charset = string.ascii_letters + "-' "
    dataset = TESNamesDataset(data_root, charset)
    print(len(dataset))   # total number of samples
    print(dataset[420])   # (race, gender, name) as one-hot tensors

19491
(tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]), tensor([[1., 0.]]), tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 