#### This class acts as a generator that produces batches of numpy arrays and labels read from a directory
#### To use it, first instantiate the class and call .flow() on it:
#### mygen=data_gen(s_dir, batch_size, data_shape, shuffle, num_of_classes, categorical).flow() where
#### s_dir is the full path to the directory containing two subdirectories named 'data' and 'labels'
####    the data subdirectory holds the arrays with extension .npy
####   the labels subdirectory holds the arrays of integer label values with extension .npy
#### batch_size is an integer giving the number of samples in each batch the generator returns
#### data_shape is a tuple defining the shape of the arrays in the data subdirectory
#### shuffle is a boolean. If True the data for each batch is randomly selected. If set to False
#### the data is selected in sequential order as present in the directories
#### num_of_classes is an integer specifying the number of classes; label values are expected
#### to lie in the range 0 to num_of_classes-1. Only used when categorical=True
#### categorical is a boolean. If False labels are provided as integers
#### If categorical=True labels are provided as one-hot vectors
#### .flow causes the generator to read in the data and produce the batch output of the generator
#### The outputs of the generator are arrays, labels (array_paths and label_paths are not yielded;
#### they are available as attributes of the data_gen instance)
#### arrays are the input arrays with shape (batch_size,) + data_shape
#### labels are the labels with shape (batch_size,) when categorical=False
#### and shape (batch_size, num_of_classes) when categorical=True
#### The data_gen instance has two attributes which may be useful
#### .label_paths contains a sequentially ordered list of the label file names of the most recent batch
#### .array_paths contains a sequentially ordered list of the array file names of the most recent batch
#### array_paths is an ordered list associated with the input arrays specifying the array file name
#### label_paths is an ordered list associated with the input labels specifying the label file name

#### Import needed modules

In [1]:
import os
import shutil
import numpy as np

#### Define data_gen class

In [2]:
class data_gen():
    """Batch generator for ``.npy`` arrays and integer labels stored on disk.

    ``s_dir`` must contain two subdirectories, ``data`` and ``labels``. The
    n-th file (in sorted name order) of ``data`` is paired with the n-th file
    of ``labels``, so matching files should share a name.

    Parameters
    ----------
    s_dir : str
        Directory holding the ``data`` and ``labels`` subdirectories.
    batch_size : int
        Number of samples in every batch.
    data_shape : tuple of int
        Shape of a single array stored in the ``data`` subdirectory.
    shuffle : bool
        If True every sample of a batch is drawn at random (with
        replacement); otherwise files are read sequentially, wrapping around
        to the first file when the end is reached.
    num_of_classes : int
        Length of the one-hot vectors; only used when ``categorical`` is True.
    categorical : bool
        If True labels are returned as one-hot rows, otherwise as scalars.

    Attributes
    ----------
    array_paths, label_paths : list of str
        File names of the arrays / labels of the most recent batch, in batch
        order.
    data_array, label_array : numpy.ndarray
        The most recently produced batch.
    """

    def __init__(self, s_dir, batch_size, data_shape,shuffle, num_of_classes, categorical):
        self.batch_index = 0
        self.s_dir = s_dir
        self.batch_size = batch_size
        self.data_dir = os.path.join(self.s_dir, 'data')
        self.label_dir = os.path.join(self.s_dir, 'labels')
        # os.listdir returns names in arbitrary order; sort both listings so
        # that index m refers to the same sample in each of them.
        self.data_dir_list = sorted(os.listdir(self.data_dir))
        self.label_dir_list = sorted(os.listdir(self.label_dir))
        self.length = len(self.data_dir_list)
        self.data_shape = data_shape
        self.shuffle = shuffle
        self.num_of_classes = num_of_classes
        self.categorical = categorical
        self.label_paths = []
        self.array_paths = []
        self.data_array, self.label_array = self._new_buffers()

    def _new_buffers(self):
        """Return zero-filled (data, label) arrays sized for one batch."""
        data_buffer = np.zeros((self.batch_size,) + tuple(self.data_shape))
        if self.categorical:
            label_buffer = np.zeros((self.batch_size, self.num_of_classes))
        else:
            label_buffer = np.zeros((self.batch_size))
        return data_buffer, label_buffer

    def flow(self):
        """Yield ``(data_array, label_array)`` batches indefinitely.

        Each batch continues where the previous one stopped (the position is
        kept on the instance), so both ``next(gen.flow())`` called repeatedly
        and iterating over a single ``gen.flow()`` walk through the data.

        Raises
        ------
        ValueError
            If the ``data`` subdirectory contains no files.
        """
        if self.length == 0:
            raise ValueError('no data files found in ' + self.data_dir)
        while True:
            # Allocate new buffers for every batch so that batches already
            # handed to the caller are not overwritten by later ones.
            self.data_array, self.label_array = self._new_buffers()
            self.label_paths = []
            self.array_paths = []
            start = self.batch_index * self.batch_size
            for k in range(self.batch_size):
                if self.shuffle:
                    # the upper bound of randint is exclusive
                    m = np.random.randint(0, high=self.length)
                else:
                    # wrap around to the first file after the last one
                    m = (start + k) % self.length
                path_to_data = os.path.join(self.data_dir, self.data_dir_list[m])
                path_to_labels = os.path.join(self.label_dir, self.label_dir_list[m])
                self.label_paths.append(os.path.basename(path_to_labels))
                self.array_paths.append(os.path.basename(path_to_data))
                self.data_array[k] = np.load(path_to_data)
                label_value = np.load(path_to_labels)
                if not self.categorical:
                    self.label_array[k] = label_value
                else:
                    # one-hot row: 1 where the class index equals the label;
                    # a label outside 0..num_of_classes-1 gives an all-zero row
                    self.label_array[k] = np.arange(self.num_of_classes) == label_value

            self.batch_index = self.batch_index + 1
            yield (self.data_array, self.label_array)
        

### Example of use

#### Create directories c:\temp\arrays, c:\temp\arrays\data and c:\temp\arrays\labels
#### create an array of shape (70, 28, 28,3)

In [3]:
# Build a small synthetic data set on disk: `length` arrays of shape
# (28, 28, 3) under <save_dir>/data and one integer label per array under
# <save_dir>/labels, with matching file names.
length=70 # make 70 arrays of data and labels
num_of_classes=10
# randint's upper bound is exclusive, so high=2 produces values 0 and 1
data=np.random.randint(0, 2, size=(length, 28, 28, 3), dtype=int)
# labels cycle through 0 .. num_of_classes-1
label_list=[i % num_of_classes for i in range(length)]
labels=np.array(label_list)
save_dir=r'c:\temp\arrays'
data_file_path=os.path.join(save_dir, 'data')
label_file_path=os.path.join(save_dir, 'labels')
# start from a clean directory tree; create it whether or not it existed before
if os.path.isdir(save_dir):
    shutil.rmtree(save_dir)
os.makedirs(data_file_path)
os.makedirs(label_file_path)
for i in range(data.shape[0]):
    # zero-pad the file names to three digits so that sorting the names
    # preserves the numeric order when the files are read back
    file_name=str(i + 1).zfill(3) + '.npy'
    path_to_data=os.path.join(data_file_path, file_name)
    np.save(path_to_data, data[i])
    path_to_label=os.path.join(label_file_path, file_name)
    np.save(path_to_label, labels[i])

#### Instantiate an instance of the generator

In [4]:
# Settings for the example generator: read the arrays written above in
# sequential order, 30 per batch, and return the labels as one-hot vectors.
s_dir=r'c:\temp\arrays'
batch_size, data_shape = 30, (28,28,3)
shuffle, categorical = False, True
mygen=data_gen(
    s_dir=s_dir,
    batch_size=batch_size,
    data_shape=data_shape,
    shuffle=shuffle,
    num_of_classes=num_of_classes,
    categorical=categorical,
)

#### use .flow on the instance to produce a batch of data

In [5]:
# Pull one more batch than fits into the data set so the wrap-around back to
# the first file becomes visible in the printed file names.
for k in range(length//batch_size + 1):
    data, labels = next(mygen.flow())
    print ('Batch number ', k)
    # show every label of the batch next to the file it was read from
    for label_row, label_file in zip(labels[:batch_size], mygen.label_paths[:batch_size]):
        print ('     ', label_row, '   ', label_file)
        if label_file == '070.npy':
            print ('       NOTICE GENERATOR RECYCLES')
    

Batch number  0
      [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]     001.npy
      [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]     002.npy
      [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]     003.npy
      [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]     004.npy
      [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]     005.npy
      [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]     006.npy
      [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]     007.npy
      [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]     008.npy
      [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]     009.npy
      [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]     010.npy
      [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]     011.npy
      [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]     012.npy
      [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]     013.npy
      [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]     014.npy
      [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]     015.npy
      [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]     016.npy
      [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]     017.npy
      [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]     018.npy
      [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]     019.npy
      [0. 0. 0. 0. 0. 0. 0. 0. 0. 