In [1]:
import random
import numpy as np

In [2]:
# Build a circular list: fill b by cycling through a via a list of indexes
a = [1, 2, 3, 4]
b = [0] * 10

a_size, b_size = len(a), len(b)
lines_index = list(range(a_size))

index = 0
for i in range(b_size):
    # Past the last index: wrap around to the beginning
    index = 0 if index >= a_size else index
    b[i] = a[lines_index[index]]
    index += 1

print(b)

[1, 2, 3, 4, 1, 2, 3, 4, 1, 2]


### Shuffling the data order

In [4]:
# Circular traversal again, this time visiting the data in a shuffled order
a = [1, 2, 3, 4]
b = []

a_size, b_size = len(a), 10
lines_index = list(range(a_size))
print("Original order of index:",lines_index)

# Shuffling the index list changes the order of the circular list
# while leaving the original data untouched
random.shuffle(lines_index)
print(f'Shuffled order of index: {lines_index}')
print(f'New value order for first batch: {[a[index] for index in lines_index]}')

batch_counter, index = 1, 0
for i in range(b_size):
    if index >= a_size:
        # A full pass is complete: restart from 0 with a fresh ordering
        index = 0
        batch_counter += 1
        random.shuffle(lines_index)
        print(f'\nShuffled Indexes for Batch No. {batch_counter} :{lines_index}')
        print(f'Values for Batch No. {batch_counter}, {[a[index] for index in lines_index]}')

    b.append(a[lines_index[index]])
    index += 1

print()
print(f"Final value of b: {b}")

Original order of index: [0, 1, 2, 3]
Shuffled order of index: [0, 1, 3, 2]
New value order for first batch: [1, 2, 4, 3]

Shuffled Indexes for Batch No. 2 :[2, 3, 0, 1]
Values for Batch No. 2, [3, 4, 1, 2]

Shuffled Indexes for Batch No. 3 :[3, 2, 0, 1]
Values for Batch No. 3, [4, 3, 1, 2]

Final value of b: [1, 2, 4, 3, 3, 4, 1, 2, 4, 3]


#### Epochs
- An epoch is one complete pass of the algorithm over all the training samples
- Shuffling the samples at the start of each epoch reduces the variance
- This makes the model generalize better and overfit less



In [15]:
def data_generator(batch_size, data_x, data_y, shuffle=True):
    """
    Endlessly yield fixed-size batches drawn circularly from data_x / data_y.

    Input:
        batch_size - Integer describing the batch size (must be >= 1)
        data_x - List containing samples (must not be empty)
        data_y - List containing labels, same length as data_x
        shuffle - Shuffle the data order; when true, the visiting order is
                  shuffled before the first pass and again at every wrap-around
    Output:
        a tuple containing 2 elements
        X - List of dim (batch_size) of samples
        Y - List of dim (batch_size) of labels
    Raises:
        ValueError - if batch_size < 1, data_x is empty, or data_x and data_y
                     have different lengths. Because this is a generator, the
                     checks run on the first call to next().
    """
    data_lng = len(data_x)

    # Fail loudly instead of yielding empty batches forever
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    # An empty dataset can never fill a batch
    if data_lng == 0:
        raise ValueError("data_x must contain at least one sample")
    # Samples and labels are paired by position, so the lengths must agree
    if len(data_y) != data_lng:
        raise ValueError(
            f"data_x and data_y must have the same length, "
            f"got {data_lng} and {len(data_y)}"
        )

    index_list = list(range(data_lng))
    if shuffle:
        random.shuffle(index_list)  # In-place shuffle of the visiting order
    index = 0

    while True:
        # Fresh lists for every batch so previously yielded batches stay intact
        X = []
        Y = []

        for _ in range(batch_size):
            # Wrap the index each time that we reach the end of the list
            if index >= data_lng:
                index = 0
                if shuffle:
                    random.shuffle(index_list)

            position = index_list[index]
            X.append(data_x[position])
            Y.append(data_y[position])
            index += 1

        yield X, Y

In [16]:
def test_data_generator():
    """Check that the unshuffled generator wraps around a 4-sample dataset."""
    x = [1, 2, 3, 4]
    y = [xi**2 for xi in x]

    generator = data_generator(3, x, y, shuffle=False)

    # Batches of 3 over 4 samples: each batch continues where the previous one
    # stopped and wraps back to the first sample after the last one
    expected_batches = [
        (([1, 2, 3], [1, 4, 9]), "First batch does not match"),
        (([4, 1, 2], [16, 1, 4]), "Second batch does not match"),
        (([3, 4, 1], [9, 16, 1]), "Third batch does not match"),
        (([2, 3, 4], [4, 9, 16]), "Fourth batch does not match"),
    ]
    for expected, message in expected_batches:
        assert np.allclose(next(generator), expected), message

    print("\033[92mAll tests passed!")

test_data_generator()

[92mAll tests passed!
