# Notes

- What we need for a good replay buffer: fixed-size, FIFO behavior, O(1) insertion at the end, O(1) sampling. Limited memory footprint.
- [deque](https://docs.python.org/3/library/collections.html#collections.deque) has O(1) insertion time at the end, but O(n) access time (which made me doubt its ability to make a good replay buffer and try an np.array-based solution)
- Can anybody explain the difference between the times measured by tqdm and timeit?
- When we draw a mini-batch for DQN, it would be best to receive (separately) an array of states only, an array of actions, an array of rewards, an array of next states and a last array of "done", that we can pass these to the Q-network. What's the best way of doing that? Store them separately?

# Setting the frame

In [1]:
import gym
from gym import logger
import numpy as np
logger.set_level(gym.logger.DISABLED)

In [2]:
cartpole = gym.make('CartPole-v1')

In [3]:
state = cartpole.reset()
action = cartpole.action_space.sample()
next_state, reward, done, _ = cartpole.step(action)

In [4]:
replay_buffer_size = int(1e6)
nb_samples = int(2e6)
nb_batches = int(1e4)
batch_size = 50

# Testing functions

In [5]:
from tqdm import trange

def test_insertion_tqdm(buffer, nb_samples):
    state = cartpole.reset()
    for _ in trange(nb_samples):
        buffer.append(state, action, reward, next_state, done)

def test_sampling_tqdm(buffer, nb_batches):
    for _ in trange(nb_batches):
        buffer.sample(batch_size)

In [6]:
import timeit
import gc

def test_insertion_timeit(buffer, nb_samples):
    print("Insertion of", nb_samples, "samples:", 
      timeit.timeit('memory.append(state,action,reward,next_state,done)', 
                    globals=globals(), 
                    setup='gc.enable()', 
                    number=nb_samples))

def test_sampling_timeit(buffer, nb_batches):
    print("Sampling of", nb_batches, "batches:",
          timeit.timeit('memory.sample(batch_size)', 
                        globals=globals(), 
                        setup='gc.enable()', 
                        number=nb_batches))

# Replay buffer classes

In [37]:
from collections import deque, namedtuple
Transition = namedtuple('Transition', 
                        ('state', 'action', 'reward', 'next_state', 'done'))

# But for the sake of the exercise, we will wrap this in a dedicated class.

import random
    
class ReplayBuffer1(object):
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)
    def append(self, *args):
        self.memory.append(Transition(*args))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)
    def capacity(self):
        return self.memory.maxlen
    
class ReplayBuffer2(object):
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)
    def capacity(self):
        return self.memory.maxlen
    
class ReplayBuffer3(deque):
    def __init__(self, capacity):
        super().__init__(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        super().append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        return random.sample(self, batch_size)
    
class ReplayBuffer4(deque):
    def __init__(self, capacity):
        super().__init__(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        super().append(Transition(state, action, reward, next_state, done))
    def sample(self, batch_size):
        return random.sample(self, batch_size)
# An even more optimized version can be found in TODO.py 
# and many libraries (such as gym) provide ready-to-use tools replay buffers

class ReplayBuffer5(object):
    def __init__(self, capacity):
        self.capacity = capacity # capacity of the buffer
        self.data = np.empty(capacity, dtype=Transition)
        self.index = 0 # index of the next cell to be filled
        self.size = 0 # number of elements in the buffer
        
    def append(self, *args):
        self.data[self.index] = Transition(*args)
        self.index = (self.index + 1) % self.capacity
        if self.size < self.capacity:
            self.size+=1
        
    def sample(self, batch_size):
        #indices = np.random.choice(self.size, size=batch_size, replace=False)
        #return self.memory[indices]
        return np.random.choice(self.data[:self.size], size=batch_size, replace=False)
    
    def __len__(self):
        return self.size
    
class ReplayBuffer6(object):
    def __init__(self, capacity):
        self.data = deque(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        self.data.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = random.sample(self.data, batch_size)
        return list(map(np.array, list(zip(*batch))))
    def __len__(self):
        return len(self.data)
    def capacity(self):
        return self.data.maxlen


# Pseudo-unit testing

In [8]:
# init
memory = ReplayBuffer4(replay_buffer_size)
print(memory)
# len
print(len(memory))
# append
memory.append(state, action, reward, next_state, done)
print(memory)
print(len(memory))

deque([], maxlen=1000000)
0
deque([Transition(state=array([-0.02472349,  0.04229546, -0.04424957,  0.01717173]), action=0, reward=1.0, next_state=array([-0.02387758, -0.15216489, -0.04390614,  0.2955716 ]), done=False)], maxlen=1000000)
1


In [9]:
# init
memory = ReplayBuffer5(replay_buffer_size)
print(memory.data)
# len
print(len(memory))
# append
memory.append(state, action, reward, next_state, done)
print(memory.data)
print(len(memory))

[None None None ... None None None]
0
[Transition(state=array([-0.02472349,  0.04229546, -0.04424957,  0.01717173]), action=0, reward=1.0, next_state=array([-0.02387758, -0.15216489, -0.04390614,  0.2955716 ]), done=False)
 None None ... None None None]
1


# Time testing

In [10]:
memory = ReplayBuffer1(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_insertion_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:03<00:00, 617242.12it/s]
100%|██████████| 10000/10000 [00:16<00:00, 603.79it/s]


Insertion of 2000000 samples: 1.9286490439990303
Insertion of 10000 samples: 0.009928485000273213


In [11]:
memory = ReplayBuffer2(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_insertion_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1365707.44it/s]
100%|██████████| 10000/10000 [00:17<00:00, 587.06it/s]


Insertion of 2000000 samples: 0.797827681999479
Insertion of 10000 samples: 0.004038371000206098


In [12]:
memory = ReplayBuffer3(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_insertion_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1180981.61it/s]
100%|██████████| 10000/10000 [00:16<00:00, 596.92it/s]


Insertion of 2000000 samples: 1.0713992589990085
Insertion of 10000 samples: 0.005935952998697758


In [13]:
memory = ReplayBuffer4(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_insertion_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:03<00:00, 535732.55it/s]
100%|██████████| 10000/10000 [00:17<00:00, 573.48it/s]


Insertion of 2000000 samples: 2.129768590002641
Insertion of 10000 samples: 0.011250553998252144


In [14]:
memory = ReplayBuffer5(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_insertion_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:04<00:00, 456939.44it/s]
100%|██████████| 10000/10000 [06:21<00:00, 26.22it/s]


Insertion of 2000000 samples: 3.1594234669973957
Insertion of 10000 samples: 0.016628654000669485


In [None]:
memory = ReplayBuffer6(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_insertion_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1348870.92it/s]
 56%|█████▌    | 5581/10000 [00:10<00:07, 578.96it/s]