# Notes

- What we need for a good replay buffer: fixed-size, FIFO behavior, O(1) insertion at the end, O(1) sampling. Limited memory footprint.
- [deque](https://docs.python.org/3/library/collections.html#collections.deque) has O(1) insertion time at the end, but O(n) access time (which made me doubt its ability to make a good replay buffer and try an np.array-based solution)
- When we draw a mini-batch for DQN, it would be best to receive (separately) an array of states only, an array of actions, an array of rewards, an array of next states and a last array of "done", that we can pass these to the Q-network. What's the best way of doing that? Store them separately?

# Setting the frame

In [1]:
import gym
from gym import logger
import numpy as np
logger.set_level(gym.logger.DISABLED)
import torch

In [2]:
cartpole = gym.make('CartPole-v1')

In [3]:
state = cartpole.reset()
action = cartpole.action_space.sample()
next_state, reward, done, _ = cartpole.step(action)

In [4]:
replay_buffer_size = int(1e6)
nb_samples = int(2e6)
nb_batches = int(1e4)
batch_size = 50

# Testing functions

In [5]:
from tqdm import trange

def test_insertion_tqdm(buffer, nb_samples):
    state = cartpole.reset()
    for _ in trange(nb_samples):
        buffer.append(state, action, reward, next_state, done)

def test_sampling_tqdm(buffer, nb_batches):
    for _ in trange(nb_batches):
        buffer.sample(batch_size)

In [6]:
import timeit
import gc

def test_insertion_timeit(buffer, nb_samples):
    print("Insertion of", nb_samples, "samples:", 
      timeit.timeit('memory.append(state,action,reward,next_state,done)', 
                    globals=globals(), 
                    setup='gc.enable()', 
                    number=nb_samples))

def test_sampling_timeit(buffer, nb_batches):
    print("Sampling of", nb_batches, "batches:",
          timeit.timeit('memory.sample(batch_size)', 
                        globals=globals(), 
                        setup='gc.enable()', 
                        number=nb_batches))

# Replay buffer classes

In [7]:
from collections import deque, namedtuple
Transition = namedtuple('Transition', 
                        ('state', 'action', 'reward', 'next_state', 'done'))

# But for the sake of the exercise, we will wrap this in a dedicated class.

import random
    
class ReplayBuffer1(object):
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)
    def append(self, *args):
        self.memory.append(Transition(*args))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)
    def capacity(self):
        return self.memory.maxlen
    
class ReplayBuffer2(object):
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)
    def capacity(self):
        return self.memory.maxlen
    
class ReplayBuffer3(deque):
    def __init__(self, capacity):
        super().__init__(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        super().append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        return random.sample(self, batch_size)
    
class ReplayBuffer4(deque):
    def __init__(self, capacity):
        super().__init__(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        super().append(Transition(state, action, reward, next_state, done))
    def sample(self, batch_size):
        return random.sample(self, batch_size)

class ReplayBuffer5(object):
    def __init__(self, capacity):
        self.capacity = capacity # capacity of the buffer
        self.data = np.empty(capacity, dtype=Transition)
        self.index = 0 # index of the next cell to be filled
        self.size = 0 # number of elements in the buffer
        
    def append(self, *args):
        self.data[self.index] = Transition(*args)
        self.index = (self.index + 1) % self.capacity
        if self.size < self.capacity:
            self.size+=1
        
    def sample(self, batch_size):
        #indices = np.random.choice(self.size, size=batch_size, replace=False)
        #return self.memory[indices]
        return np.random.choice(self.data[:self.size], size=batch_size, replace=False)
    
    def __len__(self):
        return self.size
    
class ReplayBuffer6(object):
    def __init__(self, capacity):
        self.data = deque(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        self.data.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = random.sample(self.data, batch_size)
        return list(map(np.array, list(zip(*batch))))
    def __len__(self):
        return len(self.data)
    def capacity(self):
        return self.data.maxlen

class ReplayBuffer7(deque):
    def __init__(self, capacity):
        super().__init__(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        super().append(Transition(state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = random.sample(self, batch_size)
        return list(map(np.array, list(zip(*batch))))
    def capacity(self):
        return self.maxlen
    
class ReplayBuffer8(object):
    def __init__(self, capacity):
        self.data = deque(maxlen=capacity)
    def append(self, state, action, reward, next_state, done):
        self.data.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = random.sample(self.data, batch_size)
        return list(map(torch.Tensor, list(zip(*batch))))
    def __len__(self):
        return len(self.data)
    def capacity(self):
        return self.data.maxlen

In [8]:
class ReplayBuffer9:
    def __init__(self, capacity):
        self.capacity = capacity # capacity of the buffer
        self.data = []
        self.index = 0 # index of the next cell to be filled
        
    def append(self, s, a, r, s_, d):
        if len(self.data) < self.capacity:
            self.data.append(None)
        self.data[self.index] = (s, a, r, s_, d)
        self.index = (self.index + 1) % self.capacity
        
    def sample(self, batch_size):
        return random.sample(self.data, batch_size)
    
    def __len__(self):
        return len(self.data)
    
class ReplayBuffer10:
    def __init__(self, capacity):
        self.capacity = capacity # capacity of the buffer
        self.data = []
        self.index = 0 # index of the next cell to be filled
    def append(self, s, a, r, s_, d):
        if len(self.data) < self.capacity:
            self.data.append(None)
        self.data[self.index] = (s, a, r, s_, d)
        self.index = (self.index + 1) % self.capacity
    def sample(self, batch_size):
        batch = random.sample(self.data, batch_size)
        return list(map(torch.Tensor, list(zip(*batch))))
    def __len__(self):
        return len(self.data)

# Pseudo-unit testing

In [9]:
# init
memory = ReplayBuffer4(replay_buffer_size)
print(memory)
# len
print(len(memory))
# append
memory.append(state, action, reward, next_state, done)
print(memory)
print(len(memory))

deque([], maxlen=1000000)
0
deque([Transition(state=array([-0.01853433, -0.0036665 ,  0.00995653, -0.02980118]), action=1, reward=1.0, next_state=array([-0.01860766,  0.19131126,  0.00936051, -0.31932616]), done=False)], maxlen=1000000)
1


In [10]:
# init
memory = ReplayBuffer5(replay_buffer_size)
print(memory.data)
# len
print(len(memory))
# append
memory.append(state, action, reward, next_state, done)
print(memory.data)
print(len(memory))

[None None None ... None None None]
0
[Transition(state=array([-0.01853433, -0.0036665 ,  0.00995653, -0.02980118]), action=1, reward=1.0, next_state=array([-0.01860766,  0.19131126,  0.00936051, -0.31932616]), done=False)
 None None ... None None None]
1


# Time testing

In [11]:
memory = ReplayBuffer1(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:03<00:00, 566818.83it/s]
100%|██████████| 10000/10000 [00:17<00:00, 576.75it/s]


Insertion of 2000000 samples: 1.7041785359979258
Sampling of 10000 batches: 16.558432882004126


In [12]:
memory = ReplayBuffer2(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1350695.30it/s]
100%|██████████| 10000/10000 [00:16<00:00, 604.26it/s]


Insertion of 2000000 samples: 0.7784661740006413
Sampling of 10000 batches: 15.611870838998584


In [13]:
memory = ReplayBuffer3(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1171149.37it/s]
100%|██████████| 10000/10000 [00:17<00:00, 585.70it/s]


Insertion of 2000000 samples: 1.0343449819993111
Sampling of 10000 batches: 16.888517145998776


In [14]:
memory = ReplayBuffer4(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:03<00:00, 592494.36it/s]
100%|██████████| 10000/10000 [00:17<00:00, 566.53it/s]


Insertion of 2000000 samples: 2.088889257000119
Sampling of 10000 batches: 16.621598482997797


In [15]:
memory = ReplayBuffer5(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:04<00:00, 405429.38it/s]
100%|██████████| 10000/10000 [06:41<00:00, 24.90it/s]


Insertion of 2000000 samples: 3.3488479379957425
Sampling of 10000 batches: 463.5867436450062


In [16]:
memory = ReplayBuffer6(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1171832.41it/s]
100%|██████████| 10000/10000 [00:22<00:00, 452.80it/s]


Insertion of 2000000 samples: 0.8611387350101722
Sampling of 10000 batches: 19.480439072998706


In [17]:
memory = ReplayBuffer7(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:07<00:00, 277969.59it/s]
100%|██████████| 10000/10000 [00:19<00:00, 502.06it/s]


Insertion of 2000000 samples: 2.130781361993286
Sampling of 10000 batches: 16.827209620998474


In [18]:
memory = ReplayBuffer8(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:01<00:00, 1386378.58it/s]
100%|██████████| 10000/10000 [00:20<00:00, 477.66it/s]


Insertion of 2000000 samples: 0.7717193870048504
Sampling of 10000 batches: 18.67609501199331


In [19]:
memory = ReplayBuffer9(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:02<00:00, 844337.09it/s]
100%|██████████| 10000/10000 [00:00<00:00, 18091.69it/s]


Insertion of 2000000 samples: 1.7328127619985025
Sampling of 10000 batches: 0.6603696280071745


In [20]:
memory = ReplayBuffer10(replay_buffer_size)
test_insertion_tqdm(memory, nb_samples)
test_sampling_tqdm(memory, nb_batches)
test_insertion_timeit(memory, nb_samples)
test_sampling_timeit(memory, nb_batches)

100%|██████████| 2000000/2000000 [00:02<00:00, 822408.11it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3437.90it/s]


Insertion of 2000000 samples: 1.8442452210001647
Sampling of 10000 batches: 2.8511063719925005
