In [None]:
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
import cntk
from cntk import *
from cntk.layers import *

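### Replay buffer implementation

The replay buffer is a fixed-capacity circular queue of `(s, a, r, s')` transitions: once it is full, each new transition overwrites the oldest one, and minibatches are drawn from it uniformly at random without replacement.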

In [None]:
class ReplayBuffer:
    """
    Ring buffer with a fixed number of slots, each holding one (s, a, r, s') transition.
    Once every slot has been written, each new transition overwrites the oldest one.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        self.samples = np.empty(capacity, dtype=object)
        self.counter = 0   # slot the next add() will write to
        self.flag = False  # becomes True once the write position has wrapped around

    def size(self):
        """
        Returns the number of transitions currently stored
        """
        return self.capacity if self.flag else self.counter

    def add(self, sample):
        """
        Stores a transition in the current slot and advances the write position,
        wrapping back to slot 0 after the last slot
        """
        self.samples[self.counter] = sample

        next_slot = self.counter + 1
        if next_slot >= self.capacity:
            self.counter = 0
            self.flag = True
        else:
            self.counter = next_slot

    def sample(self, n):
        """
        Draws up to n distinct stored transitions uniformly at random and
        returns them as an object array
        """
        stored = self.size()
        n = min(n, stored)

        # Only the first `stored` slots are populated; when the buffer is full
        # this slice is the whole array
        return np.random.choice(self.samples[:stored], n, replace=False)

In [None]:
class ErAgent:
    """
    Q-learning agent that approximates action values with a small feed-forward
    network and trains it on minibatches drawn from a replay buffer
    """
    # Exponential epsilon schedule: epsilon = max(exp(-EPSILON_DECAY * episode), EPSILON_MIN)
    EPSILON_DECAY = 3.74e-4
    EPSILON_MIN = 0.05

    def __init__(self, input_dim, output_dim, batch_size, gamma, buffer, learning_rate=0.0025):
        """
        param input_dim: Shape of a single state, used for the network input
        param output_dim: Number of actions, i.e. the number of network outputs
        param batch_size: Number of transitions requested from the buffer per replay step
        param gamma: The discount factor
        param buffer: Replay buffer providing add(sample) and sample(n)
        param learning_rate: Per-sample learning rate of the SGD learner
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.buffer = buffer
        self.epsilon = 1
        
        # Create the model
        self.input_var = input(input_dim, np.float32)
        self.output_var = input(output_dim, np.float32)

        self.model = Sequential([
            Dense(16, activation=relu, name='dense1'),
            Dense(16, activation=relu, name='dense2'),
            Dense(self.output_dim, name='z')
        ])(self.input_var)

        # Squared error between the predicted and the target action values
        loss = reduce_mean(square(self.model - self.output_var), axis=0)

        lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
        learner = sgd(self.model.parameters, lr_schedule)
        self.trainer = Trainer(self.model, loss, learner)
        
    def update_epsilon(self, episode):
        """
        Updates epsilon using exponential decay with the decay rate chosen such
        that epsilon is 0.05 by episode 8000; epsilon never drops below EPSILON_MIN
        """
        self.epsilon = max(math.exp(-self.EPSILON_DECAY * episode), self.EPSILON_MIN)
        
    def predict(self, s):
        """
        Feeds a state (or a batch of states) through the model (our network) and
        obtains the values of each action
        """
        return self.model.eval(s)
        
    def act(self, s):
        """
        Selects an action using the epsilon-greedy approach: with probability
        epsilon a uniformly random action, otherwise the highest-valued action
        """
        if random.random() > self.epsilon:
            # Exploit (greedy)
            return np.argmax(self.predict(s))
        else:
            # Explore (random action)
            return random.randint(0, self.output_dim - 1)
        
    def observe(self, sample):
        """
        Adds a transition to the replay buffer
        """
        self.buffer.add(sample)
        
    def replay(self):
        """
        Samples a random minibatch of transitions from the replay buffer,
        computes the expected return and uses them to perform a gradient descent step
        """
        batch = self.buffer.sample(self.batch_size)
        batch_len = batch.size
        if batch_len == 0:
            # Nothing stored yet, so there is nothing to train on
            return
        
        # Terminal transitions store None as s'; zeros keep the batch rectangular
        # and the corresponding prediction is masked out of the target below
        no_state = np.zeros(self.input_dim)
        states = np.array([obs[0] for obs in batch], dtype=np.float32)
        states_ = np.array([no_state if obs[3] is None else obs[3] for obs in batch], dtype=np.float32)
        
        p = self.predict(states)  # value of start state
        p_ = self.predict(states_)  # value of end state
        
        actions = [obs[1] for obs in batch]
        rewards = np.array([obs[2] for obs in batch], dtype=np.float32)
        # 1.0 where the episode continues, 0.0 where s' is terminal (target is just r)
        non_terminal = np.array([obs[3] is not None for obs in batch], dtype=np.float32)
        targets = rewards + non_terminal * self.gamma * np.amax(p_, axis=1)
        
        # Start from the current predictions so only the taken action's value
        # contributes to the error
        y = np.array(p, dtype=np.float32)
        y[range(batch_len), actions] = targets
        
        # Perform gradient descent
        self.trainer.train_minibatch({self.input_var: states, self.output_var: y})

In [None]:
def initialize_buffer(env, buffer):
    """
    Fills the replay buffer to capacity with transitions collected by a
    uniformly random policy
    """
    n_actions = env.action_space.n
    state = env.reset()

    while buffer.size() < buffer.capacity:
        action = random.randint(0, n_actions - 1)
        next_state, reward, done, _ = env.step(action)

        # A terminal transition is stored with None in place of the next state
        buffer.add((state, action, reward, None if done else next_state))

        # Start a fresh episode after a terminal step, otherwise continue from s'
        state = env.reset() if done else next_state

In [None]:
def train(env, episodes, gamma, buffer_capacity, minibatch_size):
    """
    Trains an ErAgent on the environment, performing one replay (gradient) step
    after every environment step, and prints the episode reward every 200 episodes

    param env: The gym environment to train with
    param episodes: The number of episodes to train for
    param gamma: The discount factor
    param buffer_capacity: The number of transitions the replay buffer can hold
    param minibatch_size: The number of transitions sampled per replay step
    returns: The trained model and the list of total rewards, one per episode
    """
    input_dim = env.observation_space.shape
    output_dim = env.action_space.n 
    
    # Create buffer and initialize using random transitions
    buffer = ReplayBuffer(buffer_capacity)
    initialize_buffer(env, buffer)
    
    # Create agent
    agent = ErAgent(input_dim, output_dim, minibatch_size, gamma, buffer)
    
    episode = 0
    rewards = 0
    episode_rewards = []
    s = env.reset()
    s = s.astype(np.float32)
    
    while episode < episodes:
        # Select action using policy derived from Q (e-greedy) 
        a = agent.act(s)

        # Take action and observe next state and reward
        s_, r, done, info = env.step(a)
        s_ = s_.astype(np.float32)

        # Store transition in replay buffer. A terminal transition is stored with
        # None as the next state (as initialize_buffer does) so that replay() uses
        # r alone as its target instead of bootstrapping from the value of s'.
        # NOTE(review): any step that reports done is treated as terminal,
        # including an environment-imposed step limit if there is one - confirm
        agent.observe((s, a, r, None if done else s_))
        
        # Sample random minibatch, compute y_i and perform gradient descent step
        agent.replay()

        s = s_
        rewards += r

        # Episode over, reset environment, update epsilon
        if done:                
            episode += 1
            agent.update_epsilon(episode)
            episode_rewards.append(rewards)

            if episode % 200 == 0:
                print('Episode {}, reward = {}'.format(episode, rewards))

            s = env.reset()
            s = s.astype(np.float32)
            rewards = 0
    return agent.model, episode_rewards

In [None]:
# Hyperparameters and environment for the training run
# gamma: discount factor, episodes: number of training episodes
gamma = 0.99
episodes = 10000
# NOTE(review): with a buffer capacity of 1 and a minibatch size of 1, every replay
# step trains on the single transition that was just stored, so no past experience
# is actually replayed - confirm these values are intended
buffer_capacity = 1
minibatch_size = 1
env = gym.make('CartPole-v0')

In [None]:
# Train the agent; yields the trained network and the total reward of every episode
model1, rewards1 = train(env, episodes, gamma, buffer_capacity, minibatch_size)

In [None]:
# Training curve smoothed with a 100-episode rolling mean
pd.Series(rewards1).rolling(window=100).mean().plot(label='with experience replay')
plt.xlabel('Episode')
plt.ylabel('Mean reward (100-episode window)')
plt.legend()  # without a legend the label passed to plot() is never shown
plt.show()

In [None]:
def evaluate(env, model, episodes):
    """
    Runs the greedy policy of the trained model for 'episodes' episodes and
    returns the mean total reward per episode
    """
    total_reward = 0
    remaining = episodes

    while remaining > 0:
        state = env.reset()
        finished = False
        while not finished:
            # Always take the action the model values most (no exploration)
            action_values = model.eval(state.astype(np.float32))
            action = np.argmax(action_values)
            state, reward, finished, _ = env.step(action)
            total_reward += reward
        remaining -= 1

    return total_reward / float(episodes)

In [None]:
# Mean total reward per episode of the trained model over 100 evaluation episodes
ave_er = evaluate(env, model1, 100)
print('Average reward (with experience replay) = {}'.format(ave_er))

In [None]:
# Unsmoothed total reward of every training episode
plt.plot(range(len(rewards1)), rewards1)
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.title('Reward per training episode')
plt.show()