In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn import svm
import math
import tensorflow as tf
%matplotlib inline

In [2]:
class RandomAgent:
    """Baseline agent whose actions come from ``env.action_space.sample()``.

    It performs no learning; ``getQValue`` and ``update`` are no-op stubs so
    the class exposes the same method names as a learning agent.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(observation, reward, done, info)``, and ``action_space.sample()``.
        num_experiments: Number of independent repetitions (rows of the result).
        num_episodes: Number of episodes per experiment (columns of the result).
        num_timesteps: Upper bound on the number of steps taken in one episode.
    """

    def __init__(self, env, num_experiments=10, num_episodes=100, num_timesteps=1000):
        self.env = env
        self.num_experiments = num_experiments
        self.num_episodes = num_episodes
        self.num_timesteps = num_timesteps

    def getAction(self, state):
        """Return an action sampled from the environment's action space.

        ``state`` is accepted for interface compatibility and is ignored.
        """
        return self.env.action_space.sample()

    def getQValue(self, state):
        """No-op: a random agent has no value estimates. Returns ``None``."""
        pass

    def update(self, state, action, reward, nextState):
        """No-op: a random agent does not learn. Returns ``None``."""
        pass

    def run(self):
        """Roll out episodes and record the undiscounted return of each one.

        Returns:
            ``numpy.ndarray`` of shape ``(num_experiments, num_episodes)`` whose
            entry ``[i, j]`` is the summed reward of episode ``j`` in
            experiment ``i``.
        """
        episodic_return = np.zeros((self.num_experiments, self.num_episodes))
        for i in range(self.num_experiments):
            for j in range(self.num_episodes):
                acc_reward = 0
                # Reset once per episode. Resetting inside the step loop would
                # restart the environment on every step, so no episode could
                # ever progress past its first transition.
                observation = self.env.reset()
                for t in range(self.num_timesteps):
                    action = self.getAction(observation)
                    observation, reward, done, info = self.env.step(action)
                    # Count the reward of the terminating step as well, so the
                    # return is computed the same way as in PGAgent.run.
                    acc_reward += reward
                    if done:
                        break
                episodic_return[i, j] = acc_reward
        return episodic_return

In [7]:

class PGAgent:
    """Q-learning agent with a replay buffer and a target network.

    Despite the class name, the code implements a value-based (DQN-style)
    learner: an online Q-network is trained on one-step TD targets computed
    from a separate target network, and actions are chosen epsilon-greedily.

    The environment is used through ``env.reset()`` (returns an observation),
    ``env.step(action)`` (returns ``(observation, reward, done, info)``) and
    ``env.observation_space.shape``.

    Args:
        env: Environment to interact with.
        num_experiments: Number of independent training runs performed by ``run``.
        num_episodes: Episodes per experiment.
        num_timesteps: Upper bound on steps per episode.
        replay_buffer_size: Maximum number of stored transitions.
        epsilon: Exploration rate used by ``getAction`` when none is passed.
        batch_size: Number of transitions sampled per gradient step.
        lr: Adam learning rate.
        gamma: Discount factor applied to the bootstrapped next-state value.
        target_update_freq: Number of environment steps between copies of the
            online weights into the target network.
        anneal_steps: Number of steps over which ``epsilon_anneal`` decays
            linearly from ``maximum`` to ``minimum``.
    """

    def __init__(self, env, num_experiments=10, num_episodes=100, num_timesteps=1000,
                 replay_buffer_size=10000, epsilon=0.05, batch_size=32, lr=0.00001,
                 gamma=0.99, target_update_freq=500, anneal_steps=10000):
        self.env = env
        self.num_experiments = num_experiments
        self.num_episodes = num_episodes
        self.num_timesteps = num_timesteps
        # Use the environment's action count when it exposes one; fall back to
        # the previously hard-coded value of 2 otherwise.
        self.num_actions = getattr(self.env.action_space, "n", 2)
        self.replay_buffer_size = replay_buffer_size
        self.observation_size = self.env.observation_space.shape[0]
        self.epsilon = epsilon
        self.replay_buffer = []
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.target_update_freq = max(1, int(target_update_freq))
        self.anneal_steps = anneal_steps

        self.state = tf.placeholder(tf.float32, [None, self.observation_size], name="state")
        self.next_state = tf.placeholder(tf.float32, [None, self.observation_size], name="next_state")
        self.reward = tf.placeholder(tf.float32, [None], name="reward")
        self.action = tf.placeholder(tf.int32, [None], name="action")
        # 1.0 where the transition ended the episode, 0.0 otherwise.
        self.done = tf.placeholder(tf.float32, [None], name="done")

        def network(input_data, scope):
            """Build a one-hidden-layer Q-network under variable scope ``scope``."""
            with tf.variable_scope(scope):
                # Without a non-linearity the two dense layers collapse into a
                # single linear map, so the hidden layer would add nothing.
                hidden = tf.layers.dense(input_data, 16, activation=tf.nn.relu,
                                         name="input_layer")
                out = tf.layers.dense(hidden, self.num_actions, name="h2")
            return out

        self.Q_network = network(self.state, "network")
        self.Q_target = network(self.next_state, "target")

        # Sort both lists by name so the zip below pairs matching variables.
        network_params = sorted(
            [v for v in tf.trainable_variables() if v.name.startswith("network/")],
            key=lambda v: v.name)
        target_params = sorted(
            [v for v in tf.trainable_variables() if v.name.startswith("target/")],
            key=lambda v: v.name)

        # Stored under a name distinct from the update_target() method: an
        # instance attribute called `update_target` would shadow the method
        # and make it uncallable.
        self.update_target_ops = [
            target_v.assign(network_v)
            for network_v, target_v in zip(network_params, target_params)
        ]

        # One-step TD target: r + gamma * max_a' Q_target(s', a'), with the
        # bootstrap term masked out on terminal transitions. The max is taken
        # per sample (axis=1), not over the whole batch.
        next_q = tf.reduce_max(self.Q_target, axis=1)
        td_target = tf.stop_gradient(
            self.reward + self.gamma * (1.0 - self.done) * next_q)
        # Select Q(s, a) of the action actually taken in each row.
        action_mask = tf.one_hot(self.action, self.num_actions)
        q_taken = tf.reduce_sum(self.Q_network * action_mask, axis=1)
        # 0.5 rather than 1/2: the latter is 0 under Python 2 integer division.
        self.loss = tf.reduce_mean(0.5 * tf.square(td_target - q_taken))

        # Only the online network is trained; the target network changes
        # solely through update_target().
        self.optim = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
            self.loss, var_list=network_params)

        self.session = tf.Session()
        # Created after the optimizer so it also covers Adam's slot variables;
        # kept so run() can re-initialise everything for each experiment.
        self._init_op = tf.global_variables_initializer()
        self.session.run(self._init_op)
        self.update_target()

    def getAction(self, state, epsilon=None):
        """Choose epsilon-greedy actions for a batch of states.

        Args:
            state: Batch of observations, fed to the ``state`` placeholder.
            epsilon: Probability of picking a random action for each row;
                defaults to ``self.epsilon``.

        Returns:
            List with one integer action per row of ``state``.
        """
        if epsilon is None:
            epsilon = self.epsilon
        Q = self.getQValues(state)
        action = []
        for q in Q:
            if random.random() < epsilon:
                action.append(random.randrange(self.num_actions))
            else:
                action.append(int(np.argmax(q)))
        return action

    def getQValues(self, state):
        """Evaluate the online Q-network on a batch of states."""
        return self.session.run(self.Q_network, feed_dict={self.state: state})

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.session.run(self.update_target_ops)

    def update(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        Does nothing while the replay buffer is empty.
        """
        if not self.replay_buffer:
            return
        state, action, reward, next_state, done = self.sample_from_buffer(
            size=self.batch_size, with_done=True)
        self.session.run(self.optim, feed_dict={
            self.state: state,
            self.action: action,
            self.reward: reward,
            self.next_state: next_state,
            self.done: done,
        })

    def add_replay_buffer(self, x):
        """Append transition ``x`` and drop the oldest entries beyond capacity.

        ``x`` is ``(state, action, reward, next_state)`` optionally followed by
        a ``done`` flag.
        """
        self.replay_buffer.append(x)
        overflow = len(self.replay_buffer) - self.replay_buffer_size
        if overflow > 0:
            # Trim in place instead of copying the whole list on every insert.
            del self.replay_buffer[:overflow]

    def sample_from_buffer(self, size=1, with_done=False):
        """Sample ``size`` stored transitions uniformly, with replacement.

        Args:
            size: Number of transitions to draw.
            with_done: When true, also return the terminal flags.

        Returns:
            ``(state, action, reward, next_state)`` as numpy arrays (float32,
            int32, float32, float32), followed by a float32 ``done`` array when
            ``with_done`` is true. Transitions stored without a done flag are
            reported as non-terminal.

        Raises:
            ValueError: If the replay buffer is empty.
        """
        buffer_length = len(self.replay_buffer)
        if buffer_length == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        sample = np.random.choice(buffer_length, size)
        state, action, reward, next_state, done = [], [], [], [], []
        for i in sample:
            transition = self.replay_buffer[i]
            s, a, r, n_s = transition[:4]
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(n_s)
            done.append(transition[4] if len(transition) > 4 else False)

        batch = (
            np.array(state, dtype=np.float32),
            np.array(action, dtype=np.int32),
            np.array(reward, dtype=np.float32),
            np.array(next_state, dtype=np.float32),
        )
        if with_done:
            batch += (np.array(done, dtype=np.float32),)
        return batch

    def step(self, action):
        """Apply each action in ``action`` to the environment, in order.

        All actions are applied to the same environment one after another and
        stepping continues even after a ``done`` flag is returned.

        Returns:
            Three lists ``(nextObservation, reward, done)`` with one entry per
            action.
        """
        nextObservation = []
        reward = []
        done = []
        for a in action:
            n_o, r, d, _ = self.env.step(a)
            done.append(d)
            nextObservation.append(n_o)
            reward.append(r)
        return nextObservation, reward, done

    def epsilon_anneal(self, iteration, maximum=1., minimum=0.02):
        """Linearly decay epsilon from ``maximum`` to ``minimum``.

        The decay spans ``self.anneal_steps`` iterations; from then on
        ``minimum`` is returned.
        """
        if self.anneal_steps <= 0 or iteration >= self.anneal_steps:
            return minimum
        # float() guards against integer division under Python 2.
        progress = float(iteration) / self.anneal_steps
        return minimum + (maximum - minimum) * (1.0 - progress)

    def run(self):
        """Train the agent and record the undiscounted return of every episode.

        Each experiment starts from freshly initialised weights, an empty
        replay buffer and a restarted exploration schedule, so the rows of the
        result are independent runs.

        Returns:
            ``numpy.ndarray`` of shape ``(num_experiments, num_episodes)``.
        """
        episodic_return = np.zeros((self.num_experiments, self.num_episodes))
        for i in range(self.num_experiments):
            self.session.run(self._init_op)
            self.update_target()
            self.replay_buffer = []
            total_steps = 0
            for j in range(self.num_episodes):
                acc_reward = 0
                observation = self.env.reset()
                for t in range(self.num_timesteps):
                    epsilon = self.epsilon_anneal(total_steps)
                    action = self.getAction(np.array([observation]), epsilon)[0]
                    nextObservation, reward, done, info = self.env.step(action)
                    # NOTE(review): `done` is stored as a terminal flag; if the
                    # env also sets it on a time-limit cut-off, those
                    # transitions are treated as terminal too.
                    self.add_replay_buffer(
                        (observation, action, reward, nextObservation, done))
                    acc_reward += reward
                    self.update()
                    total_steps += 1
                    if total_steps % self.target_update_freq == 0:
                        self.update_target()
                    if done:
                        break
                    observation = nextObservation
                episodic_return[i, j] = acc_reward
        return episodic_return
    

In [None]:
# Train the Q-learning agent (class PGAgent) on CartPole.
# The default graph is reset first so that re-running this cell does not
# collide with variables created by a previous PGAgent instance.
tf.reset_default_graph()
env = gym.make("CartPole-v0")
pg_agent = PGAgent(env, num_episodes=10000, num_experiments=1, lr=0.001, batch_size=32)
pg_return = pg_agent.run()

# Random baseline for comparison. The names `env`, `ragent` and `eps_return`
# end up bound to the random-agent run.
env = gym.make("CartPole-v0")
ragent = RandomAgent(env)
eps_return = ragent.run()

# Plot both learning curves, averaged over experiments, on one labelled figure
# so the baseline result is actually used.
fig, ax = plt.subplots()
ax.plot(np.mean(pg_return, axis=0), label="PGAgent (Q-learning)")
ax.plot(np.mean(eps_return, axis=0), label="RandomAgent")
ax.set_xlabel("episode")
ax.set_ylabel("return (mean over experiments)")
ax.set_title("CartPole-v0 episodic return")
ax.legend();


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
