In [10]:
import gym
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [15]:
#Create a model
def create_model(input_dim,output_dim=1,baseline=False):
    """Build the small fully connected network used as a policy or as a baseline.

    The network has two hidden ReLU layers (10 and 4 units) followed by one
    output layer whose activation depends on the role of the model.

    Args:
        input_dim: Shape of a single observation; it is passed unchanged to
            ``keras.layers.Input``.
        output_dim: Number of units in the output layer.
        baseline: When true, build a baseline network with an unbounded
            linear output instead of a policy head.

    Returns:
        An uncompiled ``keras.models.Model``.
    """
    input1 = keras.layers.Input(input_dim)
    hidden1 = keras.layers.Dense(10,activation='relu',name="layer1")(input1)
    hidden2 = keras.layers.Dense(4,activation='relu',name="layer2")(hidden1)

    if baseline:
        # The training signal is derived from standardized (zero-mean) returns,
        # so negative values are expected. A relu head cannot produce them and
        # stops receiving gradient once its pre-activation is negative.
        output_activation = 'linear'
    elif output_dim == 1:
        output_activation = 'tanh'  # For pendulum: one bounded output
    else:
        output_activation = 'softmax'  # one probability per discrete action

    output1 = keras.layers.Dense(output_dim,activation=output_activation,name="output")(hidden2)
    return keras.models.Model(inputs=input1, outputs=output1)
   

def build_train_fn(model,output_dim=1,baseline=False):
    """Create a backend function that applies one Adam update to ``model``.

    The loss is ``mean(-score * weight)`` where ``weight`` is the per-step
    value fed through the ``discount_reward`` placeholder and ``score`` is:

    * the log-probability of the taken action when ``output_dim > 1``,
    * the log of the single model output when ``output_dim == 1``,
    * the raw single model output when ``baseline`` is true.

    Args:
        model: Keras model whose trainable weights are updated.
        output_dim: Width of the model output.
        baseline: When true, skip the logarithm (baseline network).

    Returns:
        A ``keras.backend.function`` with no outputs. It expects
        ``[states, action_onehot, discount_reward]`` when ``output_dim > 1``
        and ``[states, discount_reward]`` otherwise.
    """
    model_output = model.output
    discount_reward_placeholder = keras.backend.placeholder(shape=(None,),name="discount_reward")
    fn_inputs = [model.input]

    if output_dim > 1:
        action_onehot_placeholder = keras.backend.placeholder(shape=(None, output_dim),name="action_onehot")
        # Pick the probability of the action that was actually taken.
        action_prob = keras.backend.sum(model_output * action_onehot_placeholder, axis=1)
        fn_inputs.append(action_onehot_placeholder)
    else:
        # The model output is (batch, 1) while the weights are (batch,).
        # Without dropping the trailing axis the product broadcasts to
        # (batch, batch) and the loss degenerates to mean(output) * mean(weight).
        action_prob = keras.backend.squeeze(model_output, axis=-1)
    fn_inputs.append(discount_reward_placeholder)

    if not baseline:
        # Clamp before the log so a zero (or, for the tanh head, non-positive)
        # output cannot turn the loss and then the weights into NaN/inf.
        # NOTE(review): for output_dim == 1 the tanh output is not a
        # probability; a stochastic policy head is needed for a proper
        # log-likelihood gradient - confirm intended design.
        action_prob = keras.backend.clip(action_prob, keras.backend.epsilon(), 1.0)
        action_prob = keras.backend.log(action_prob)

    loss = keras.backend.mean(-action_prob * discount_reward_placeholder)

    adam = keras.optimizers.Adam()
    updates = adam.get_updates(params=model.trainable_weights,loss=loss)

    return keras.backend.function(inputs=fn_inputs,
                                  outputs=[],
                                  updates=updates)

In [12]:

def train(states,actions,rewards,model_train,baseline,baseline_model_train = None):
    """Run one policy update (and optionally one baseline update) for an episode.

    Args:
        states: Array of the observations visited in the episode.
        actions: Array of the actions taken, one per step.
        rewards: Per-step rewards of the episode.
        model_train: Training function for the policy, as returned by
            ``build_train_fn``.
        baseline: Per-step baseline predictions (zeros when no baseline model
            is used).
        baseline_model_train: Optional training function for the baseline
            network; skipped when ``None``.
    """
    # One value per step, even if rewards/predictions arrive as (T, 1) columns.
    returns = np.asarray(discounted_reward(rewards), dtype=float).reshape(-1)
    baseline_values = np.asarray(baseline, dtype=float).reshape(-1)

    # Subtract the baseline predictions themselves. Running them through
    # discounted_reward() would cumulatively sum and standardize them, which
    # no longer represents the value predicted for each state.
    advantage = returns - baseline_values

    if output_dim > 1:
        action_onehot = keras.utils.to_categorical(actions, num_classes=output_dim)
        model_train([states, action_onehot, advantage])
    else:
        model_train([states, advantage])

    if baseline_model_train is not None:
        # With loss = -prediction * (return - prediction) the gradient moves
        # the prediction toward the return.
        baseline_model_train([states, advantage])
    

def discounted_reward(rewards, discount_factor=None):
    """Return the standardized discounted return-to-go for every step.

    For step ``t`` the raw value is ``sum_k discount_factor**(k - t) * rewards[k]``
    over ``k >= t``. The resulting vector is then shifted to zero mean and
    scaled by its standard deviation (plus machine epsilon to avoid division
    by zero).

    Args:
        rewards: Per-step rewards; each entry may be a scalar or a
            single-element array. The input is flattened to one value per step.
        discount_factor: Discount in use. When ``None`` the notebook-level
            ``gamma`` is read.

    Returns:
        A 1-D float ``numpy`` array with one entry per step (empty for empty
        input).
    """
    if discount_factor is None:
        discount_factor = gamma

    step_rewards = np.asarray(rewards, dtype=float).reshape(-1)
    returns = np.zeros_like(step_rewards)

    # Accumulate backwards with a plain float so no array is shared between
    # entries (in-place "+=" on an array element would alias every slot).
    running = 0.0
    for t in range(step_rewards.size - 1, -1, -1):
        running = step_rewards[t] + discount_factor * running
        returns[t] = running

    if returns.size == 0:
        # mean()/std() of an empty array would only produce NaN warnings.
        return returns

    return (returns - returns.mean()) / (returns.std() + np.finfo(float).eps)

def get_baseline(state,baseline_model=None):
    """Return the baseline prediction for ``state``.

    Args:
        state: Observation array. A 1-D observation gets a leading batch axis
            before it is passed to the model.
        baseline_model: Object with a ``predict`` method, or ``None``.

    Returns:
        ``0`` when no baseline model is given, otherwise the model prediction
        with all length-1 axes removed.
    """
    # Identity check: "== None" would defer to the object's own __eq__.
    if baseline_model is None:
        return 0

    if len(state.shape) == 1:
        state = np.expand_dims(state, axis=0)
    return np.squeeze(baseline_model.predict(state))
    
def get_action(state,model,test=0):
    """Choose an action for ``state`` from the policy ``model``.

    Whether the policy is discrete is decided from the width of the model's
    own prediction rather than from notebook-level variables, so the result
    does not depend on which configuration cell was executed last.

    Args:
        state: Observation array. A 1-D observation gets a leading batch axis.
        model: Object with a ``predict`` method.
        test: When ``1`` and the policy is discrete, return the most probable
            action instead of sampling.

    Returns:
        For a prediction wider than one unit: an action index, sampled from
        the predicted distribution (or its argmax when ``test == 1``).
        Otherwise the raw model prediction, regardless of ``test``.
    """
    if len(state.shape) == 1:
        state = np.expand_dims(state, axis=0)

    prediction = model.predict(state)

    if prediction.shape[-1] > 1:
        action_dist = np.squeeze(prediction)
        if test == 1:
            return np.argmax(action_dist)
        return np.random.choice(np.arange(action_dist.size),p=action_dist)

    # Single output unit: the prediction itself is used as the action.
    return prediction

def reinforce(env,model,model_train,baseline_model=None,baseline_model_train=None):
    """Collect ``episodes`` full episodes and call ``train`` after each one.

    Every step records the squeezed observation, the chosen action, the
    reward and the baseline prediction. Every 100th episode the summed reward
    is printed.

    Args:
        env: Environment providing ``reset()`` and a 4-tuple ``step(action)``.
        model: Policy model passed to ``get_action``.
        model_train: Policy training function passed to ``train``.
        baseline_model: Optional model passed to ``get_baseline``.
        baseline_model_train: Optional baseline training function passed to
            ``train``.
    """
    for episode in range(1,episodes+1):
        ep_states, ep_actions, ep_rewards, ep_baseline = [], [], [], []
        state = env.reset()
        done = False

        while not done:
            state = np.squeeze(state) # added for pendulum
            action = get_action(state,model)
            observation,reward,done,_ = env.step(action)

            ep_states.append(state)
            ep_actions.append(action)
            ep_rewards.append(reward)
            ep_baseline.append(get_baseline(state,baseline_model))

            state = observation

        # Episode finished: turn the trajectory into arrays and learn from it.
        states = np.array(ep_states)
        actions = np.array(ep_actions)
        rewards = np.array(ep_rewards)
        baseline = np.array(ep_baseline)

        if episode % 100 == 0:
            print("Episode: ",episode, "Reward: ",rewards.sum())

        train(states,actions,rewards,model_train,baseline,baseline_model_train)
            
        
        
def test(env,model):
    """Render ``test_episodes`` episodes with ``get_action(..., 1)`` and print each total reward.

    Args:
        env: Environment providing ``reset()``, ``render()`` and a 4-tuple
            ``step(action)``.
        model: Policy model passed to ``get_action``.
    """
    for episode in range(1,test_episodes+1):
        rewards = []
        state = env.reset()
        done = False

        while not done:
            env.render()
            # Same observation handling as reinforce(), so environments that
            # hand back column-shaped observations work here as well.
            state = np.squeeze(state)
            action = get_action(state,model,1)
            state,reward,done,_ = env.step(action)
            rewards.append(reward)

        print("Episode: ",episode, "Reward: ",np.array(rewards).sum())

# Vanilla Policy Gradient

## Cartpole

In [4]:
# Environment and the network sizes derived from it.
cartpole_env = gym.make('CartPole-v0')
input_dim = cartpole_env.observation_space.shape
total_actions = cartpole_env.action_space.n
output_dim = total_actions

# Run settings; the training/testing functions read these as globals.
episodes, test_episodes = 2000, 10
gamma = 0.99

  result = entry_point.load(False)


In [5]:
# Build the CartPole policy and its training function, then train it
# without a baseline (reinforce's baseline arguments keep their None defaults).
model = create_model(input_dim,output_dim)
model_train = build_train_fn(model,output_dim)  # backend function that applies one Adam update per call
print("Training")
reinforce(cartpole_env,model,model_train)

Training
Episode:  100 Reward:  10.0
Episode:  200 Reward:  13.0
Episode:  300 Reward:  25.0
Episode:  400 Reward:  13.0
Episode:  500 Reward:  16.0
Episode:  600 Reward:  11.0
Episode:  700 Reward:  55.0
Episode:  800 Reward:  30.0
Episode:  900 Reward:  20.0
Episode:  1000 Reward:  14.0
Episode:  1100 Reward:  63.0
Episode:  1200 Reward:  79.0
Episode:  1300 Reward:  79.0
Episode:  1400 Reward:  46.0
Episode:  1500 Reward:  148.0
Episode:  1600 Reward:  41.0
Episode:  1700 Reward:  200.0
Episode:  1800 Reward:  200.0
Episode:  1900 Reward:  200.0
Episode:  2000 Reward:  200.0


In [6]:
# Evaluate the trained CartPole policy; test() calls get_action with test=1.
print("Testing")
test(cartpole_env,model)

Testing
Episode:  1 Reward:  200.0
Episode:  2 Reward:  200.0
Episode:  3 Reward:  200.0
Episode:  4 Reward:  200.0
Episode:  5 Reward:  200.0
Episode:  6 Reward:  200.0
Episode:  7 Reward:  200.0
Episode:  8 Reward:  200.0
Episode:  9 Reward:  200.0
Episode:  10 Reward:  200.0


## Pendulum

In [31]:
# Environment and the network sizes used for it: a single output unit.
pendulum_env = gym.make('Pendulum-v0')
input_dim = pendulum_env.observation_space.shape
total_actions = 1
output_dim = total_actions

# Run settings; the training/testing functions read these as globals.
episodes, test_episodes = 2000, 10
gamma = 0.99

In [32]:
# Build the Pendulum policy from the current input_dim/output_dim and train it
# without a baseline.
pend_model = create_model(input_dim,output_dim)
# No output_dim is passed here, so build_train_fn falls back to its default of 1.
pend_model_train = build_train_fn(pend_model)  #customized fit function
print("Training")
reinforce(pendulum_env,pend_model,pend_model_train)

Training
Episode:  100 Reward:  -1366.2188
Episode:  200 Reward:  -1251.4907
Episode:  300 Reward:  -1371.6742
Episode:  400 Reward:  -1214.8118
Episode:  500 Reward:  -1256.9417
Episode:  600 Reward:  -1324.7427
Episode:  700 Reward:  -1425.9497
Episode:  800 Reward:  -1372.248
Episode:  900 Reward:  -1225.1758
Episode:  1000 Reward:  -1797.7625
Episode:  1100 Reward:  -1827.6471
Episode:  1200 Reward:  -1253.9158
Episode:  1300 Reward:  -1679.7559
Episode:  1400 Reward:  -1345.5474
Episode:  1500 Reward:  -1168.6304
Episode:  1600 Reward:  -1326.5498
Episode:  1700 Reward:  -1270.9841
Episode:  1800 Reward:  -1555.3312
Episode:  1900 Reward:  -1770.3217
Episode:  2000 Reward:  -978.4465


# Policy Gradient With Baseline

## Cartpole

In [13]:
# Environment and the network sizes derived from it.
cartpole_env = gym.make('CartPole-v0')
input_dim = cartpole_env.observation_space.shape
total_actions = cartpole_env.action_space.n
output_dim = total_actions

# Run settings; the training/testing functions read these as globals.
episodes, test_episodes = 2000, 10
gamma = 0.99

# NOTE(review): mlr/blr are not referenced by the functions shown in this
# notebook (build_train_fn creates Adam with default arguments) - confirm
# whether they are meant to be the policy/baseline learning rates.
mlr = 0.001
blr = 0.001

In [16]:
# Policy network plus a separate single-output baseline network.
model = create_model(input_dim,output_dim)
baseline_model = create_model(input_dim,baseline=True)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
baseline_model.summary()

model_train = build_train_fn(model,output_dim)  # update function for the policy
baseline_model_train = build_train_fn(baseline_model,baseline=True)  # update function for the baseline
print("Training")
reinforce(cartpole_env,model,model_train,baseline_model,baseline_model_train)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
layer1 (Dense)               (None, 10)                50        
_________________________________________________________________
layer2 (Dense)               (None, 4)                 44        
_________________________________________________________________
output (Dense)               (None, 2)                 10        
Total params: 104
Trainable params: 104
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 4)                 0         
_________________________________________________________________
layer1 

In [17]:
# Evaluate the current `model` on CartPole; test() calls get_action with test=1.
print("Testing")
test(cartpole_env,model)

Testing
Episode:  1 Reward:  200.0
Episode:  2 Reward:  200.0
Episode:  3 Reward:  200.0
Episode:  4 Reward:  200.0
Episode:  5 Reward:  200.0
Episode:  6 Reward:  200.0
Episode:  7 Reward:  200.0
Episode:  8 Reward:  200.0
Episode:  9 Reward:  200.0
Episode:  10 Reward:  200.0
