In [1]:
import sys
import gym
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

In [2]:
alpha = 0.001

model = Sequential()
model.add(Dense(64, input_dim=4, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.summary()
model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=alpha))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 4,610
Trainable params: 4,610
Non-trainable params: 0
_________________________________________________________________


  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [6]:
def get_action(model, state):
    policy = model.predict(state, batch_size=1).flatten()
    return np.random.choice(env.action_space.n, 1, p=policy)[0]

def discount_rewards(discount, rewards):
    discounted_rewards = np.zeros_like(rewards)
    count = 0
    for t in reversed(range(0, len(rewards))):
        count = count * discount + rewards[t]
        discounted_rewards[t] = count
    return discounted_rewards

def reshape_state(state):
    return np.reshape(state, [1,env.observation_space.shape[0]])

def train_model(state_list, action_list, rewards_list, discount, model):
    episode_length = len(state_list)
    discounted_rewards = discount_rewards(discount, rewards_list)
    update_inputs = np.zeros((episode_length, env.observation_space.shape[0]))
    target = np.zeros((episode_length, env.action_space.n))

    for i in range(episode_length):
        update_inputs[i] = state_list[i]
        target[i][action_list[i]] = discounted_rewards[i]

    model.fit(update_inputs, target, epochs=1, verbose=0)

In [11]:
env = gym.make('CartPole-v0')

EPISODES = 2000
discount = 0.99

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, env.observation_space.shape[0]])
    state_list, action_list, rewards_list = [], [], []
    step = 0

    while True:
        step += 1
        action = get_action(model, state)
        next_state, reward, done, info = env.step(action)
        next_state = reshape_state(next_state)

        state_list.append(state)
        rewards_list.append(reward)
        action_list.append(action)
        state = next_state

        if done:
            train_model(state_list, action_list, rewards_list, discount, model)
            if e%100 == 0:
              print('Episode step: ', step)
            break


Episode step:  9
Episode step:  10
Episode step:  9
Episode step:  9
Episode step:  9
Episode step:  10
Episode step:  8
Episode step:  10
Episode step:  10
Episode step:  8
Episode step:  11
Episode step:  8
Episode step:  9
Episode step:  11
Episode step:  10
Episode step:  10
Episode step:  8
Episode step:  9
Episode step:  9
Episode step:  9


In [10]:
def test_policy(model):
    avg_steps = []
    for iter in range(10):
      obs = env.reset()
      for i in range(50000):
        action = np.argmax(model.predict(reshape_state(obs))[0])
        obs, reward, done, info = env.step(action)
        if done:
          avg_steps.append(i+1)
          # print("Iterations that were run:", i+1)
          break
    avg = sum(avg_steps)/len(avg_steps)
    print("Average steps for policy:", avg)
    return avg

In [28]:
test_policy(model)

Average steps for policy: 9.0


9.0