In [0]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

import re


def _version_tuple(version):
    """Return the first two numeric components of a version string as ints.

    Version strings must not be compared as text: lexicographically
    "0.9" >= "0.20" is True and "10.0" >= "2.0" is False. Comparing integer
    tuples gives the intended ordering.
    """
    return tuple(int(part) for part in re.findall(r"\d+", version)[:2])


# Scikit-Learn ≥0.20 is required
import sklearn
assert _version_tuple(sklearn.__version__) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _version_tuple(tf.__version__) >= (2, 0)

# Queries the visible GPU devices; the result is not stored, and since this is
# not the last expression of the cell it is not displayed either.
tf.config.list_physical_devices('GPU')

# Common imports
import matplotlib.pyplot as plt
import numpy as np
import os
import gym
from Kfocusingtf2 import FocusedLayer1D
# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# Re-seed here so the cell produces the same environment and initial weights
# when it is re-run on its own.
tf.random.set_seed(42)
np.random.seed(42)
envName = "MountainCarContinuous-v0"
env = gym.make(envName)

env.seed(42)
obs = env.reset()
# 'focused' puts a FocusedLayer1D in the middle of the network; any other
# value uses a plain Dense layer of the same width.
mode = 'focused'

input_shape = env.observation_space.shape

# Number of evenly spaced actions used when the action space is continuous.
N_ACTION_BINS = 10

# isinstance (rather than an exact type comparison) also accepts subclasses.
is_discrete = isinstance(env.action_space, gym.spaces.discrete.Discrete)
if not is_discrete:
    shape = env.action_space.shape
    low = env.action_space.low
    high = env.action_space.high
    # Attach the bin count as `n` so the number of outputs can be read the
    # same way as for a discrete space below.
    env.action_space.n = N_ACTION_BINS
    # Lookup table: discrete action index -> continuous action value.
    actions = np.linspace(low, high, N_ACTION_BINS, dtype="float32")
    print('The environment is not discrete')
    print('The action space: ', actions.reshape(1,-1)[0])

n_outputs = env.action_space.n

if mode == 'focused':
  layer = FocusedLayer1D(32
                         ,name='focus-1'
                         ,activation='elu'
                         ,init_sigma=0.2)
                          # other init_sigma values tried: .25 .2 .1 .05
else:
  layer = keras.layers.Dense(32, activation="elu")

# Q-network: one output per (discretized) action.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation="elu", input_shape=input_shape),
    layer,
    keras.layers.Dense(n_outputs)
])

The environment is not discrete
The action space:  [-1.         -0.7777778  -0.5555556  -0.3333333  -0.1111111   0.11111116
  0.33333337  0.5555556   0.7777778   1.        ]
{'name': 'focus-1'}


In [0]:
def epsilon_greedy_policy(state, epsilon=0):
    """Pick an action index with an epsilon-greedy rule.

    Args:
        state: a single observation as a NumPy array (no batch axis).
        epsilon: probability of choosing a random action instead of the
            action with the highest predicted Q-value.

    Returns:
        An integer action index in ``[0, n_outputs)``.
    """
    if np.random.rand() < epsilon:
        # Explore uniformly over every available action. A hard-coded 2 here
        # would restrict exploration to the first two actions only.
        return np.random.randint(n_outputs)
    # Add a batch axis for the model, then take the best action of that row.
    Q_values = model.predict(state[np.newaxis])
    return np.argmax(Q_values[0])

In [0]:
from collections import deque

# Bounded experience buffer: once it holds REPLAY_CAPACITY items, appending a
# new one drops the oldest.
REPLAY_CAPACITY = 2000
replay_memory = deque([], REPLAY_CAPACITY)

In [0]:
def sample_experiences(batch_size):
    """Draw `batch_size` experiences (with replacement) from `replay_memory`.

    Returns a 5-tuple of NumPy arrays
    ``(states, actions, rewards, next_states, dones)``, each built from the
    corresponding field of the sampled experiences.
    """
    picks = np.random.randint(len(replay_memory), size=batch_size)
    sampled = [replay_memory[i] for i in picks]

    def column(field):
        # Stack one field of every sampled experience into a single array.
        return np.array([exp[field] for exp in sampled])

    return tuple(column(field) for field in range(5))

In [0]:
def play_one_step(env, state, epsilon):
    """Take one epsilon-greedy step in `env` and record it in `replay_memory`.

    The stored transition is
    ``(state, action, reward, next_state, done)`` with both states passed
    through `setAxis` and the other fields passed through `isList`.

    Returns the raw ``(next_state, reward, done, info)`` from `env.step`.
    """
    flat_state = setAxis(state)

    chosen = epsilon_greedy_policy(flat_state, epsilon)
    step_result = env.step(get_action(chosen))
    next_state, reward, done, info = step_result

    transition = (
        flat_state,
        isList(chosen),
        isList(reward),
        setAxis(next_state),
        isList(done),
    )
    replay_memory.append(transition)
    return next_state, reward, done, info

In [0]:
def get_action(action):
    """Translate an action index into what `env.step` is given.

    For a discrete environment the index is returned unchanged; otherwise the
    matching entry of the `actions` table is returned wrapped in a list.
    """
    return action if is_discrete else [actions[action]]

In [0]:
def setAxis(state):
    """Flatten `state` to one dimension unless the environment is discrete.

    Discrete environments get their state back untouched.
    """
    if is_discrete:
        return state
    # Collapse to a single row, then take that row.
    return state.reshape(1, -1)[0]

In [0]:
def isList(obj):
    """Return the first element of `obj` if it has one, otherwise `obj` itself.

    Used to unwrap one-element containers (e.g. a reward or action delivered
    as a length-1 sequence) while leaving plain scalars untouched.

    Only the errors that mean "this cannot be indexed at position 0" are
    absorbed: TypeError (not subscriptable) and LookupError (IndexError /
    KeyError: empty or missing). Anything else, including KeyboardInterrupt,
    propagates. A `return` inside `finally` would silently discard every
    exception.
    """
    try:
        return obj[0]
    except (TypeError, LookupError):
        return obj

In [0]:
  def saveModel():
      """Persist the model and the run history next to the notebook.

      Writes four files named after `envName` and `mode`:
      the Keras model (`.h5`), the per-episode scores, the replay buffer and
      the recorded losses (`.npy`).

      The replay buffer rows mix arrays with scalars, so the array is built
      with ``dtype=object``; without it recent NumPy versions refuse to create
      the ragged array. Reading that file back needs
      ``np.load(..., allow_pickle=True)``.
      """
      prefix = './{}-{}'.format(envName, mode)
      model.save(prefix + '.h5')
      np.save(prefix + '-scores.npy', np.array(rewards))
      np.save(prefix + '-replay.npy',
              np.array(list(replay_memory), dtype=object))
      np.save(prefix + '-loss.npy', np.array(lossList))


In [0]:
def showTrend(scores, name='',saveFig=False):
    """Plot per-episode scores and a linear trend line on the current figure.

    Args:
        scores: sequence of per-episode scores.
        name: label prefix for the legend entries; also used in the file name
            of the saved figure.
        saveFig: when True, save the figure as ``./<name>_trend.png`` and then
            call `saveModel()`.

    Scores are plotted as floats so fractional returns are not truncated.
    The trend line is fitted on every episode except the first one and is
    skipped when fewer than two such points exist (a degree-1 fit needs two
    points, and `np.polyfit` raises on empty input).
    """
    x = list(range(len(scores)))
    y = [float(score) for score in scores]

    plt.plot(x, y, label=name+" score per run")

    trend_x = x[1:]
    if len(trend_x) >= 2:
        z = np.polyfit(np.array(trend_x), np.array(y[1:]), 1)
        p = np.poly1d(z)
        plt.plot(trend_x, p(trend_x), linestyle="-.",  label=name+" trend")

    plt.xlabel("episodes")
    plt.ylabel("scores")
    plt.legend(loc="upper left")
    if saveFig:
        plt.savefig('./'+name+'_trend.png', bbox_inches="tight")
        saveModel()

In [0]:
# Number of experiences sampled from the replay buffer per training step.
batch_size = 32
# Discount applied to the best next-state Q-value when building targets.
discount_rate = 0.95
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
  """Run one DQN gradient step on a batch sampled from the replay buffer.

  Targets are ``reward + (1 - done) * discount_rate * max_a' Q(next_state, a')``
  computed with the current model; the loss is the mean squared error between
  those targets and the Q-value of the action that was actually taken.
  In 'focused' mode the trainable weights whose names contain 'Sigma' / 'Mu'
  are clipped back into range after the update.

  Args:
      batch_size: number of experiences to sample.

  Returns:
      The loss of this step as a Python float.
  """
  experiences = sample_experiences(batch_size)
  states, actions, rewards, next_states, dones = experiences
  next_Q_values = model.predict(next_states)
  max_next_Q_values = np.max(next_Q_values, axis=1)
  target_Q_values = rewards + (1 - dones) * discount_rate * max_next_Q_values
  # Q_values below has shape (batch, 1) because of keepdims=True. Give the
  # targets the same shape; a (batch,) target would broadcast against it into
  # a (batch, batch) matrix and pair every prediction with every target.
  target_Q_values = target_Q_values.reshape(-1, 1)
  # One-hot mask selects, per row, the Q-value of the action that was taken.
  mask = tf.one_hot(actions, n_outputs)
  with tf.GradientTape() as tape:
      all_Q_values = model(states)
      Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
      loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  if mode=='focused':
    clipcallBack('Sigma', (0.05,1.0))
    clipcallBack('Mu', (0.0,1.0))
  return float(loss)

In [0]:
def clipcallBack(varname, clips):
  """Clip every trainable weight of `model` whose name contains `varname`.

  Args:
      varname: substring looked for in each weight's name.
      clips: pair ``(lower, upper)``; matching weights are clipped into that
          range and written back in place.
  """
  for weight in model.trainable_weights:
      if varname not in weight.name:
          continue
      bounded = np.clip(weight.numpy(), clips[0], clips[1])
      weight.assign(bounded)
          

In [0]:
env.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Per-episode scores and per-training-step losses collected during the run.
rewards = []
lossList = []
# Start below any possible score so the first episode always records a set of
# best weights. With a positive starting threshold that no episode beats,
# `best_weights` would never be assigned and restoring it after training fails.
best_score = -np.inf

In [0]:
# Stays None until some episode beats `best_score`; checked before restoring.
best_weights = None

for episode in range(600):
    obs = env.reset()
    score = 0
    # Exploration rate depends only on the episode index: it decays linearly
    # from 1 and is floored at 0.01.
    epsilon = max(1 - episode / 500, 0.01)
    for step in range(200):
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        score += reward
        if done:
            break
    rewards.append(score)
    if score > best_score:
        best_weights = model.get_weights()
        best_score = score
    print("\rEpisode: {}, Steps: {}, eps: {:.3f}".format(episode+1, step+1, epsilon), end="")
    # Let the replay buffer fill up for the first episodes before training.
    if episode > 50:
        lossList.append(training_step(batch_size))

# Restore the best-scoring weights if any were recorded; otherwise keep the
# final weights instead of failing on an undefined name.
if best_weights is not None:
    model.set_weights(best_weights)

Episode: 275, Steps: 200, eps: 0.452

In [0]:
# Open the figure that showTrend draws on; saveFig=True also writes the PNG
# and triggers saveModel().
plt.figure(figsize=(20, 15))
run_label = "{} {}".format(envName, mode)
showTrend(scores=rewards, name=run_label, saveFig=True)