In [1]:
import gym
import numpy as np
from tqdm import tqdm

In [2]:
#CREATE THE FROZENLAKE ENVIRONMENT USED BY EVERY CELL BELOW.
#NOTE(review): is_slippery=False PRESUMABLY MAKES MOVES DETERMINISTIC - CONFIRM AGAINST THE GYM DOCS.
env = gym.make('FrozenLake-v0', is_slippery=False)

In [3]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [4]:
#THE NETWORK RECEIVES ONE SCALAR PER OBSERVATION
obs_shape = 1
#ONE OUTPUT (Q-VALUE) PER ACTION OF THE ENVIRONMENT
n_actions = env.action_space.n
#Q-NETWORK ASSEMBLED LAYER BY LAYER: 32 -> 64 -> 64 RELU UNITS, THEN A LINEAR OUTPUT LAYER
model = keras.models.Sequential()
model.add(keras.layers.Dense(32, activation='relu', input_shape=[obs_shape]))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(n_actions))

In [5]:
def epsilon_greedy(state, epsilon):
    """Choose an action epsilon-greedily using the notebook-global `model`.

    Args:
        state: scalar observation, already rescaled by the caller.
        epsilon: probability of returning a uniformly random action.

    Returns:
        An integer action index in ``[0, n_actions)``.
    """
    if np.random.binomial(1, epsilon):
        #EXPLORE: UNIFORMLY RANDOM ACTION
        return np.random.randint(n_actions)
    #EXPLOIT: FEED AN EXPLICIT (1, 1) BATCH AND CALL THE MODEL DIRECTLY.
    #A DIRECT CALL AVOIDS THE PER-CALL OVERHEAD OF model.predict FOR A SINGLE SAMPLE
    #AND DOES NOT RELY ON KERAS IMPLICITLY RESHAPING A BARE PYTHON LIST.
    q_values = model(np.array([[state]], dtype=np.float32))
    return np.argmax(q_values.numpy())

In [None]:
n_episodes = 10000
replay_buffer = []
batch_size = 32
gamma = 0.98
optimizer = keras.optimizers.Adam(learning_rate=0.001)
loss_fn = keras.losses.mean_squared_error

#ONE ENTRY PER FINISHED EPISODE: 1 IF IT ENDED ON CELL 15 (THE GOAL), 0 OTHERWISE
goal_reached = []

for episode in tqdm(range(n_episodes)):
    env.reset()
    done = False
    #THE START CELL IS ASSUMED TO BE 0; CELL INDICES ARE RESCALED AS (s-7.5)/4.61 BEFORE REACHING THE NETWORK
    state = (0-7.5)/4.61
    #I CHOOSE A LINEAR DECAY OF EPSILON. IT ONLY DEPENDS ON THE EPISODE, SO IT IS COMPUTED ONCE PER EPISODE.
    #NOTE(review): WITH -0.09 EPSILON ONLY GOES FROM 1.0 TO 0.91, SO PLAY STAYS ALMOST FULLY RANDOM - CONFIRM THIS IS INTENDED (A LATER CELL USES -0.9).
    epsilon = (-0.09/(n_episodes-1))*episode + 1
    while not done:
        #PLAY THE GAME USING EPSILON-GREEDY WITH CURRENT POLICY UNTIL IT'S DONE
        action = epsilon_greedy(state, epsilon)
        next_state, reward, done, info = env.step(action)
        #WE RECORD IF THE EPISODE ENDED ON THE GOAL CASE OR NOT
        if done:
            if next_state==15:
                goal_reached.append(1)
            else:
                goal_reached.append(0)
        #WE STORE THAT DATA INTO A BUFFER TO REPLAY FROM LATER AND TRAIN OUR MODEL
        #IN THE BUFFER WE STORE THE TRAJECTORIES
        replay_buffer.append((state, action, reward, (next_state-7.5)/4.61, done))
        #THE NEXT_STATE BECOMES THE STATE FROM WHICH WE MOVE
        state = (next_state-7.5)/4.61

    #AFTER PLAYING FOR A WHILE WE HAVE ENOUGH TRAJECTORIES WE CAN START TRAINING
    if episode>32:
        #WE SAMPLE batch_size TRAJECTORIES (WITH REPLACEMENT)
        indices = np.random.randint(len(replay_buffer), size=batch_size)
        batch = [replay_buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([trajectory[i] for trajectory in batch]) for i in range(5)]
        #WE HAVE TO COMPUTE THE TARGET VALUES TO WHICH WE TRY TO APPROACH
        #THE NETWORK EXPECTS (batch, 1) INPUTS, SO THE FEATURE AXIS IS ADDED EXPLICITLY
        next_Q_values = model.predict(next_states[:,np.newaxis], verbose=0)
        target_Q_values = rewards + (1-dones)*gamma*np.max(next_Q_values, axis=1)
        #RESHAPE TO (batch, 1) SO IT LINES UP WITH Q_values BELOW; A (batch,) TARGET WOULD
        #BROADCAST AGAINST (batch, 1) INTO A (batch, batch) ERROR MATRIX AND MIX UP THE SAMPLES
        target_Q_values = target_Q_values.reshape(-1, 1)
        #TIME TO COMPUTE THE GRADIENTS
        #ONE-HOT MASK THAT KEEPS ONLY THE Q-VALUE OF THE ACTION ACTUALLY TAKEN
        mask = tf.one_hot(actions, n_actions)
        with tf.GradientTape() as tape:
            #WE COMPUTE THE ACTUAL Q_VALUES
            Q_values = model(states[:,np.newaxis])
            Q_values = tf.reduce_sum(Q_values * mask, axis=1, keepdims=True)
            #WE COMPUTE THE LOSS BETWEEN OUR ACTUAL Q_VALUES AND THE TARGET VALUES
            loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
        #WE COMPUTE THE GRADIENTS OF THE LOSS WITH RESPECT TO THE MODEL'S VARIABLES
        grads = tape.gradient(loss, model.trainable_variables)
        #WE APPLY THE GRADIENTS TO THE MODEL'S VARIABLES USING THE OPTIMIZER
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

 91%|█████████ | 9079/10000 [07:51<00:59, 15.53it/s]

In [None]:
#PLOT, PER FINISHED EPISODE, WHETHER THE GOAL WAS REACHED (1) OR NOT (0).
#THE X AXIS USES THE NUMBER OF RECORDED EPISODES INSTEAD OF n_episodes SO THE PLOT
#STILL WORKS WHEN THE TRAINING CELL WAS STOPPED BEFORE COMPLETING ALL EPISODES.
plt.plot(np.arange(len(goal_reached)), goal_reached)

In [None]:
#PLAY ONE EPISODE GREEDILY WITH THE TRAINED model, RENDERING EVERY STEP
env.reset()
print("--------- Initial State ---------")
env.render()
done = False
#THE START CELL IS ASSUMED TO BE 0
state = 0
while not done:
    print("--------- State "+str(state)+" ---------")
    #SAME (s-7.5)/4.61 RESCALING AS IN TRAINING, PASSED AS AN EXPLICIT (1, 1) BATCH;
    #verbose=0 KEEPS KERAS FROM PRINTING A PROGRESS BAR BETWEEN THE RENDERED FRAMES
    action = np.argmax(model.predict(np.array([[(state-7.5)/4.61]]), verbose=0))
    next_state, reward, done, info = env.step(action)
    env.render()
    state = next_state

In [None]:
#TRY FIXED Q-VALUE TARGETS: USE A SEPARATE TARGET NETWORK THAT IS ONLY SYNCED WITH THE MODEL PERIODICALLY

In [None]:
#SMALLER Q-NETWORK FOR THE TARGET-NETWORK EXPERIMENT: TWO 64-UNIT RELU LAYERS AND A LINEAR OUTPUT LAYER
model = keras.models.Sequential()
model.add(keras.layers.Dense(64, activation='relu', input_shape=[obs_shape]))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(n_actions))

In [None]:
#BUILD THE TARGET NETWORK AS A CLONE OF THE ONLINE MODEL'S ARCHITECTURE
target = keras.models.clone_model(model)
#COPY THE ONLINE MODEL'S CURRENT WEIGHTS INTO THE CLONE SO BOTH NETWORKS START OUT IDENTICAL
target.set_weights(model.get_weights())

In [None]:
n_episodes = 10000
replay_buffer = []
batch_size = 32
gamma = 0.98
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.mean_squared_error

#ONE ENTRY PER FINISHED EPISODE: 1 IF IT ENDED ON CELL 15 (THE GOAL), 0 OTHERWISE
goal_reached = []

for episode in tqdm(range(n_episodes)):
    env.reset()
    done = False
    #THE START CELL IS ASSUMED TO BE 0; CELL INDICES ARE RESCALED AS (s-7.5)/4.61 BEFORE REACHING THE NETWORK
    state = (0-7.5)/4.61
    #I CHOOSE A LINEAR DECAY OF EPSILON. IT ONLY DEPENDS ON THE EPISODE, SO IT IS COMPUTED ONCE PER EPISODE.
    #NOTE(review): WITH -0.09 EPSILON ONLY GOES FROM 1.0 TO 0.91, SO PLAY STAYS ALMOST FULLY RANDOM - CONFIRM THIS IS INTENDED (A LATER CELL USES -0.9).
    epsilon = (-0.09/(n_episodes-1))*episode + 1
    while not done:
        #PLAY THE GAME USING EPSILON-GREEDY WITH CURRENT POLICY UNTIL IT'S DONE
        action = epsilon_greedy(state, epsilon)
        next_state, reward, done, info = env.step(action)
        #WE RECORD IF THE EPISODE ENDED ON THE GOAL CASE OR NOT
        if done:
            if next_state==15:
                goal_reached.append(1)
            else:
                goal_reached.append(0)
        #WE STORE THAT DATA INTO A BUFFER TO REPLAY FROM LATER AND TRAIN OUR MODEL
        #IN THE BUFFER WE STORE THE TRAJECTORIES
        replay_buffer.append((state, action, reward, (next_state-7.5)/4.61, done))
        #THE NEXT_STATE BECOMES THE STATE FROM WHICH WE MOVE
        state = (next_state-7.5)/4.61

    #AFTER PLAYING FOR A WHILE WE HAVE ENOUGH TRAJECTORIES WE CAN START TRAINING
    if episode>32:
        #WE SAMPLE batch_size TRAJECTORIES (WITH REPLACEMENT)
        indices = np.random.randint(len(replay_buffer), size=batch_size)
        batch = [replay_buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([trajectory[i] for trajectory in batch]) for i in range(5)]
        #WE HAVE TO COMPUTE THE TARGET VALUES TO WHICH WE TRY TO APPROACH, USING THE TARGET NETWORK
        #THE NETWORK EXPECTS (batch, 1) INPUTS, SO THE FEATURE AXIS IS ADDED EXPLICITLY
        next_Q_values = target.predict(next_states[:,np.newaxis], verbose=0)
        target_Q_values = rewards + (1-dones)*gamma*np.max(next_Q_values, axis=1)
        #RESHAPE TO (batch, 1) SO IT LINES UP WITH Q_values BELOW; A (batch,) TARGET WOULD
        #BROADCAST AGAINST (batch, 1) INTO A (batch, batch) ERROR MATRIX AND MIX UP THE SAMPLES
        target_Q_values = target_Q_values.reshape(-1, 1)
        #TIME TO COMPUTE THE GRADIENTS
        #ONE-HOT MASK THAT KEEPS ONLY THE Q-VALUE OF THE ACTION ACTUALLY TAKEN
        mask = tf.one_hot(actions, n_actions)
        with tf.GradientTape() as tape:
            #WE COMPUTE THE ACTUAL Q_VALUES
            Q_values = model(states[:,np.newaxis])
            Q_values = tf.reduce_sum(Q_values * mask, axis=1, keepdims=True)
            #WE COMPUTE THE LOSS BETWEEN OUR ACTUAL Q_VALUES AND THE TARGET VALUES
            loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
        #WE COMPUTE THE GRADIENTS OF THE LOSS WITH RESPECT TO THE MODEL'S VARIABLES
        grads = tape.gradient(loss, model.trainable_variables)
        #WE APPLY THE GRADIENTS TO THE MODEL'S VARIABLES USING THE OPTIMIZER
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    #EVERY 64 EPISODES THE TARGET NETWORK IS SYNCED WITH THE ONLINE MODEL
    if episode % 64 == 0:
        target.set_weights(model.get_weights())

In [None]:
#PLOT, PER FINISHED EPISODE, WHETHER THE GOAL WAS REACHED (1) OR NOT (0).
#THE X AXIS USES THE NUMBER OF RECORDED EPISODES INSTEAD OF n_episodes SO THE PLOT
#STILL WORKS WHEN THE TRAINING CELL WAS STOPPED BEFORE COMPLETING ALL EPISODES.
plt.plot(np.arange(len(goal_reached)), goal_reached)

In [None]:
#PLAY ONE EPISODE GREEDILY WITH THE TRAINED model, RENDERING EVERY STEP
env.reset()
print("--------- Initial State ---------")
env.render()
done = False
#THE START CELL IS ASSUMED TO BE 0
state = 0
while not done:
    print("--------- State "+str(state)+" ---------")
    #SAME (s-7.5)/4.61 RESCALING AS IN TRAINING, PASSED AS AN EXPLICIT (1, 1) BATCH;
    #verbose=0 KEEPS KERAS FROM PRINTING A PROGRESS BAR BETWEEN THE RENDERED FRAMES
    action = np.argmax(model.predict(np.array([[(state-7.5)/4.61]]), verbose=0))
    next_state, reward, done, info = env.step(action)
    env.render()
    state = next_state

In [None]:
#THE NETWORK RECEIVES ONE SCALAR PER OBSERVATION
obs_shape = 1
#ONE OUTPUT (Q-VALUE) PER ACTION OF THE ENVIRONMENT
n_actions = env.action_space.n
#MINIMAL Q-NETWORK: A SINGLE 32-UNIT RELU LAYER FOLLOWED BY A LINEAR OUTPUT LAYER
model2 = keras.models.Sequential()
model2.add(keras.layers.Dense(32, activation='relu', input_shape=[obs_shape]))
model2.add(keras.layers.Dense(n_actions))

In [None]:
#BUILD THE TARGET NETWORK AS A CLONE OF model2'S ARCHITECTURE
target2 = keras.models.clone_model(model2)
#COPY model2'S CURRENT WEIGHTS INTO THE CLONE SO BOTH NETWORKS START OUT IDENTICAL
target2.set_weights(model2.get_weights())

In [None]:
def epsilon_greedy2(state, epsilon):
    """Choose an action epsilon-greedily using the notebook-global `model2`.

    Args:
        state: scalar observation, already rescaled by the caller.
        epsilon: probability of returning a uniformly random action.

    Returns:
        An integer action index in ``[0, n_actions)``.
    """
    if np.random.binomial(1, epsilon):
        #EXPLORE: UNIFORMLY RANDOM ACTION
        return np.random.randint(n_actions)
    #EXPLOIT: FEED AN EXPLICIT (1, 1) BATCH AND CALL THE MODEL DIRECTLY.
    #A DIRECT CALL AVOIDS THE PER-CALL OVERHEAD OF model2.predict FOR A SINGLE SAMPLE
    #AND DOES NOT RELY ON KERAS IMPLICITLY RESHAPING A BARE PYTHON LIST.
    q_values = model2(np.array([[state]], dtype=np.float32))
    return np.argmax(q_values.numpy())

In [None]:
n_episodes = 1000000
replay_buffer = []
batch_size = 32
gamma = 0.999
optimizer = keras.optimizers.Adam(learning_rate=0.001)
loss_fn = keras.losses.mean_squared_error

#ONE ENTRY PER FINISHED EPISODE: 1 IF IT ENDED ON CELL 15 (THE GOAL), 0 OTHERWISE
goal_reached = []

for episode in tqdm(range(n_episodes)):
    env.reset()
    done = False
    #THE START CELL IS ASSUMED TO BE 0; CELL INDICES ARE RESCALED AS (s-7.5)/4.61 BEFORE REACHING THE NETWORK
    state = (0-7.5)/4.61
    #I CHOOSE A LINEAR DECAY OF EPSILON, STARTING AT 1.0 AND APPROACHING 0.1 AT THE END OF TRAINING.
    #IT ONLY DEPENDS ON THE EPISODE, SO IT IS COMPUTED ONCE PER EPISODE.
    epsilon = (-0.9/n_episodes)*episode +1
    while not done:
        #PLAY THE GAME USING EPSILON-GREEDY WITH CURRENT POLICY UNTIL IT'S DONE
        action = epsilon_greedy2(state, epsilon)
        next_state, reward, done, info = env.step(action)
        #WE RECORD IF THE EPISODE ENDED ON THE GOAL CASE OR NOT
        if done:
            if next_state==15:
                goal_reached.append(1)
            else:
                goal_reached.append(0)
        #WE STORE THAT DATA INTO A BUFFER TO REPLAY FROM LATER AND TRAIN OUR MODEL
        #IN THE BUFFER WE STORE THE TRAJECTORIES
        replay_buffer.append((state, action, reward, (next_state-7.5)/4.61, done))
        #THE NEXT_STATE BECOMES THE STATE FROM WHICH WE MOVE
        state = (next_state-7.5)/4.61

    #AFTER PLAYING FOR A WHILE WE HAVE ENOUGH TRAJECTORIES WE CAN START TRAINING
    if episode>512:
        #WE SAMPLE batch_size TRAJECTORIES (WITH REPLACEMENT)
        indices = np.random.randint(len(replay_buffer), size=batch_size)
        batch = [replay_buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([trajectory[i] for trajectory in batch]) for i in range(5)]
        #WE HAVE TO COMPUTE THE TARGET VALUES TO WHICH WE TRY TO APPROACH, USING THE TARGET NETWORK
        #THE NETWORK EXPECTS (batch, 1) INPUTS, SO THE FEATURE AXIS IS ADDED EXPLICITLY
        next_Q_values = target2.predict(next_states[:,np.newaxis], verbose=0)
        target_Q_values = rewards + (1-dones)*gamma*np.max(next_Q_values, axis=1)
        #RESHAPE TO (batch, 1) SO IT LINES UP WITH Q_values BELOW; A (batch,) TARGET WOULD
        #BROADCAST AGAINST (batch, 1) INTO A (batch, batch) ERROR MATRIX AND MIX UP THE SAMPLES
        target_Q_values = target_Q_values.reshape(-1, 1)
        #TIME TO COMPUTE THE GRADIENTS
        #ONE-HOT MASK THAT KEEPS ONLY THE Q-VALUE OF THE ACTION ACTUALLY TAKEN
        mask = tf.one_hot(actions, n_actions)
        with tf.GradientTape() as tape:
            #WE COMPUTE THE ACTUAL Q_VALUES
            Q_values = model2(states[:,np.newaxis])
            Q_values = tf.reduce_sum(Q_values * mask, axis=1, keepdims=True)
            #WE COMPUTE THE LOSS BETWEEN OUR ACTUAL Q_VALUES AND THE TARGET VALUES
            loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
        #WE COMPUTE THE GRADIENTS OF THE LOSS WITH RESPECT TO THE MODEL'S VARIABLES
        grads = tape.gradient(loss, model2.trainable_variables)
        #WE APPLY THE GRADIENTS TO THE MODEL'S VARIABLES USING THE OPTIMIZER
        optimizer.apply_gradients(zip(grads, model2.trainable_variables))
    #ONCE TRAINING HAS STARTED, THE TARGET NETWORK IS SYNCED WITH model2 EVERY 256 EPISODES
    if (episode>512) and (episode%256) == 0:
        target2.set_weights(model2.get_weights())

In [None]:
#PLOT, PER FINISHED EPISODE, WHETHER THE GOAL WAS REACHED (1) OR NOT (0).
#THE X AXIS USES THE NUMBER OF RECORDED EPISODES INSTEAD OF n_episodes SO THE PLOT
#STILL WORKS WHEN THE TRAINING CELL WAS STOPPED BEFORE COMPLETING ALL EPISODES.
plt.plot(np.arange(len(goal_reached)), goal_reached)

In [None]:
#PLAY ONE EPISODE GREEDILY WITH THE TRAINED model2, RENDERING EVERY STEP
env.reset()
print("--------- Initial State ---------")
env.render()
done = False
#THE START CELL IS ASSUMED TO BE 0
state = 0
while not done:
    print("--------- State "+str(state)+" ---------")
    #SAME (s-7.5)/4.61 RESCALING AS IN TRAINING, PASSED AS AN EXPLICIT (1, 1) BATCH;
    #verbose=0 KEEPS KERAS FROM PRINTING A PROGRESS BAR BETWEEN THE RENDERED FRAMES
    action = np.argmax(model2.predict(np.array([[(state-7.5)/4.61]]), verbose=0))
    next_state, reward, done, info = env.step(action)
    env.render()
    state = next_state

In [None]:
#ZOOM ON THE LAST 1001 RECORDED EPISODES (X AXIS = 1-BASED EPISODE NUMBER).
#THE WINDOW IS DERIVED FROM len(goal_reached) INSTEAD OF THE HARD-CODED 49000..50000 RANGE,
#WHICH ONLY MATCHED A RUN OF EXACTLY 50000 EPISODES AND RAISED A SHAPE MISMATCH OTHERWISE.
tail_len = 1001
tail_start = max(len(goal_reached) - tail_len, 0)
plt.plot(np.arange(tail_start + 1, len(goal_reached) + 1), goal_reached[tail_start:])