In [164]:
# Housekeeping cell: mark every .py file in the working directory as executable.
# The commented-out line below would create an empty 4-play.py if re-enabled.
#!touch 4-play.py
!chmod +x *.py

In [69]:
#!/usr/bin/env python3
""" Load the Environment """
import gym

def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """
    Builds the pre-made FrozenLake environment from OpenAI's gym.

    @desc: either None or a list of lists holding a custom description
           of the map to load for the environment
    @map_name: either None or a string naming the pre-made map to load
    *** When desc and map_name are both None, the environment loads a
        randomly generated 8x8 map
    @is_slippery: boolean telling whether the ice is slippery
    Returns:
            the environment
    """
    # Gather the map options first, then forward them as keyword arguments.
    options = {
        "desc": desc,
        "map_name": map_name,
        "is_slippery": is_slippery,
    }
    return gym.make("FrozenLake-v0", **options)
    

In [73]:
import numpy as np

# Seed NumPy's global RNG before any environment is created.
np.random.seed(0)
# No desc and no map_name: a randomly generated 8x8 map is loaded.
env = load_frozen_lake()
print(env.desc)
# Transition entries for state 0 / action 0; gym documents each entry as
# (probability, next_state, reward, done).
print(env.P[0][0])
# Same call with slippery ice: the single transition above becomes three
# equally likely ones (see the printed output).
env = load_frozen_lake(is_slippery=True)
print(env.desc)
print(env.P[0][0])
# Custom 3x3 map given as a list of lists of tile letters.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
print(env.desc)
# Pre-made map selected by name.
env = load_frozen_lake(map_name='4x4')
print(env.desc)
# Reset before rendering so the board is drawn from its initial state.
env.reset()
env.render()

[[b'S' b'F' b'F' b'F' b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'F' b'F' b'H' b'F' b'F']
 [b'F' b'H' b'F' b'H' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'H' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'H' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'G']]
[(1.0, 0, 0.0, False)]
[[b'S' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'H' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'H']
 [b'F' b'F' b'F' b'F' b'F' b'H' b'F' b'H']
 [b'F' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'G']]
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 8, 0.0, True)]
[[b'S' b'F' b'F']
 [b'F' b'H' b'H']
 [b'F' b'F' b'G']]
[[b'S' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'H']
 [b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'G']]

[41mS[0mFFF
FHFH
FFFH
HFFG

In [56]:
import gym
# Random-policy smoke test: render CartPole for 1000 steps.
# NOTE: this rebinds the notebook-level name `env`.
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    # take a random action
    _obs, _reward, done, _info = env.step(env.action_space.sample())
    # Stepping an episode that has already ended is undefined in gym (it
    # emits a warning), so start a fresh episode as soon as one finishes.
    if done:
        env.reset()




In [57]:
# Close whichever environment is currently bound to `env` once the demo is done.
env.close()

In [55]:
# Exploration aid: print the documentation of the object gym.make returns
# (a fresh environment is created just for this call).
help(gym.make("FrozenLake-v0"))

Help on TimeLimit in module gym.wrappers.time_limit object:

class TimeLimit(gym.core.Wrapper)
 |  Wraps the environment to allow a modular transformation.
 |  
 |  This class is the base class for all wrappers. The subclass could override
 |  some methods to change the behavior of the original environment without touching the
 |  original code.
 |  
 |  .. note::
 |  
 |      Don't forget to call ``super().__init__(env)`` if the subclass overrides :meth:`__init__`.
 |  
 |  Method resolution order:
 |      TimeLimit
 |      gym.core.Wrapper
 |      gym.core.Env
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, env, max_episode_steps=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  reset(self, **kwargs)
 |      Resets the environment to an initial state and returns an initial
 |      observation.
 |      
 |      Note that this function should not reset the environment's random
 |      number generator(s); random variabl

In [71]:
#!/usr/bin/env python3
""" Initialize Q-table """
import numpy as np


def q_init(env):
    """
    Creates the initial Q-table for an environment.

    @env: is the FrozenLakeEnv instance
    Returns:
            a numpy.ndarray of zeros with one row per state of the
            observation space and one column per action
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    q_table = np.zeros(shape=(n_states, n_actions))
    return q_table


In [72]:

# Print the Q-table shape for each way of building the environment:
# random 8x8 map, slippery random map, custom 3x3 map, pre-made 4x4 map.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
for options in ({}, {'is_slippery': True}, {'desc': desc},
                {'map_name': '4x4'}):
    env = load_frozen_lake(**options)
    Q = q_init(env)
    print(Q.shape)

(64, 4)
(64, 4)
(9, 4)
(16, 4)


In [104]:
#!/usr/bin/env python3
""" Epsilon Greedy """
import numpy as np


def epsilon_greedy(Q, state, epsilon):
    """
    Picks the next action with the epsilon-greedy strategy.

    @Q: is a numpy.ndarray containing the q-table
    @state: is the current state
    @epsilon: is the epsilon to use for the calculation
    *** p is sampled with numpy.random.uniform to decide whether the
        algorithm explores or exploits
    *** When exploring, the next action is drawn with
        numpy.random.randint from all possible actions
    Returns:
            the next action index
    """
    # p is always drawn first, so the random stream is consumed in the
    # same order whichever branch is taken.
    explore = np.random.uniform() < epsilon
    if not explore:
        # Exploit: highest-valued action recorded for this state.
        return np.argmax(Q[state])

    # Explore: any action index, one per column of the Q-table.
    n_actions = Q.shape[1]
    return np.random.randint(n_actions)
     

In [105]:
import numpy as np

# Build a 3x3 environment only to size the Q-table, then hand-fill state 7
# so that action 2 is the greedy choice there.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
Q[7] = np.array([0.5, 0.7, 1, -1])
# Re-seed before each call so each printed action is reproducible on its own.
np.random.seed(0)
print(epsilon_greedy(Q, 7, 0.5))
np.random.seed(1)
print(epsilon_greedy(Q, 7, 0.5))

2
0


In [133]:
#!/usr/bin/env python3
""" Q-learning """
import numpy as np


def train(env, Q, episodes=5000, max_steps=100,
          alpha=0.1, gamma=0.99, epsilon=1,
          min_epsilon=0.1, epsilon_decay=0.05):
    """
    Performs Q-learning on the environment.

    @env: is the FrozenLakeEnv instance
    @Q: is a numpy.ndarray containing the Q-table (it is updated in place)
    @episodes: is the total number of episodes to train over
    @max_steps: is the maximum number of steps per episode
    @alpha: is the learning rate
    @gamma: is the discount rate
    @epsilon: is the initial threshold for epsilon greedy
    @min_epsilon: is the minimum value that epsilon should
                  decay to
    @epsilon_decay: is the decay rate for updating epsilon
                    between episodes
    *** When the agent falls in a hole, the reward is
        updated to be -1
    *** After episode i, epsilon becomes
        min_epsilon + (initial epsilon - min_epsilon)
                      * exp(-epsilon_decay * i)
    Returns:
            Q: is the updated Q-table
            total_rewards: is a list containing the rewards
                           per episode
    """
    # Anchor the decay schedule at the caller's initial epsilon; a fixed
    # ceiling of 1 would push epsilon above the value that was asked for.
    max_epsilon = epsilon
    total_rewards = []

    for episode in range(episodes):
        # Every episode starts from a freshly reset environment.
        state = env.reset()
        episode_reward = 0

        for _ in range(max_steps):
            action = epsilon_greedy(Q, state, epsilon)
            new_state, reward, done, _info = env.step(action)

            # An episode that ends without any reward is treated as a fall
            # into a hole and penalised.
            # NOTE(review): an episode cut short by an environment time
            # limit would also end with reward 0 - confirm this is intended.
            if done and reward == 0:
                reward = -1

            # Q-learning update: move the estimate toward the observed
            # reward plus the discounted best value of the next state.
            Q[state, action] = (Q[state, action] + alpha
                                * (reward + gamma * np.max(Q[new_state])
                                   - Q[state, action]))

            episode_reward += reward
            state = new_state

            if done:
                break

        # Reduce exploration exponentially with the episode index.
        epsilon = (min_epsilon + (max_epsilon - min_epsilon)
                   * np.exp(-epsilon_decay * episode))

        total_rewards.append(episode_reward)

    return Q, total_rewards


In [136]:
import numpy as np
#train = __import__('3-q_learning').train
# Seed NumPy's global RNG so the training run below is repeatable.
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

# Train with the default hyper-parameters (5000 episodes).
Q, total_rewards  = train(env, Q)
print(Q)
# Report the mean reward of ten equal slices of the run; the 500 label step
# matches the default 5000 episodes (5000 / 10).
split_rewards = np.split(np.array(total_rewards), 10)
for i, rewards in enumerate(split_rewards):
    print((i+1) * 500, ':', np.mean(rewards))

[[ 0.96059593  0.970299    0.95098488  0.96059396]
 [ 0.96059557 -0.77123208  0.0094072   0.37627228]
 [ 0.18061285 -0.1         0.          0.        ]
 [ 0.97029877  0.9801     -0.99999988  0.96059583]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.98009763  0.98009933  0.99        0.9702983 ]
 [ 0.98009922  0.98999782  1.         -0.99999952]
 [ 0.          0.          0.          0.        ]]
500 : 0.812
1000 : 0.88
1500 : 0.9
2000 : 0.9
2500 : 0.88
3000 : 0.844
3500 : 0.892
4000 : 0.896
4500 : 0.852
5000 : 0.928


In [159]:
#!/usr/bin/env python3
""" Play """
import numpy as np

epsilon_greedy = __import__('2-epsilon_greedy').epsilon_greedy


def play(env, Q, max_steps=100):
    """
    Has the trained agent play one episode.

    @env: is the FrozenLakeEnv instance
    @Q: is a numpy.ndarray containing the Q-table
    @max_steps: is the maximum number of steps in
                the episode
    *** Each state of the board is displayed via the console
    *** The Q-table is always exploited (no exploration)
    Returns:
            the total rewards for the episode; this is also returned
            when max_steps is reached before the episode ends
    """
    # Start from the state the environment reports rather than assuming 0.
    state = env.reset()
    env.render()

    total_rewards = 0
    for _ in range(max_steps):
        # Pure exploitation: best known action for the current state.
        action = np.argmax(Q[state])
        state, reward, done, _info = env.step(action)
        env.render()

        total_rewards += reward
        if done:
            break

    return total_rewards


In [161]:
import numpy as np

# Seed NumPy's global RNG so training, and therefore the played episode,
# is repeatable.
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

# Train first, then let the agent play one episode with the learned table;
# play renders each board state and the episode reward is printed last.
Q, total_rewards  = train(env, Q)
print(play(env, Q))


[41mS[0mFF
FHH
FFG
  (Down)
SFF
[41mF[0mHH
FFG
  (Down)
SFF
FHH
[41mF[0mFG
  (Right)
SFF
FHH
F[41mF[0mG
  (Right)
SFF
FHH
FF[41mG[0m
1.0
