<h2> Importing libraries

In [1]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

<h2> Creating environmrnt

In [2]:

env_name = "FrozenLake-v0"

env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
type(env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

<h2> building an random  agent

In [3]:
class Agent():
    def __init__(self, env):
        self.is_discrete = \
            type(env.action_space) == gym.spaces.discrete.Discrete
        
        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)
        
    def get_action(self, state):
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                       self.action_high,
                                       self.action_shape)
        return action

<h2> Q learning agent

In [4]:
class QAgent(Agent):
    def __init__(self, env, discount_rate=0.97, learning_rate=0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)
        
        self.eps = 0.2
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self):
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        
    def get_action(self, state):
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        q_next = np.zeros([self.action_size]) if done else q_next
        q_target = reward + self.discount_rate * np.max(q_next)
        
        q_update = q_target - self.q_table[state,action]
        self.q_table[state,action] += self.learning_rate * q_update
        
        if done:
            self.eps = self.eps * 0.99
        
agent = QAgent(env)

Action size: 4
State size: 16


<h2> Training

In [5]:
total_reward = 0
for ep in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        n_reward=reward
        if done and reward==0:
            reward=-1
        agent.train((state,action,next_state,reward,done))
        state = next_state
        total_reward += n_reward
        
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        print(agent.q_table)
        #time.sleep(0.05)
        clear_output(wait=True)

s: 15 a: 1
Episode: 999, Total reward: 533.0, eps: 8.634249482131589e-06
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 1.29716588e-01 -3.57582058e-02 -3.27020008e-02 -3.74089421e-02]
 [-9.66040938e-02 -6.47465967e-02 -7.26048582e-02 -1.49691054e-02]
 [-6.84165507e-02 -6.72322192e-02 -6.86355749e-02 -3.67496345e-02]
 [-5.66359363e-02 -6.68483249e-02 -9.97139810e-02 -4.96240109e-02]
 [ 1.35789297e-01 -3.73858333e-02 -6.41863683e-02 -4.70066100e-02]
 [ 2.53279937e-05  6.27480482e-06  2.12196987e-05  2.35470772e-05]
 [-3.12995402e-01 -3.13018278e-01 -2.90983460e-01 -3.14670122e-01]
 [ 3.30651129e-05  4.34092068e-06  3.05187935e-05  4.32137566e-05]
 [-6.40333141e-02 -4.57590889e-02 -4.69956840e-02  1.88339518e-01]
 [-1.95283274e-02  2.71793121e-01 -8.84636180e-03 -2.67968176e-02]
 [ 2.10014148e-01 -6.77483525e-03 -1.98505938e-02 -2.96517654e-02]
 [ 2.55805545e-05  4.02374971e-06  4.17144314e-06  5.20092541e-05]
 [ 4.27320411e-05  9.79097966e-05  7.84671414e-05  1.10562599e-05]
 [-7.90259056e-03 

<h2> Testing

In [6]:
total_reward = 0
for ep in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        n_reward=reward
        if done and reward==0:
            reward=-1
        
        state = next_state
        total_reward += n_reward
        
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        print(agent.q_table)
        #time.sleep(0.05)
        clear_output(wait=True)

s: 15 a: 1
Episode: 99, Total reward: 76.0, eps: 8.634249482131589e-06
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 1.29716588e-01 -3.57582058e-02 -3.27020008e-02 -3.74089421e-02]
 [-9.66040938e-02 -6.47465967e-02 -7.26048582e-02 -1.49691054e-02]
 [-6.84165507e-02 -6.72322192e-02 -6.86355749e-02 -3.67496345e-02]
 [-5.66359363e-02 -6.68483249e-02 -9.97139810e-02 -4.96240109e-02]
 [ 1.35789297e-01 -3.73858333e-02 -6.41863683e-02 -4.70066100e-02]
 [ 2.53279937e-05  6.27480482e-06  2.12196987e-05  2.35470772e-05]
 [-3.12995402e-01 -3.13018278e-01 -2.90983460e-01 -3.14670122e-01]
 [ 3.30651129e-05  4.34092068e-06  3.05187935e-05  4.32137566e-05]
 [-6.40333141e-02 -4.57590889e-02 -4.69956840e-02  1.88339518e-01]
 [-1.95283274e-02  2.71793121e-01 -8.84636180e-03 -2.67968176e-02]
 [ 2.10014148e-01 -6.77483525e-03 -1.98505938e-02 -2.96517654e-02]
 [ 2.55805545e-05  4.02374971e-06  4.17144314e-06  5.20092541e-05]
 [ 4.27320411e-05  9.79097966e-05  7.84671414e-05  1.10562599e-05]
 [-7.90259056e-03 -1

<b> Agent was able to reach goals 76 times out fo 100 episodes

<h2> Expected Sarsa Agent

In [7]:
class ExpectedSarsaAgent(Agent):
    def __init__(self, env, discount_rate=0.97, learning_rate=0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)
        
        self.eps = 0.2
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self):
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        
    def get_action(self, state):
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        q_next = np.zeros([self.action_size]) if done else q_next
        v=(1-self.eps)*(np.max(q_next))
        for i in q_next:
            v+=(self.eps/self.action_size)*i
        
        q_target = reward + self.discount_rate * np.max(q_next)
        
        q_update = q_target - self.q_table[state,action]
        self.q_table[state,action] += self.learning_rate * q_update
        
        if done:
            self.eps = self.eps * 0.99
        
agent = ExpectedSarsaAgent(env)

Action size: 4
State size: 16


<h2> Training 

In [8]:
total_reward = 0
for ep in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        n_reward=reward
        if done and reward==0:
            reward=-1
        agent.train((state,action,next_state,reward,done))
        state = next_state
        total_reward += n_reward
        
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        print(agent.q_table)
       
        clear_output(wait=True)

s: 15 a: 1
Episode: 999, Total reward: 587.0, eps: 8.634249482131589e-06
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 1.32695927e-01 -2.03150045e-02 -1.55859060e-02 -1.63974537e-02]
 [-3.72871850e-02 -5.40350936e-02 -6.24397280e-02  2.71054489e-02]
 [-5.21441560e-02 -5.29885455e-02 -5.11358993e-02 -1.79280830e-02]
 [-6.60447547e-02 -5.29926140e-02 -7.26149360e-02 -3.00274775e-02]
 [ 1.62462718e-01 -4.48589111e-02 -6.90975697e-02 -7.17801844e-02]
 [ 9.65877386e-05  3.57359102e-05  2.45836377e-05  7.97399903e-05]
 [-2.23013517e-01 -2.88474533e-01 -2.87163024e-01 -2.90268818e-01]
 [ 6.22180922e-05  1.72274200e-05  6.78906425e-05  7.09931549e-05]
 [-4.63658316e-02 -6.04843379e-02 -3.62197664e-02  2.07282148e-01]
 [-1.92937094e-02  2.80491802e-01 -2.81912300e-02 -8.10686860e-03]
 [ 2.19717206e-01 -9.35047581e-03 -5.27047120e-03 -1.26025570e-02]
 [ 6.64761844e-05  4.06409447e-05  6.53574388e-05  6.18604135e-05]
 [ 3.88934323e-05  3.76186428e-05  9.61839988e-05  5.59937719e-05]
 [-2.89947109e-02 

<h2> Testing

In [9]:
total_reward = 0
for ep in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        n_reward=reward
        if done and reward==0:
            reward=-1
        
        state = next_state
        total_reward += n_reward
        
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        print(agent.q_table)
       
        clear_output(wait=True)

s: 15 a: 1
Episode: 99, Total reward: 87.0, eps: 8.634249482131589e-06
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 1.32695927e-01 -2.03150045e-02 -1.55859060e-02 -1.63974537e-02]
 [-3.72871850e-02 -5.40350936e-02 -6.24397280e-02  2.71054489e-02]
 [-5.21441560e-02 -5.29885455e-02 -5.11358993e-02 -1.79280830e-02]
 [-6.60447547e-02 -5.29926140e-02 -7.26149360e-02 -3.00274775e-02]
 [ 1.62462718e-01 -4.48589111e-02 -6.90975697e-02 -7.17801844e-02]
 [ 9.65877386e-05  3.57359102e-05  2.45836377e-05  7.97399903e-05]
 [-2.23013517e-01 -2.88474533e-01 -2.87163024e-01 -2.90268818e-01]
 [ 6.22180922e-05  1.72274200e-05  6.78906425e-05  7.09931549e-05]
 [-4.63658316e-02 -6.04843379e-02 -3.62197664e-02  2.07282148e-01]
 [-1.92937094e-02  2.80491802e-01 -2.81912300e-02 -8.10686860e-03]
 [ 2.19717206e-01 -9.35047581e-03 -5.27047120e-03 -1.26025570e-02]
 [ 6.64761844e-05  4.06409447e-05  6.53574388e-05  6.18604135e-05]
 [ 3.88934323e-05  3.76186428e-05  9.61839988e-05  5.59937719e-05]
 [-2.89947109e-02 -1

<b> Agent was able to reach goals 87 times out fo 100 episodes