# Capítulo 04

In [12]:
import gym
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Gym environment id passed to every gym.make() call in this notebook.
ENV_NAME = "FrozenLake-v0"
#ENV_NAME = "FrozenLake8x8-v0"  
# Discount factor multiplied with the value of the successor state.
GAMMA = 0.95

### The Agent

In [18]:

class Agent:
    """Value-iteration agent that plans from the environment's own model.

    Transition data is read from ``self.env.P``: ``P[state][action]`` is a
    list of 4-tuples whose first three items are the transition probability,
    the successor state and the reward (the fourth item is ignored here).

    Attributes:
        env: Environment created with ``gym.make(ENV_NAME)``.
        V: 1-D float array holding one value estimate per discrete state.
    """

    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.V = np.zeros(self.env.observation_space.n)

    def calc_action_value(self, state, action):
        """Return the expected discounted return of ``action`` in ``state``.

        Args:
            state: Index of the state the action is taken from.
            action: Index of the action to evaluate.

        Returns:
            Sum over all listed transitions of
            ``prob * (reward + GAMMA * V[next_state])``.
        """
        return sum(prob * (reward + GAMMA * self.V[next_state])
                   for prob, next_state, reward, _
                   in self.env.P[state][action])

    def select_action(self, state):
        """Return the greedy action for ``state`` under the current values.

        Ties are resolved in favour of the lowest action index.

        Args:
            state: Index of the state to act in.

        Returns:
            Index of the action with the highest action value, or ``None``
            when the action space is empty.
        """
        best_action = best_value = None
        for action in range(self.env.action_space.n):
            action_value = self.calc_action_value(state, action)
            # Compare with None explicitly: a genuine best value of 0.0 is
            # falsy and must not be mistaken for "no candidate seen yet",
            # otherwise a later, worse action could replace it.
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def value_iteration(self):
        """Run one sweep of value iteration over every state.

        ``self.V`` is updated in place while sweeping, so states visited
        later in the sweep already see the refreshed values of earlier ones.

        Returns:
            The updated ``self.V`` array (the same object, not a copy).
        """
        n_actions = self.env.action_space.n
        for state in range(self.env.observation_space.n):
            self.V[state] = max(self.calc_action_value(state, action)
                                for action in range(n_actions))
        return self.V

    def value_iteration_v2(self):
        """Alias of :meth:`value_iteration`, kept for existing callers.

        Returns:
            The updated ``self.V`` array.
        """
        return self.value_iteration()


### Training loop

In [19]:
# Number of evaluation episodes averaged by check_improvements().
TEST_EPISODES = 40


def check_improvements(agent=None, episodes=None):
    """Return the average episode reward of the agent's greedy policy.

    A fresh environment is created for the evaluation, the agent picks every
    action with ``agent.select_action`` and the environment is closed again
    before returning.

    Args:
        agent: Object exposing ``select_action(state)``. When omitted, the
            notebook-level variable ``agent`` is used, which keeps existing
            zero-argument calls working.
        episodes: Number of episodes to average over. Defaults to the
            current value of ``TEST_EPISODES``.

    Returns:
        Mean total reward per episode as a float.

    Raises:
        NameError: If no agent is passed and no global ``agent`` exists.
        ValueError: If ``episodes`` is not positive.
    """
    if agent is None:
        # Legacy call style: fall back to the notebook-global agent.
        try:
            agent = globals()["agent"]
        except KeyError:
            raise NameError("name 'agent' is not defined") from None
    if episodes is None:
        episodes = TEST_EPISODES
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")

    test_env = gym.make(ENV_NAME)
    try:
        reward_test = 0.0
        for _ in range(episodes):
            total_reward = 0.0
            state = test_env.reset()
            is_done = False
            while not is_done:
                action = agent.select_action(state)
                state, new_reward, is_done, _ = test_env.step(action)
                total_reward += new_reward
            reward_test += total_reward
    finally:
        # Release the evaluation environment even if an episode fails.
        test_env.close()
    return reward_test / episodes

In [20]:
# Average test reward at which train() stops iterating.
REWARD_THRESHOLD = 0.9


def train(agent):
    """Run value-iteration sweeps until the test reward reaches the threshold.

    After every call to ``agent.value_iteration()`` the policy is scored with
    ``check_improvements()``, the score is logged to TensorBoard under the
    tag ``"reward"``, and a message is printed whenever a new best score is
    reached. The loop ends once the best score is at least
    ``REWARD_THRESHOLD``.

    NOTE: ``check_improvements()`` is called without arguments, so it decides
    on its own which agent to score. Keep the notebook-level ``agent``
    variable bound to the same object that is passed to this function.

    Args:
        agent: Object exposing ``value_iteration()``.
    """
    writer = SummaryWriter()
    t = 0
    best_reward = 0.0
    try:
        while best_reward < REWARD_THRESHOLD:
            agent.value_iteration()
            t += 1
            reward_test = check_improvements()
            writer.add_scalar("reward", reward_test, t)

            if reward_test > best_reward:
                print(f"Best reward updated {reward_test:.2f} at iteration {t}")
                best_reward = reward_test
    finally:
        # Close the writer even when the (possibly long) loop is interrupted
        # or raises, so the writer is not left open.
        writer.close()

In [21]:
# Build the agent that plans from the environment's transition table and
# iterate until the average test reward reaches REWARD_THRESHOLD.
agent = Agent()
train(agent)

Best reward updated 0.35 at iteration 3
Best reward updated 0.38 at iteration 5
Best reward updated 0.42 at iteration 6
Best reward updated 0.47 at iteration 7
Best reward updated 0.50 at iteration 10
Best reward updated 0.75 at iteration 13
Best reward updated 0.80 at iteration 14
Best reward updated 0.85 at iteration 18
Best reward updated 0.93 at iteration 42


In [None]:
def extract_policy(agent):
    """Return the greedy policy implied by the agent's current values.

    State and action counts are read from ``agent.env``, so the policy always
    matches the environment the agent was built on and no extra environment
    has to be created.

    Args:
        agent: Object exposing ``env`` (with ``observation_space.n`` and
            ``action_space.n``) and ``calc_action_value(state, action)``.

    Returns:
        1-D float array with one entry per state holding the index of the
        action with the highest action value (first index on ties).
    """
    n_states = agent.env.observation_space.n
    n_actions = agent.env.action_space.n
    policy = np.zeros(n_states)
    for s in range(n_states):
        q_values = [agent.calc_action_value(s, a) for a in range(n_actions)]
        policy[s] = np.argmax(q_values)
    return policy

def print_policy(policy, n_cols=None):
    """Print a policy as a grid of action indices and as a grid of arrows.

    Args:
        policy: 1-D array of action indices (0, 1, 2 or 3), one per state.
        n_cols: Number of grid columns. When omitted, the side length of a
            square grid is used if the number of states is a perfect square
            (4 for the 16-state map, 8 for the 64-state map); otherwise 4.
    """
    policy = np.asarray(policy)
    if n_cols is None:
        side = int(round(np.sqrt(policy.size)))
        # Fall back to the historical width of 4 for non-square sizes.
        n_cols = side if side > 0 and side * side == policy.size else 4
    print(policy.reshape([-1, n_cols]))
    print("\n")
    # Arrow shown for each action index.
    visual_help = {0:'<', 1:'v', 2:'>', 3:'^'}
    policy_arrows = [visual_help[x] for x in policy]
    print(np.array(policy_arrows).reshape([-1, n_cols]))

In [None]:
# Derive the greedy policy from the trained agent and print it as a grid.
policy=extract_policy(agent)
print_policy(policy)

[[0. 3. 0. 3.]
 [0. 0. 0. 0.]
 [3. 1. 0. 0.]
 [0. 2. 1. 0.]]


[['<' '^' '<' '^']
 ['<' '<' '<' '<']
 ['^' 'v' '<' '<']
 ['<' '>' 'v' '<']]


In [None]:
# Inspect the environment's transition table entry for state 4, action 0.
env = gym.make(ENV_NAME)
env.env.P[4][0]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 4, 0.0, False),
 (0.3333333333333333, 8, 0.0, False)]

In [None]:
# Same lookup for state 8, action 3.
env.env.P[8][3]

[(0.3333333333333333, 9, 0.0, False),
 (0.3333333333333333, 4, 0.0, False),
 (0.3333333333333333, 8, 0.0, False)]

### Test the Agent

In [None]:
def test(agent):
    """Play one rendered episode with the agent's greedy policy.

    The environment is rendered after the reset and after every step. When
    the episode ends, the final state, the last reward and the number of
    steps taken are printed.

    Args:
        agent: Object exposing ``select_action(state)``.
    """
    new_test_env = gym.make(ENV_NAME)
    try:
        state = new_test_env.reset()
        new_test_env.render()
        is_done = False
        t = 0

        while not is_done:
            action = agent.select_action(state)
            state, reward, is_done, _ = new_test_env.step(action)
            new_test_env.render()
            t += 1
    finally:
        # Release the environment even if stepping or rendering fails.
        new_test_env.close()
    print("\nlast state =", state)
    print("reward =    ", reward)
    print("time steps =", t)

test(agent)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard


In [None]:
tensorboard  --logdir=runs

### The Agent that estimates the Transition Function

In [None]:
import collections

# Number of random environment steps sampled before each value-iteration sweep.
N = 100


class AgentUpdated:
    """Value-iteration agent that estimates the model from sampled experience.

    Instead of reading the environment's transition table, the agent plays
    random actions and records what it observes.

    Attributes:
        env: Environment created with ``gym.make(ENV_NAME)``.
        state: Current state of ``env`` used by the random exploration.
        rewards: Maps ``(state, action, new_state)`` to the last reward
            observed for that transition.
        transits: Maps ``(state, action)`` to a ``Counter`` of how often each
            successor state was observed.
        V: 1-D float array holding one value estimate per discrete state.
    """

    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.V = np.zeros(self.env.observation_space.n)

    def play_n_random_steps(self, count):
        """Take ``count`` random actions and record rewards and transitions.

        The environment is reset whenever an episode ends, so exploration
        continues seamlessly across episodes.

        Args:
            count: Number of environment steps to sample.
        """
        for _ in range(count):
            action = self.env.action_space.sample()
            new_state, reward, is_done, _ = self.env.step(action)
            self.rewards[(self.state, action, new_state)] = reward
            self.transits[(self.state, action)][new_state] += 1
            self.state = self.env.reset() if is_done else new_state

    def calc_action_value(self, state, action):
        """Return the estimated value of ``action`` in ``state``.

        Transition probabilities are the observed successor counts divided
        by the total number of visits of ``(state, action)``.

        Args:
            state: Index of the state the action is taken from.
            action: Index of the action to evaluate.

        Returns:
            Sum over observed successors of
            ``prob * (reward + GAMMA * V[successor])``; ``0.0`` when the
            pair has never been visited.
        """
        # A never-visited pair yields an empty Counter, so the loop below is
        # skipped and the division by ``total`` is never reached.
        target_counts = self.transits[(state, action)]
        total = sum(target_counts.values())
        action_value = 0.0
        for s_, count in target_counts.items():
            r = self.rewards[(state, action, s_)]
            prob = (count / total)
            action_value += prob*(r + GAMMA*self.V[s_])
        return action_value

    def select_action(self, state):
        """Return the greedy action for ``state`` under the current values.

        Ties are resolved in favour of the lowest action index.

        Args:
            state: Index of the state to act in.

        Returns:
            Index of the action with the highest estimated value, or
            ``None`` when the action space is empty.
        """
        best_action, best_value = None, None
        for action in range(self.env.action_space.n):
            action_value = self.calc_action_value(state, action)
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def value_iteration(self):
        """Sample ``N`` random steps, then run one sweep of value iteration.

        ``self.V`` is updated in place while sweeping.

        Returns:
            The updated ``self.V`` array (the same object, not a copy), which
            matches what ``Agent.value_iteration`` returns.
        """
        self.play_n_random_steps(N)
        n_actions = self.env.action_space.n
        for state in range(self.env.observation_space.n):
            self.V[state] = max(self.calc_action_value(state, action)
                                for action in range(n_actions))
        return self.V

In [None]:
# Train the experience-based agent with the same loop, then print its greedy policy.
agent = AgentUpdated()
train(agent)
policy=extract_policy(agent)
print_policy(policy)

Best reward updated 0.10 at iteration 26
Best reward updated 0.17 at iteration 29
Best reward updated 0.38 at iteration 32
Best reward updated 0.47 at iteration 38
Best reward updated 0.50 at iteration 41
Best reward updated 0.53 at iteration 46
Best reward updated 0.68 at iteration 53
Best reward updated 0.78 at iteration 55
Best reward updated 0.80 at iteration 108
Best reward updated 0.82 at iteration 239
Best reward updated 0.85 at iteration 265
Best reward updated 0.88 at iteration 745
Best reward updated 0.90 at iteration 920
[[0. 3. 0. 3.]
 [0. 0. 0. 0.]
 [3. 1. 0. 0.]
 [0. 2. 1. 0.]]


[['<' '^' '<' '^']
 ['<' '<' '<' '<']
 ['^' 'v' '<' '<']
 ['<' '>' 'v' '<']]


In [None]:
tensorboard  --logdir=runs