# Chapter 3
# Solving Problems with Dynamic Programming


In [10]:
import gym
import numpy as np

### Policy iteration applied to FrozenLake

In [4]:
env = gym.make('FrozenLake-v0')
env = env.unwrapped

In [7]:
print(env.action_space)
print(env.observation_space)

Discrete(4)
Discrete(16)


In [11]:
nA = env.action_space.n # action space dimension
nS = env.observation_space.n # state space dimension
nS

16

In [82]:
V = np.zeros(nS) # value function
policy = np.zeros(nS)

First we must define a function to evaluate the policy and a function to improve the policy 

In [83]:
def eval_state_pi(V,s,a,gamma=0.99): #function that evaluate the state s under policy pi
    
    return sum(prob*(rew+gamma*V[next_s]) for prob, next_s, rew, _ in env.P[s][a])

def policy_evaluation(V, policy, eps=0.0001): #evaluates all the states under the policy pi, until surpasses the threshold eps
    
    while True:
        
        delta = 0
        for state in range(nS):
            
            old_Vs = V[state]
            V[state] = eval_state_pi(V, state, policy[state])
            delta = max(delta, np.abs(old_Vs-V[state]))
            
        if delta < eps:
            break
        

In [90]:
def policy_improvement(V, policy): # for each state, selects the action with the highest payoff
    
    policy_stable = True
    
    for state in range(nS):
    
        old_a = policy[state]
        policy[state] = np.argmax([eval_state_pi(V, state, action) for action in range(nA)])
        
        if old_a != policy[state]:
            
            policy_stable = False
        
    return policy_stable

In [91]:
# Apply the evaluation and improvement of the agent's policy
policy_stable = False
it = 0
while not policy_stable:
    
    policy_evaluation(V,policy)
    policy_stable = policy_improvement(V,policy)
    it+=1

In [106]:
policy.reshape(4,4)

array([[0., 3., 3., 3.],
       [0., 0., 0., 0.],
       [3., 1., 0., 0.],
       [0., 2., 1., 0.]])

In [109]:
print(np.round(V.reshape(4,4),2))

[[0.54 0.5  0.47 0.46]
 [0.56 0.   0.36 0.  ]
 [0.59 0.64 0.61 0.  ]
 [0.   0.74 0.86 0.  ]]


In [102]:
#After finding the optimal policy through planning, the agent can finally interact with the enviroment

def run_episodes(env, V, policy, num_games=100):
    
    total_rew = 0
    state = env.reset() # returns the initial state. In this game the initial state is always zero
    
    for _ in range(num_games):
        
        done = False
        
        while not done:
            
            next_state, reward, done, _ = env.step(policy[state])
            state = next_state
            total_rew += reward
            if done:
                state =env.reset()
        
    print('Won %i of %i games!'%(total_rew,num_games))

In [103]:
run_episodes(env,V,policy)

Won 83 of 100 games!


### Value iteration applied to FrozenLake

In [137]:
# A different way to find the best policy. In this case we evaluate the expected value for each state,
# After that we select the action with the highest payoff
def value_iteration(eps=0.001):
    
    V=np.zeros(nS)
    it=0
    while True:
        
        delta=0
        for state in range(nS):
            
            old_V = V[state]
            V[state] = np.max([eval_state_pi(V,state,action) for action in range(nA)])
            delta = max(delta, np.abs(old_V - V[state]))
            
        if delta < eps:
            
            print('Iter', it, ' delta:', np.round(delta,5))
            return V
            
        it += 1
            
    

In [138]:
def run_episodes(env, V, num_games=100):
    
    total_rew = 0
    state = env.reset()
    
    for _ in range(num_games):
        
        done = False
        
        while not done:
            
            action = np.argmax([eval_state_pi(V,state,a) for a in range(nA)])
            next_state, reward, done, _ = env.step(action)
            state = next_state
            total_rew += reward
            
            if done:
                state = env.reset()
                
    print('Won %i of %i games!'%(total_rew, num_games))

In [141]:
V = value_iteration()
run_episodes(env,V)

Iter 83  delta: 0.00096
Won 80 of 100 games!


In [143]:
print(np.round(V.reshape(4,4),2))


[[0.53 0.48 0.45 0.44]
 [0.55 0.   0.35 0.  ]
 [0.58 0.64 0.61 0.  ]
 [0.   0.74 0.86 0.  ]]
