In [33]:
import numpy as np
import matplotlib
import scipy # FrozenLake needs it apparently
import gym

%matplotlib inline

In [36]:
# Initializing and testing the gym environment

env = gym.make('FrozenLake-v0')
env.reset()

print(env.action_space)
print(env.observation_space)

# Take a single random step to see what the environment hands back.
for _ in range(1):
    env.render()
    # NOTE: despite its name, `state` holds the whole tuple returned by
    # env.step(), not just the next state.
    state = env.step(env.action_space.sample())
    print(state)

# Inspect the transition model for action 0 in states 0 and 5. Only the last
# expression of a cell is displayed automatically, so the first lookup has to
# be printed explicitly or its value is silently discarded.
print(env.env.P[0][0])
env.env.P[5][0]

Discrete(4)
Discrete(16)

[41mS[0mFFF
FHFH
FFFH
HFFG
(1, 0.0, False, {'prob': 0.3333333333333333})


[(1.0, 5, 0, True)]

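env.step() returns (next_state, reward, done, info), where info is a dict holding the transition probability under the key 'prob'<br/>
env.env.P\[state\]\[action\] returns a list of (probability, next_state, reward, done) tuples, one for each possible next state <br>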

### Action Space
- LEFT = 0
- DOWN = 1
- RIGHT = 2
- UP = 3

# Policy Evaluation
Implementing policy evaluation in Numpy
\begin{equation}
v_{\pi}(s) = \sum_{a}\pi(a|s) \sum_{s\prime}p(s,a,s\prime)\bigg[r(s,a,s\prime) + \gamma v_{\pi}(s\prime) \bigg]
\end{equation}

## Algorithm
1. Input $\pi$, the policy to be evaluated
2. Initialize an array $v(s) = 0$ , for all $s\in S^{+}$
3. Repeat
4. &nbsp;&nbsp;    Initialize $\Delta \gets 0$ 
5. &nbsp;&nbsp;    For each $s \in S$ do:
6. &nbsp;&nbsp;&nbsp;&nbsp;$temp \gets v(s)$
7. &nbsp;&nbsp;&nbsp;&nbsp;$v(s) \gets \sum_{a}\pi(a|s) \sum_{s\prime}p(s,a,s\prime)\bigg[r(s,a,s\prime) + \gamma v(s\prime) \bigg]$
8. &nbsp;&nbsp;&nbsp;&nbsp;$\Delta \gets max(\Delta,|temp - v(s)|)$
9. until $\Delta < \theta$ (a small positive number)
10. Output $v \approx v_{\pi}$

In [58]:
def policy_evaluation(env,policy,gamma = 1.0,theta = 1e-5):
    '''
        Iterative policy evaluation: estimate the state values of `policy`.

        env    = env.env; must expose nS (number of states), nA (number of
                 actions) and P, where P[s][a] is an iterable of
                 (probability, next_state, reward, done) tuples
        policy = 2D numpy matrix of shape (number of states, number of actions),
                 policy[s, a] being the weight given to action a in state s
        gamma  = discount factor
        theta  = tolerance; sweeping stops once the largest change made to any
                 state value during a full sweep is below it

        Returns a 1D numpy array with one value per state.
    '''
    values = np.zeros(env.nS)
    while True:
        max_change = 0
        for state in range(env.nS):
            previous = values[state]
            expected_return = 0
            for action in range(env.nA):
                # Expected one-step backup for this action over all outcomes.
                backup = 0
                for prob, successor, reward, _ in env.P[state][action]:
                    backup += prob*(reward + gamma*values[successor])
                expected_return += policy[state,action]*backup
            # Updated in place, so later states in the same sweep already see
            # the new value.
            values[state] = expected_return
            max_change = max(max_change, np.abs(previous-values[state]))
        if max_change < theta:
            return values

In [59]:
# Deterministic policy: row s is the one-hot encoding of the action chosen in
# state s, obtained by picking rows of the 4x4 identity matrix (one action
# index per state, laid out four per line).
det_policy = np.eye(4, dtype=int)[[2, 2, 1, 0,
                                   1, 0, 3, 2,
                                   2, 1, 0, 0,
                                   1, 2, 3, 3]]

# Uniform random policy: all actions equally probable in all states
random_policy = np.ones((env.env.nS, env.env.nA)) / env.env.nA
print(policy_evaluation(env.env, random_policy, gamma=0.99, theta=1e-8))

[0.01235611 0.01042444 0.01933842 0.00947774 0.01478704 0.
 0.03889445 0.         0.03260247 0.08433764 0.13781085 0.
 0.         0.17034482 0.43357944 0.        ]


In [None]:
def policy_iteration(env):
    '''
        Work-in-progress policy iteration: for now it only draws a random
        stochastic policy and evaluates it with policy_evaluation.

        env = env.env (must expose nS and nA, plus whatever
              policy_evaluation reads from it)

        NOTE: the policy-improvement step is not implemented yet, so nothing
        is returned.
    '''
    # np.random.rand draws independent values in [0, 1), so each row must be
    # normalised to sum to 1 before it can serve as pi(a|s). Passing the raw
    # weights to policy_evaluation yields values that are not v_pi and can
    # make the evaluation sweep diverge.
    policy = np.random.rand(env.nS,env.nA)
    policy /= policy.sum(axis=1, keepdims=True)

    v = policy_evaluation(env,policy)
    # TODO: policy improvement (greedy update w.r.t. v) and the outer loop.