In [1]:
import numpy as np
import gym
from tqdm import tqdm
import matplotlib.pyplot as plt

In [18]:
# Build the FrozenLake-v0 environment that the later cells solve and evaluate.
env = gym.make('FrozenLake-v0')

In [19]:
# Reset the environment to begin an episode and keep the initial state it returns.
start = env.reset()

In [28]:
# Show the current grid. The recorded output of this cell is the text grid
# (not an image array), so use the default render mode, as the final cell does.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [45]:
# Inspect the transition model of state 1: a dict mapping each action to a list of
# (probability, next_state, reward, done) tuples, as unpacked by Policy_Iteration.
env.env.P[1]

{0: [(0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 5, 0.0, True)],
 1: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 5, 0.0, True),
  (0.3333333333333333, 2, 0.0, False)],
 2: [(0.3333333333333333, 5, 0.0, True),
  (0.3333333333333333, 2, 0.0, False),
  (0.3333333333333333, 1, 0.0, False)],
 3: [(0.3333333333333333, 2, 0.0, False),
  (0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)]}

In [116]:
def _tabular_model(env):
    """Return the object that exposes the tabular model attributes nS, nA and P."""
    model = getattr(env, "unwrapped", None)
    if model is None:
        # Objects without gym's `unwrapped` property: fall back to the
        # single-wrapper layout (`env.env`), or to `env` itself.
        model = getattr(env, "env", env)
    return model


def _terminal_states(P, n_states):
    """Return a boolean mask of states that some transition enters with done=True."""
    mask = np.zeros(n_states, dtype=bool)
    for s in range(n_states):
        for a in P[s]:
            for _, s_next, _, done in P[s][a]:
                if done:
                    mask[s_next] = True
    return mask


def _expected_return(P, s, a, values, gamma):
    """One-step lookahead: expected reward plus discounted value of taking `a` in `s`."""
    return sum(p * (r + gamma * values[s_next]) for p, s_next, r, _ in P[s][a])


def Policy_Iteration(env,gamma = 0.9, eps = 1.0e-8, maxIterations = 1000):
    """Solve a tabular MDP with policy iteration.

    Alternates iterative policy evaluation (synchronous sweeps until no state
    value changes by `eps` or more) with greedy policy improvement, stopping
    when the policy no longer changes or after `maxIterations` rounds.

    Args:
        env: Environment whose underlying model exposes `nS` (number of
            states), `nA` (number of actions) and `P`, where `P[s][a]` is a
            list of `(probability, next_state, reward, done)` tuples. The
            model is looked up through `env.unwrapped` when available,
            otherwise through `env.env`.
        gamma: Discount factor.
        eps: Per-state convergence threshold for policy evaluation.
        maxIterations: Maximum number of evaluation/improvement rounds.

    Returns:
        Tuple `(policy, Vs)`: an integer array with one action per state and
        a float array with the value of each state under that policy. States
        entered by a `done` transition are terminal: their value is 0 and
        their action is fixed to 0.

    Note:
        The evaluation loop has no sweep cap; it stops only on convergence.
        With `gamma < 1` the sweeps contract, but with `gamma == 1` a policy
        that keeps collecting reward forever would make it loop indefinitely.
    """
    model = _tabular_model(env)
    n_states, n_actions, P = model.nS, model.nA, model.P

    # Random initial policy.
    policy = np.random.choice(n_actions, size=n_states)

    # Terminal states are found once, over *all* actions. Discovering them only
    # along the current policy's actions can leave a terminal state evaluated as
    # an ordinary one, and re-running the argmax on them can keep the policy
    # from ever comparing equal to its predecessor.
    is_terminal = _terminal_states(P, n_states)
    policy[is_terminal] = 0
    Vs = np.zeros(n_states)

    for i in range(maxIterations):
        # Policy evaluation: synchronous sweeps under the current policy.
        while True:
            new_Vs = np.zeros(n_states)
            for s in range(n_states):
                if not is_terminal[s]:
                    new_Vs[s] = _expected_return(P, s, policy[s], Vs, gamma)
            converged = np.all(np.fabs(new_Vs - Vs) < eps)
            Vs = new_Vs
            if converged:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        new_policy = np.copy(policy)
        for s in range(n_states):
            if is_terminal[s]:
                continue  # nothing to decide once the episode has ended
            q_a_max = float('-inf')
            for a in P[s]:
                q_a = _expected_return(P, s, a, Vs, gamma)
                # Strict comparison: ties go to the first action encountered.
                if q_a > q_a_max:
                    q_a_max = q_a
                    new_policy[s] = a

        if np.all(new_policy == policy):
            print("Policy Converged at {} Iteration".format(i))
            break

        policy = new_policy

    # `policy` is always bound, even when maxIterations == 0.
    return policy,Vs
        

In [117]:
def evaluate_Policy(env,policy,gamma=0.9,repeats = 100):
    """Estimate the mean discounted return of `policy` from sampled episodes.

    Runs `repeats` episodes (with a tqdm progress bar). Each episode starts
    from `env.reset()` and follows `policy[state]` until `env.step` reports
    `done`; the reward of step t is weighted by `gamma ** t`.

    Args:
        env: Environment whose `step` returns `(state, reward, done, info)`.
        policy: Indexable mapping from state to action.
        gamma: Discount factor applied per step.
        repeats: Number of episodes to sample.

    Returns:
        The mean of the per-episode discounted returns.
    """
    episode_returns = []
    for _episode in tqdm(range(repeats)):
        state = env.reset()
        weight = 1.0
        discounted_return = 0.0
        finished = False
        while not finished:
            action = int(policy[state])
            state, reward, finished, _info = env.step(action)
            discounted_return += weight * reward
            # Harmless on the final step: the weight is reset for the next episode.
            weight *= gamma
        episode_returns.append(discounted_return)

    return np.mean(np.array(episode_returns))

In [135]:
# Solve the environment with policy iteration, then estimate the discounted
# return of the resulting policy from sampled episodes (default gamma/repeats).
policy,Vs = Policy_Iteration(env)
average_reward = evaluate_Policy(env,policy)
# Render the environment in the state left by the last evaluation episode.
env.render()
print("Average_Reward:{}".format(average_reward))
# The 4x4 reshape requires exactly 16 states, matching the 4x4 grid rendered above.
print("Value Tabels:\n {}".format(Vs.reshape(4,4)))
print("Poliy_Table: \n {}".format(policy.reshape(4,4)))

100%|██████████| 100/100 [00:00<00:00, 1081.23it/s]

Policy Converged at 1 Iteration
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Average_Reward:0.08058812892607178
Value Tabels:
 [[0.06889083 0.06141451 0.07440971 0.05580727]
 [0.09185447 0.         0.11220818 0.        ]
 [0.1454363  0.24749692 0.29961757 0.        ]
 [0.         0.37993587 0.63902013 0.        ]]
Poliy_Table: 
 [[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]



