In [2]:
import sys
import pygame
sys.path.append("../lib/myenv")
from gridworld import gridworld
import numpy as np 
import random
import matplotlib.pyplot as plt 


# sample from categorical distribution
def sample_categorical(probabilities):
    return random.choices(range(len(probabilities)), probabilities)[0]

# gridworld dimension
dim=10
# gamma discounting factor 
gamma=1

# delta for policy evaluation
delta=0

#env variable 
gw=gridworld(dim)

# The agent is placed at (0,0) and value function are initiliazed to a zero array
gw.reset()

# State value (V) is an array of dimension nxn where n is the gridworld size
V=np.random.rand(dim,dim)

#Transition matrix
Pss=np.ones((dim,dim))

In [3]:

def policy(dim):
    """
    Initial policy with 1/4 chance for each action
    Input: dimension of gridworld
    output: random uniform policy
    """
    pi={}
    for i in range(dim):
        for j in range(dim):
            pi[(i,j)]=[0.25]*4

    return pi

In [9]:
# Initial policy

# The action mapping for human readibility
# 0:right
# 1:Left
# 2:UP
# 3:Down 

pi=policy(dim)
sliced_policy = dict(list(pi.items())[:5])
sliced_policy


{(0, 0): [0.25, 0.25, 0.25, 0.25],
 (0, 1): [0.25, 0.25, 0.25, 0.25],
 (0, 2): [0.25, 0.25, 0.25, 0.25],
 (0, 3): [0.25, 0.25, 0.25, 0.25],
 (0, 4): [0.25, 0.25, 0.25, 0.25]}

In [5]:
def policy_evaluation(pi,V,iteration_num=1):
    delta=0
    #looping over all states
    for _ in range(iteration_num):
        Vc=V.copy()
        for i in range(dim):
            for j in range(dim):
                gw.reset()
                listupdate=[]
                gw.agent=(i,j)

                if (i,j)!=(0,0):
                   for a in range(4):
                        o,r,_,_,_,=gw.step(a)
                        k,l=o[:2]
                        listupdate.append(pi[(i,j)][a]*(r+(gamma*Vc[k,l])))
                    
                   #update Vk+1
                   Vc[i,j]=np.mean(listupdate)
                #calculate delta
                # print("state: ",(i,j))
                # print("V",V[i,j])
                # print("Vc",Vc[i,j])
                # print("Vc-V",np.abs(Vc[i,j]-V[i,j]))
                # delta=max(delta,np.abs(V[i,j]-Vc[i,j]))
                # print("delta ",delta)

        
        V=Vc
        # if delta <0.1:
        #     break

    return V

In [6]:
def policy_improvement(Vpi):
# policy iteration 
    for i in range(dim):
                for j in range(dim):
                    listofall=[-100]*4
                    r=-1
                    if (i,j)==gw.target:
                        r=0
                    
                    # listofall=[Vpi[i+1,j],Vpi[i-1,j],Vpi[i,j-1],Vpi[i,j+1]]
                    if i+1<dim:
                        listofall[0]=pi[(i,j)][0]*(r+(gamma*Vpi[i+1,j]))
                    if j+1<dim:
                        listofall[3]=pi[(i,j)][3]*(r+(gamma*Vpi[i,j+1])) 
                    if i-1>dim:
                        listofall[1]=pi[(i,j)][1]*(r+(gamma*Vpi[i-1,j]))
                    if j-1>dim:
                        listofall[2]=pi[(i,j)][1]*(r+(gamma*Vpi[i,j-1]))


                    a=np.argmax(listofall)

                    pi[(i,j)]=[1 if i == a else 0 for i in range(4)]

    return pi

In [7]:
# policy iteration algorithm 
for i in range(1,2):
    print("########### Iteration number: ",i)   
    Vpi=policy_evaluation(pi,V,iteration_num=2)
    pi=policy_improvement(Vpi)

print(np.array(Vpi))

print(pi)


########### Iteration number:  1
[[ 0.87910427 -1.33892215 -1.57983001 -1.72380082 -1.95321862 -1.88870627
  -1.76857352 -1.91548546 -1.83010733 -1.81594353]
 [-1.16692839 -1.44192906 -1.93203618 -2.04895334 -2.21689326 -2.17111285
  -2.09556968 -2.14647589 -2.16708181 -2.11354116]
 [-1.41136482 -1.65764338 -1.97630961 -2.08995988 -2.09298425 -2.09374949
  -2.05719202 -1.92292051 -2.2143732  -2.01693958]
 [-1.44185814 -1.50719522 -1.7526857  -2.01095664 -2.0646005  -2.02535232
  -2.06031137 -1.96194832 -2.22249245 -2.03187636]
 [-1.53090027 -1.56577912 -1.82363828 -2.02241581 -2.14241401 -2.04406139
  -2.22179289 -2.15745873 -2.18459051 -2.10426593]
 [-1.54374329 -1.49132038 -1.86621349 -2.07097835 -2.29316298 -2.19372172
  -2.40811524 -2.37221743 -2.2712458  -2.15953089]
 [-1.43811815 -1.33305849 -1.80802239 -2.10043121 -2.20410922 -2.18166003
  -2.32502538 -2.27500612 -2.32341188 -2.26978335]
 [-1.52049275 -1.30578927 -1.774745   -2.18471687 -2.15600851 -2.26058862
  -2.22689041 -2.0

In [8]:
# The loop below will test the policy iteration algorithm
# start position
gw.reset()
o=(0,0,dim-1,dim-1)
terminated=False
rewards=[]
while not terminated:

    a=sample_categorical(pi[o[:2]])
    print(a)
    o,r,terminated,_,_,=gw.step(a)
    rewards.append(r)
    gw.render(np.round(Vpi, 1),mode='human')
    print(o[:2])
    print("reward is: ",r)
    print("terminated:",terminated)

print("total rewards: ", np.sum(rewards))

print("The end")
pygame.quit()

0
(1, 0)
reward is:  -1
terminated: False
0
(2, 0)
reward is:  -1
terminated: False
0
(3, 0)
reward is:  -1
terminated: False
3
(3, 1)
reward is:  -1
terminated: False
0
(4, 1)
reward is:  -1
terminated: False
0
(5, 1)
reward is:  -1
terminated: False
0
(6, 1)
reward is:  -1
terminated: False
0
(7, 1)
reward is:  -1
terminated: False
3
(7, 2)
reward is:  -1
terminated: False
3
(7, 3)
reward is:  -1
terminated: False
3
(7, 4)
reward is:  -1
terminated: False
3
(7, 5)
reward is:  -1
terminated: False
3
(7, 6)
reward is:  -1
terminated: False
3
(7, 7)
reward is:  -1
terminated: False
3
(7, 8)
reward is:  -1
terminated: False
0
(8, 8)
reward is:  -1
terminated: False
3
(8, 9)
reward is:  -1
terminated: False
0
(9, 9)
reward is:  0
terminated: True
total rewards:  -17
The end
