In [10]:
import sys
import pygame
sys.path.append("../lib/myenv")
from gridworld import gridworld
import numpy as np 
import random
import matplotlib.pyplot as plt 


# sample from categorical distribution
def sample_categorical(probabilities):
    return random.choices(range(len(probabilities)), probabilities)[0]

# gridworld dimension
dim=10
# gamma discounting factor 
gamma=1

# delta for policy evaluation
delta=0

#env variable 
gw=gridworld(dim)

# The agent is placed at (0,0) and value function are initiliazed to a zero array
gw.reset()

# State value (V) is an array of dimension nxn where n is the gridworld size
V=np.random.rand(dim,dim)

#Transition matrix
Pss=np.ones((dim,dim))

In [11]:

def policy(dim):
    """
    Initial policy with 1/4 chance for each action
    Input: dimension of gridworld
    output: random uniform policy
    """
    pi={}
    for i in range(dim):
        for j in range(dim):
            pi[(i,j)]=[0.25]*4

    return pi

In [12]:
# Initial policy

# The action mapping for human readibility
# 0:right
# 1:Left
# 2:UP
# 3:Down 

pi=policy(dim)
sliced_policy = dict(list(pi.items())[:5])
sliced_policy


{(0, 0): [0.25, 0.25, 0.25, 0.25],
 (0, 1): [0.25, 0.25, 0.25, 0.25],
 (0, 2): [0.25, 0.25, 0.25, 0.25],
 (0, 3): [0.25, 0.25, 0.25, 0.25],
 (0, 4): [0.25, 0.25, 0.25, 0.25]}

In [13]:
def policy_evaluation(pi,V,iteration_num=1):
    delta=0
    #looping over all states
    for _ in range(iteration_num):
        Vc=V.copy()
        for i in range(dim):
            for j in range(dim):
                gw.reset()
                listupdate=[]
                gw.agent=(i,j)

                if (i,j)!=(0,0):
                   for a in range(4):
                        o,r,_,_,_,=gw.step(a)
                        k,l=o[:2]
                        listupdate.append(pi[(i,j)][a]*(r+(gamma*Vc[k,l])))
                    
                   #update Vk+1
                   Vc[i,j]=np.mean(listupdate)
                #calculate delta
                # print("state: ",(i,j))
                # print("V",V[i,j])
                # print("Vc",Vc[i,j])
                # print("Vc-V",np.abs(Vc[i,j]-V[i,j]))
                # delta=max(delta,np.abs(V[i,j]-Vc[i,j]))
                # print("delta ",delta)

        
        V=Vc
        # if delta <0.1:
        #     break

    return V

In [14]:
def policy_improvement(Vpi):
# policy iteration 
    for i in range(dim):
                for j in range(dim):
                    listofall=[-100]*4
                    r=-1
                    if (i,j)==gw.target:
                        r=0
                    
                    # listofall=[Vpi[i+1,j],Vpi[i-1,j],Vpi[i,j-1],Vpi[i,j+1]]
                    if i+1<dim:
                        listofall[0]=pi[(i,j)][0]*(r+(gamma*Vpi[i+1,j]))
                    if j+1<dim:
                        listofall[3]=pi[(i,j)][3]*(r+(gamma*Vpi[i,j+1])) 
                    if i-1>dim:
                        listofall[1]=pi[(i,j)][1]*(r+(gamma*Vpi[i-1,j]))
                    if j-1>dim:
                        listofall[2]=pi[(i,j)][1]*(r+(gamma*Vpi[i,j-1]))


                    a=np.argmax(listofall)

                    pi[(i,j)]=[1 if i == a else 0 for i in range(4)]

    return pi

In [15]:
# policy iteration algorithm 
for i in range(1,2):
    print("########### Iteration number: ",i)   
    Vpi=policy_evaluation(pi,V,iteration_num=2)
    pi=policy_improvement(Vpi)

print(np.array(Vpi))

print(pi)


########### Iteration number:  1
[[ 0.27206433 -0.29477457 -0.30069313 -0.29092645 -0.28340458 -0.29772066
  -0.288203   -0.2865375  -0.30227605 -0.30329877]
 [-0.28136167 -0.29173654 -0.30555476 -0.29925567 -0.29219702 -0.30692108
  -0.29595672 -0.29542399 -0.31067743 -0.30638982]
 [-0.28251802 -0.27726255 -0.2958525  -0.30257314 -0.29500354 -0.3054913
  -0.29944059 -0.29367873 -0.30867002 -0.29966699]
 [-0.27379823 -0.2668405  -0.29071139 -0.29567733 -0.30584791 -0.30307282
  -0.29461377 -0.2933455  -0.30731303 -0.2925794 ]
 [-0.28620049 -0.27991376 -0.29638292 -0.28930813 -0.30929337 -0.29924153
  -0.28651065 -0.29369234 -0.30555874 -0.29117044]
 [-0.28554948 -0.28504024 -0.29582787 -0.29273615 -0.30011107 -0.3012994
  -0.29044908 -0.29905196 -0.30518584 -0.29628492]
 [-0.26810806 -0.27162593 -0.28638407 -0.30065837 -0.29319354 -0.29095002
  -0.29032698 -0.2975778  -0.30699833 -0.29632998]
 [-0.27352337 -0.28042106 -0.29288204 -0.30925875 -0.29806013 -0.29142172
  -0.29292516 -0.304

In [16]:
# The loop below will test the policy iteration algorithm
# start position
gw.reset()
o=(0,0,dim-1,dim-1)
terminated=False
rewards=[]
while not terminated:

    a=sample_categorical(pi[o[:2]])
    print(a)
    o,r,terminated,_,_,=gw.step(a)
    rewards.append(r)
    gw.render(np.round(Vpi, 1),mode='human')
    print(o[:2])
    print("reward is: ",r)
    print("terminated:",terminated)

print("total rewards: ", np.sum(rewards))

print("The end")
pygame.quit()

0
(1, 0)
reward is:  -1
terminated: False
0
(2, 0)
reward is:  -1
terminated: False
0
(3, 0)
reward is:  -1
terminated: False
3
(3, 1)
reward is:  -1
terminated: False
0
(4, 1)
reward is:  -1
terminated: False
0
(5, 1)
reward is:  -1
terminated: False
0
(6, 1)
reward is:  -1
terminated: False
0
(7, 1)
reward is:  -1
terminated: False
0
(8, 1)
reward is:  -1
terminated: False
3
(8, 2)
reward is:  -1
terminated: False
3
(8, 3)
reward is:  -1
terminated: False
3
(8, 4)
reward is:  -1
terminated: False
3
(8, 5)
reward is:  -1
terminated: False
3
(8, 6)
reward is:  -1
terminated: False
3
(8, 7)
reward is:  -1
terminated: False
3
(8, 8)
reward is:  -1
terminated: False
3
(8, 9)
reward is:  -1
terminated: False
0
(9, 9)
reward is:  0
terminated: True
total rewards:  -17
The end
