In [1]:
import sys
sys.path.append("../lib/myenv")
from gridworld import gridworld
import pygame
import numpy as np 
import random
import matplotlib.pyplot as plt 


# sample from categorical distribution
def sample_categorical(probabilities):
    return random.choices(range(len(probabilities)), probabilities)[0]

# gridworld dimension
dim=7

# gamma discounting factor 
gamma=1

#env variable 
gw=gridworld(dim)


# The agent is placed at (0,0) and value function are initiliazed to a zero array
gw.reset()


# State value (V) is an array of dimension nxn where n is the gridworld size
V=np.random.rand(dim,dim)
# V=np.zeros((dim,dim))



In [2]:
################ policy  ########################################

def policy(dim):
    """
    Initial policy with 1/4 chance for each action
    Input: dimension of gridworld
    output: random uniform policy
    """
    pi={}
    for i in range(dim):
        for j in range(dim):
            pi[(i,j)]=[0.25]*4

    return pi

In [3]:
# Initial policy

# The action mapping for human readibility
# 0:right
# 1:Left
# 2:UP
# 3:Down 

pi=policy(dim)
sliced_policy = dict(list(pi.items())[:5])
sliced_policy


{(0, 0): [0.25, 0.25, 0.25, 0.25],
 (0, 1): [0.25, 0.25, 0.25, 0.25],
 (0, 2): [0.25, 0.25, 0.25, 0.25],
 (0, 3): [0.25, 0.25, 0.25, 0.25],
 (0, 4): [0.25, 0.25, 0.25, 0.25]}

### Policy evaluation using montecarlo first visit


#### Estimating state value V(s)

In [4]:

def episode(pi,dim,log=True):
    # reset environement
    gw.reset()
    # Init observation
    o=(0,0,dim-1,dim-1)
    # track state termination
    terminated=False
    # list of rewards
    rewards=[]
    # State/action/reward list
    sar=[]
    #Initial reward
    r=-1
    # episode length
    l=0
    while True :
        l+=1
        # sample an action 
        a=sample_categorical(pi[o[:2]])
        # store the action/state/reward in a list
        sar.append((o[:2],a,r))
        if terminated:
            break
        o,r,terminated,_,_,=gw.step(a)
        rewards.append(r)

    if log:
        print("total rewards: ", np.sum(rewards))
        # print("state action reward: ",sar)
        print("length of episode: ",l)
    return sar

In [5]:
episode(pi,dim=4)

total rewards:  -499
length of episode:  501


[((0, 0), 2, -1),
 ((0, 0), 3, -1),
 ((0, 1), 1, -1),
 ((0, 1), 2, -1),
 ((0, 0), 1, -1),
 ((0, 0), 1, -1),
 ((0, 0), 3, -1),
 ((0, 1), 2, -1),
 ((0, 0), 1, -1),
 ((0, 0), 0, -1),
 ((1, 0), 1, -1),
 ((0, 0), 3, -1),
 ((0, 1), 2, -1),
 ((0, 0), 2, -1),
 ((0, 0), 2, -1),
 ((0, 0), 0, -1),
 ((1, 0), 3, -1),
 ((1, 1), 1, -1),
 ((0, 1), 2, -1),
 ((0, 0), 2, -1),
 ((0, 0), 2, -1),
 ((0, 0), 3, -1),
 ((0, 1), 3, -1),
 ((0, 2), 1, -1),
 ((0, 2), 3, -1),
 ((0, 3), 1, -1),
 ((0, 3), 2, -1),
 ((0, 2), 0, -1),
 ((1, 2), 1, -1),
 ((0, 2), 2, -1),
 ((0, 1), 1, -1),
 ((0, 1), 0, -1),
 ((1, 1), 2, -1),
 ((1, 0), 3, -1),
 ((1, 1), 1, -1),
 ((0, 1), 1, -1),
 ((0, 1), 3, -1),
 ((0, 2), 0, -1),
 ((1, 2), 2, -1),
 ((1, 1), 2, -1),
 ((1, 0), 2, -1),
 ((1, 0), 3, -1),
 ((1, 1), 0, -1),
 ((2, 1), 3, -1),
 ((2, 2), 1, -1),
 ((1, 2), 2, -1),
 ((1, 1), 3, -1),
 ((1, 2), 0, -1),
 ((2, 2), 2, -1),
 ((2, 1), 2, -1),
 ((2, 0), 2, -1),
 ((2, 0), 2, -1),
 ((2, 0), 0, -1),
 ((3, 0), 3, -1),
 ((3, 1), 2, -1),
 ((3, 0), 

In [6]:
#Initialization of the first visit MC algo

# Gain initialization
G=0
# Gain for each state state dictionnary
dict_visit={(i,j):[] for 
            i in range(dim) for j in range(dim)}
print(dict_visit)
# number of episodes for evaluation The state value V 
num_episodes=10

{(0, 0): [], (0, 1): [], (0, 2): [], (0, 3): [], (0, 4): [], (0, 5): [], (0, 6): [], (1, 0): [], (1, 1): [], (1, 2): [], (1, 3): [], (1, 4): [], (1, 5): [], (1, 6): [], (2, 0): [], (2, 1): [], (2, 2): [], (2, 3): [], (2, 4): [], (2, 5): [], (2, 6): [], (3, 0): [], (3, 1): [], (3, 2): [], (3, 3): [], (3, 4): [], (3, 5): [], (3, 6): [], (4, 0): [], (4, 1): [], (4, 2): [], (4, 3): [], (4, 4): [], (4, 5): [], (4, 6): [], (5, 0): [], (5, 1): [], (5, 2): [], (5, 3): [], (5, 4): [], (5, 5): [], (5, 6): [], (6, 0): [], (6, 1): [], (6, 2): [], (6, 3): [], (6, 4): [], (6, 5): [], (6, 6): []}


In [7]:

for _ in range(num_episodes):
    G=0
    #episode 
    sar=episode(pi,dim)
    # reverse the list sar list
    reversed_sar=list(reversed(sar))

    for i,e in enumerate(reversed_sar):
        s,a,r=e
        G=G+(gamma*r)
        Exist=sum([sar[0]==s for sar in reversed_sar[i+1:]])
        if Exist==0 :
            dict_visit[s].append(G)
                                

total rewards:  -40
length of episode:  42
total rewards:  -240
length of episode:  242
total rewards:  -84
length of episode:  86
total rewards:  -508
length of episode:  510
total rewards:  -153
length of episode:  155
total rewards:  -88
length of episode:  90
total rewards:  -369
length of episode:  371
total rewards:  -136
length of episode:  138
total rewards:  -224
length of episode:  226
total rewards:  -253
length of episode:  255


In [8]:
print("list visit: ",dict_visit)
for k,v in dict_visit.items():
    if len(v)!=0:
        V[k]=np.mean(v)
    else:
        V[k]=-1000

print("###############################################")
print("The states values estimated using MC First visit:\n ",V)

list visit:  {(0, 0): [-41, -241, -85, -509, -154, -89, -370, -137, -225, -254], (0, 1): [-22, -238, -69, -507, -153, -87, -308, -128, -223, -247], (0, 2): [-21, -234, -67, -495, -105, -86, -315, -84, -220, -246], (0, 3): [-20, -121, -65, -315, -106, -78, -316, -64, -219, -235], (0, 4): [-19, -133, -55, -295, -129, -75, -321, -212, -233], (0, 5): [-166, -54, -309, -128, -291, -92], (0, 6): [-165, -53, -308, -114, -260, -87], (1, 0): [-37, -240, -84, -483, -147, -54, -367, -135, -197, -253], (1, 1): [-36, -239, -43, -501, -145, -68, -305, -134, -222, -248], (1, 2): [-26, -233, -61, -446, -134, -69, -301, -117, -215, -245], (1, 3): [-120, -64, -441, -133, -34, -317, -216, -236], (1, 4): [-18, -134, -50, -313, -132, -73, -322, -213, -232], (1, 5): [-167, -51, -352, -123, -283, -89], (1, 6): [-160, -375, -122, -13, -282, -85], (2, 0): [-31, -34, -80, -485, -55, -66, -366, -122, -196, -252], (2, 1): [-30, -229, -40, -491, -56, -67, -365, -119, -194, -249], (2, 2): [-27, -232, -39, -490, -57

#### Estimating action-state value Q(s,a)

In [2]:
#Initialization of the first visit MC algo

# Gain initialization
G=0
# Gain for each state state dictionnary
dict_visit_action={(i,j,a):[np.random.rand()] for 
            i in range(dim) for j in range(dim) for a in range(4)}
# print(dict_visit_action)


NameError: name 'dim' is not defined

In [10]:
# number of episodes for evaluation The state value V 

def Policy_Evaluation(pi,dim=4,num_episodes=100):
    Q={}
    for _ in range(num_episodes):
        G=0
        #episode 
        sar=episode(pi,dim,log=False)
        # reverse the list sar list
        reversed_sar=list(reversed(sar))

        for i,e in enumerate(reversed_sar):
            s,a,r=e
            G=G+(gamma*r)
            Exist_state_action=sum([(sar[0]==s and sar[1]==a) for sar in reversed_sar[i+1:]])
            if Exist_state_action==0 :
                dict_visit_action[s+(a,)].append(G)

    for k,v in dict_visit_action.items():
            Q[k]=np.mean(v)
      
    return Q
   

In [11]:
action_human={0:"right",1:"left",2:"UP",3:"DOWN"}

In [12]:
# policy improvement 
def Policy_Improvement(pi,Q,log=False,epsilon=0.3):
    
    for i in range(dim):
        for j in range(dim):
            s=(i,j)
            a=np.argmax([Q[s+(a,)] for a in range(4)])
            # epsilon greedy selection 0 for armaxg, 1 for other rand action
            if log:
                print(f"In state {s} the action to take is {action_human[a]}")
            pi[s]=[1-epsilon if i == a else epsilon/3 for i in range(4)]
            
                
    return pi

In [13]:
# policy iteration for montecarlo 
for i in range(1,100):
    Q=Policy_Evaluation(pi,dim=4,num_episodes=1)
    pi=Policy_Improvement(pi,Q,epsilon=1/i)
print("###############################################")
print("The Action states values estimated using MC First visit:\n ",Q) 

###############################################
The Action states values estimated using MC First visit:
  {(0, 0, 0): -127.51406127834358, (0, 0, 1): -124.51589651930638, (0, 0, 2): -161.13017194408476, (0, 0, 3): -62.51534157485373, (0, 1, 0): -54.692861532653595, (0, 1, 1): -83.3396285597642, (0, 1, 2): -105.98451249010006, (0, 1, 3): -91.81358711757215, (0, 2, 0): -143.33249598336698, (0, 2, 1): -145.39781403249924, (0, 2, 2): -169.8425408184766, (0, 2, 3): -65.97852878463299, (0, 3, 0): -45.56373082724048, (0, 3, 1): -127.26357612508698, (0, 3, 2): -126.7314410450683, (0, 3, 3): -186.83917232223368, (0, 4, 0): -177.11150849933702, (0, 4, 1): -192.11226169447158, (0, 4, 2): -144.27313841690457, (0, 4, 3): -118.47550735289128, (0, 5, 0): -111.2592181714377, (0, 5, 1): -100.64812581568913, (0, 5, 2): -102.23917879222361, (0, 5, 3): -99.67034077723754, (0, 6, 0): -99.18099524452724, (0, 6, 1): -95.71259101790905, (0, 6, 2): -95.39225127526674, (0, 6, 3): -82.31614621449943, (1, 0, 0):

In [14]:
# The action mapping for human readibility
# 0:right
# 1:Left
# 2:UP
# 3:Down 
pi

{(0, 0): [0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673,
  0.98989898989899],
 (0, 1): [0.98989898989899,
  0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673],
 (0, 2): [0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673,
  0.98989898989899],
 (0, 3): [0.98989898989899,
  0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673],
 (0, 4): [0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673,
  0.98989898989899],
 (0, 5): [0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673,
  0.98989898989899],
 (0, 6): [0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673,
  0.98989898989899],
 (1, 0): [0.98989898989899,
  0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673],
 (1, 1): [0.0033670033670033673,
  0.0033670033670033673,
  0.0033670033670033673,
  0.98989898989899],
 (1, 2): [0.0033670033670033673,
  0.0033670033670033673,
  0.00

In [15]:
# The loop below will test the policy iteration algorithm
# start position
V=np.zeros((dim,dim))
gw.reset()
o=(0,0,dim-1,dim-1)
terminated=False
rewards=[]
stop=0
while (not terminated) and (stop!=100) :
    stop=stop+1
    a=sample_categorical(pi[o[:2]])
    o,r,terminated,_,_,=gw.step(a)
    rewards.append(r)
    gw.render(V,mode='human')
    print("Action: ",action_human[a])
    print("state: ",o[:2])
    print("reward is: ",r)

print("total rewards: ", np.sum(rewards))

pygame.quit()

Action:  DOWN
state:  (0, 1)
reward is:  -1
Action:  right
state:  (1, 1)
reward is:  -1
Action:  DOWN
state:  (1, 2)
reward is:  -1
Action:  DOWN
state:  (1, 3)
reward is:  -1
Action:  right
state:  (2, 3)
reward is:  -1
Action:  DOWN
state:  (2, 4)
reward is:  -1
Action:  right
state:  (3, 4)
reward is:  -1
Action:  right
state:  (4, 4)
reward is:  -1
Action:  right
state:  (5, 4)
reward is:  -1
Action:  UP
state:  (5, 3)
reward is:  -1
Action:  right
state:  (6, 3)
reward is:  -1
Action:  DOWN
state:  (6, 4)
reward is:  -1
Action:  DOWN
state:  (6, 5)
reward is:  -1
Action:  DOWN
state:  (6, 6)
reward is:  0
total rewards:  -13
