In [1]:
import gym
import gym_sokoban
import time
import datetime as dt
import matplotlib.pyplot as plt
import math
from collections import deque
import random
import numpy as np
import pickle
import sys
import tqdm
import os
from collections import namedtuple,defaultdict,deque 

In [2]:
chapter = 0
level = 2


env_name = 'Sokoban-v1'
env = gym.make(env_name)
ACTION_LOOKUP = env.unwrapped.get_action_lookup()
env.unwrapped.set_level(chapter,level)
env.seed(0)
env.reset()
print("Created environment: {}".format(env_name))


#create Results/Chapter [chapter]/Level [level] folder
if not os.path.exists('Results/Chapter '+str(chapter)+'/Level '+str(level)):
    os.makedirs('Results/Chapter '+str(chapter)+'/Level '+str(level))

Created environment: Sokoban-v1


In [7]:

time.sleep(1/180)
env.render()

True

In [3]:
def greedy_policy(V,s):
    #.9 prob of greedy action
    #.1 prob of random action

    if s not in V:
        V[s] = np.zeros(env.action_space.n)
    r_choice = .4
    if np.random.random() < r_choice:
        return np.random.choice(np.arange(env.action_space.n))
    else:
        max_val = np.max(V[s])
         #find all actions that have the max value and choose one at random
        max_actions = np.argwhere(V[s] == max_val).flatten()
        return np.random.choice(max_actions)

In [4]:
EVERY_VISIT_MC = False
EPISODES = 1000

V = {}
total_returns = {}
N = {}
for episode in tqdm.tqdm(range(EPISODES)):
    
    visited = []
    env.reset()
    state = env.unwrapped.serialize_state()
    done = False
    for t in range(40):
        action_time = time.time()
        if done:
            break
        action = greedy_policy(V,state)
        next_state, reward, done, info = env.step(action)
        next_state = env.unwrapped.serialize_state()
        if EVERY_VISIT_MC or (not EVERY_VISIT_MC and state not in visited):
            if not EVERY_VISIT_MC:
                visited.append(state)
            if state not in total_returns:
                total_returns[state] = np.zeros(env.action_space.n)
            for _state in total_returns:
                total_returns[_state][action] += reward
            if state not in N:
                N[state] = np.zeros(env.action_space.n)
            
            N[state][action] += 1
            V[state][action] = (total_returns[state][action] / N[state][action])
        state = next_state

            
#save the value function
fname = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/MC_'+('every' if EVERY_VISIT_MC else 'first')+'_'+str(EPISODES)+'_episodes.bin'
with open(fname, 'wb') as handle:
    pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:10<00:00, 94.07it/s]


In [None]:
N.values()

dict_values([array([ 6123., 16563.,  5990.,  4762.]), array([ 6092., 16476.,  5958.,  4737.]), array([ 4319., 11676.,  4219.,  3356.]), array([ 5489., 14847.,  5368.,  4268.]), array([ 5294., 14325.,  5182.,  4119.]), array([3604., 9749., 3524., 2802.]), array([2254., 6098., 2206., 1750.]), array([1311., 3552., 1288., 1020.]), array([1170., 3164., 1147.,  908.]), array([ 994., 2692.,  973.,  772.]), array([340., 921., 333., 265.]), array([194., 527., 190., 152.]), array([369., 999., 361., 287.]), array([ 391., 1057.,  390.,  303.]), array([ 81., 219.,  80.,  63.]), array([21., 56., 22., 18.]), array([164., 443., 159., 127.]), array([ 7., 13.,  9.,  4.]), array([0., 6., 2., 4.]), array([2., 0., 1., 0.]), array([ 44., 119.,  43.,  34.]), array([ 90., 241.,  85.,  68.]), array([ 47., 118.,  44.,  35.]), array([16., 40., 15., 13.]), array([ 9., 21.,  8.,  7.]), array([10., 32., 13., 14.]), array([0., 8., 4., 2.]), array([0., 3., 0., 7.]), array([30., 80., 30., 26.]), array([15., 37., 13., 

In [None]:
V

{'3333333333333300000000000030000030000003000000000000300300004033030000000000003133000000050300000000000033333333333333000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000': array([10.22908297, 10.22936461, 10.22531886, 10.22954389]),
 '3333333333333300000000000030000030000003000000000000310300004033030000000000003033000000050300000000000033333333333333000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [5]:

for s in V:
    argmax = np.argmax(V[s])
    max_val = np.max(V[s])
    s = s[26:40] +'\n'+ s[26+26:40+26]
    print("{} Action: {} Value: {}".format(s,ACTION_LOOKUP[argmax],max_val))

30010004000053
33333333333333 Action: right Value: 27.947453879940827
30001004000053
33333333333333 Action: right Value: 27.784729257641317
30000104000053
33333333333333 Action: right Value: 28.07368088235233
30000014000053
33333333333333 Action: right Value: 29.370599999999357
30100004000053
33333333333333 Action: right Value: 155.08508943089097
31000004000053
33333333333333 Action: right Value: 1365.9859999999833
30000001400053
33333333333333 Action: right Value: 29.49174343122037
30000000140053
33333333333333 Action: right Value: 28.89364393939331
30000001040053
33333333333333 Action: right Value: 219.33004597700668
30000010040053
33333333333333 Action: right Value: 1329.2801538461329
30000010400053
33333333333333 Action: right Value: 259.45130555555016
30000100400053
33333333333333 Action: right Value: 1119.1624999999813
30001000400053
33333333333333 Action: right Value: 859.4934000000032
30000000014053
33333333333333 Action: right Value: 29.151464012250518
30000000001453
333333333

In [6]:
def greedy_policy_pi(V,s):
    #.9 prob of greedy action
    #.1 prob of random action

    if s not in V:
        V[s] = np.zeros(env.action_space.n)
    r_choice = .3
    if np.random.random() < r_choice:
        return np.random.choice(np.arange(env.action_space.n))
    else:
        max_val = np.max(V[s])
         #find all actions that have the max value and choose one at random
        max_actions = np.argwhere(V[s] == max_val).flatten()
        return np.random.choice(max_actions)


In [7]:
_ = env.reset()
state = env.unwrapped.serialize_state()


In [9]:

for i in range(100):
    time.sleep(1)
    _ = env.reset()
    state = env.unwrapped.serialize_state()
    st_time = time.time()
    done = False
    for t in range(200):
        if time.time() - st_time < .2:
            env.render()
            time.sleep(1/60)
            continue
        st_time = time.time()
        action_time = time.time()
        action = greedy_policy_pi(V,state)
        next_state, reward, done, info = env.step(action)
        next_state = env.unwrapped.serialize_state()
        print("\rAction: {} Reward: {}".format(ACTION_LOOKUP[action],reward),end="")
        if done:
            continue
        env.render()

Action: down Reward: 1.88158999999999873

KeyboardInterrupt: 

In [None]:

action = 0
next_state, reward, done, info = env.step(action)
env.render()
print("Action: {} Reward: {}".format(ACTION_LOOKUP[action],reward))

Action: right Reward: 73.60300000000001
