In [17]:
import gym
import gym_sokoban
import time
import datetime as dt
import matplotlib.pyplot as plt
import math
from collections import deque
import random
import numpy as np
import pickle
import sys
import tqdm
import os
from collections import namedtuple,defaultdict,deque 

In [18]:
chapter = 0
level = 4


env_name = 'Sokoban-v1'
env = gym.make(env_name)
ACTION_LOOKUP = env.unwrapped.get_action_lookup()
env.unwrapped.set_level(chapter,level)
env.seed(0)
env.reset()
print("Created environment: {}".format(env_name))


#create Results/Chapter [chapter]/Level [level] folder
if not os.path.exists('Results/Chapter '+str(chapter)+'/Level '+str(level)):
    os.makedirs('Results/Chapter '+str(chapter)+'/Level '+str(level))

Created environment: Sokoban-v1


In [8]:
env.render()

True

In [9]:
def map( x,  in_min,  in_max,  out_min,  out_max) :
  return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min;

def greedy_policy(V,s,eps_comp=1e-8):
    #.9 prob of greedy action
    #.1 prob of random action

    if s not in V:
        V[s] = np.zeros(env.action_space.n)
    r_choice = 1 -map(eps_comp,0,1,0.1,.6)
    if np.random.random() < r_choice:
        return np.random.choice(np.arange(env.action_space.n))
    else:
        max_val = np.max(V[s])
         #find all actions that have the max value and choose one at random
        max_actions = np.argwhere(V[s] == max_val).flatten()
        return np.random.choice(max_actions)

In [13]:
EVERY_VISIT_MC = False
EPISODES = 15000
V = {}
total_returns = {}
N = {}
pbar = tqdm.tqdm(range(EPISODES),)
finished = 0
max_reward = 0
max_actions = 300
finished_history = []
for episode in pbar:
    
    visited = []
    env.reset()
    state = env.unwrapped.serialize_state()
    done = False
    
    for t in range(max_actions):
        action_time = time.time()
        if done:
            break
        action = greedy_policy(V,state,eps_comp=episode/EPISODES)
        next_state, reward, done, info = env.step(action)
        
        if reward > max_reward:
            max_reward = reward
        
        if EVERY_VISIT_MC or (not EVERY_VISIT_MC and state not in visited):
            if not EVERY_VISIT_MC:
                visited.append((state,action))
            if state not in total_returns:
                total_returns[state] = np.zeros(env.action_space.n)
            if state not in N:
                N[state] = np.zeros(env.action_space.n)
            for _state,_action in visited:
                total_returns[_state][_action] += reward
                N[_state][_action] += 1
                V[_state][_action] = (total_returns[_state][_action] / N[_state][_action])
            
        if env.unwrapped.is_finished():
            finished += 1
            finished_history.append(visited)
            #V[state][action] = (total_returns[state][action] / N[state][action])
        state = env.unwrapped.serialize_state()
    pbar.set_description(f'Training MC, finished={finished};Max reward gained={round(max_reward,3)}')
    if episode % 1000 == 0:
        fname = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/MC_'+('every' if EVERY_VISIT_MC else 'first')+'_'+str(EPISODES)+'_episodes_temp.bin'
        with open(fname, 'wb') as handle:
            pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)
        replayf = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/MC_'+('every' if EVERY_VISIT_MC else 'first')+'_'+str(EPISODES)+'_replay.bin'
        with open(replayf, 'wb') as handle:
            pickle.dump(finished_history, handle, protocol=pickle.HIGHEST_PROTOCOL)


            
#save the value function
fname = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/MC_'+('every' if EVERY_VISIT_MC else 'first')+'_'+str(EPISODES)+'_episodes.bin'
with open(fname, 'wb') as handle:
    pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

  0%|          | 0/15000 [00:00<?, ?it/s]

Training MC, finished=0;Max reward gained=40.379: 100%|██████████| 15000/15000 [44:38<00:00,  5.60it/s] 


In [7]:
fname = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/MC_'+('every' if EVERY_VISIT_MC else 'first')+'_'+str(EPISODES)+'_episodes.bin'
with open(fname, 'wb') as handle:
    pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:

for s in V:
    argmax = np.argmax(V[s])
    max_val = np.max(V[s])
    s = s[26:40] +'\n'+ s[26+26:40+26]
    print("{} Action: {} Value: {}".format(s,ACTION_LOOKUP[argmax],max_val))

37001000080033
30080700730553 Action: extinguish Value: 9.59677786313387
37010000080033
30080700730553 Action: right Value: 9.355582050961262
37000000080033
30081700730553 Action: left Value: 10.198936821745532
37000000080033
30080100730553 Action: left Value: 11.458274961766524
37000000080033
30080010730553 Action: left Value: 10.4337530075946
37000000080033
30080001730553 Action: right Value: 9.513712007572993
37000000080033
30080000130553 Action: extinguish Value: 9.23229710209534
37000000180033
30080000030553 Action: extinguish Value: 9.481471698113193
37000000080033
30080001030553 Action: extinguish Value: 9.115280780252368
37000001080033
30080000030553 Action: extinguish Value: 10.25243421159957
37000000080033
30080010030553 Action: left Value: 10.674571932757445
37000010080033
30080000030553 Action: right Value: 8.218645411259377
37000100080033
30080000030553 Action: left Value: 9.406157918298796
37000000080033
30080100030553 Action: right Value: 8.825119944211917
37000000100033

In [24]:
#load the value function

fname = "D:\\2023-2024\\RNN\\Sokoban\\Results\\Chapter 0\\Level 4\\MultiProcessing\\CPU 0 MC_first_40000_episodes.bin"
with open(fname, 'rb') as handle:
    V = pickle.load(handle)['V']

In [20]:
def greedy_policy_pi(V,s):
    #.9 prob of greedy action
    #.1 prob of random action

    if s not in V:
        V[s] = np.zeros(env.action_space.n)
    r_choice = .2
    if np.random.random() < r_choice:
        return np.random.choice(np.arange(env.action_space.n))
    else:
        max_val = np.max(V[s])
         #find all actions that have the max value and choose one at random
        max_actions = np.argwhere(V[s] == max_val).flatten()
        return np.random.choice(max_actions)


In [25]:

for i in range(1000):
    time.sleep(.01)
    _ = env.reset()
    state = env.unwrapped.serialize_state()
    st_time = time.time()
    done = False
    t= 0
    last_reward = 0
    last_time = time.time()
    last_state = state
    rendered_frame = 0
    while True:
        if time.time() - st_time < .02:
            env.render()
            rendered_frame += 1
            time.sleep(1/144)
            if (time.time() - last_time) > 1:
                print(f'\rFPS: {rendered_frame}',end='')
                rendered_frame = 0
                last_time = time.time()
            continue
        st_time = time.time()
        action_time = time.time()
        action = greedy_policy_pi(V,state)
        
        state, reward, done, info = env.step(action)
        state = env.unwrapped.serialize_state()
        #print(f'{ACTION_LOOKUP[action]} state change: {last_state != state}',end=' ')
        last_state = state
        last_reward = reward
        t += 1
        if done or t > 300:
            env.render()
            break
        env.render()

KeyboardInterrupt: 

In [32]:
ACTION_LOOKUP

{0: 'right', 1: 'up', 2: 'down', 3: 'left', 4: 'extinguish'}

In [10]:
_ = env.reset()
state = env.unwrapped.serialize_state()


In [13]:

action = 1
next_state, reward, done, info = env.step(action)
env.render()
print("Action: {} Reward: {}".format(ACTION_LOOKUP[action],reward))

Action: up Reward: 7.8469999999999995


In [14]:
for v in V.values():
    print(np.round(v,2))

[13.06 13.11 13.14 15.45 13.19]
[13.13 13.78 13.78 16.39 13.75]
[13.92 14.94 14.92 17.77 15.  ]
[15.66 16.48 20.2  15.84  8.77]
[16.43 15.2  15.18 14.73  8.71]
[16.66 14.86 15.69 14.84  8.73]
[15.43 15.99 16.03 15.27 17.58]
[15.11 15.04 15.05 15.97 15.7 ]
[18.97 20.92 18.92 18.35  9.78]
[21.81 19.99 19.1  19.03  9.79]
[17.18 19.2  17.06 17.37  9.72]
[20.15 18.32 17.26 18.26  9.73]
[22.62 21.   20.95 19.99  9.78]
[22.21 21.9  21.85 21.15 23.5 ]
[23.86 22.78 23.79 22.15 14.76]
[23.24 23.21 23.25 22.77 24.43]
[16.62 16.99 18.1  16.68 16.99]
[16.15 16.28 16.12 16.94 16.12]
[17.76 17.09 18.93 17.71 17.75]
[17.02 16.19 16.21 16.42 16.18]
[17.08 14.02 14.88 14.51  8.71]
[18.59 19.38 18.57 18.59 14.55]
[21.31 21.23 21.2  21.13 23.14]
[19.09 18.65 18.65 19.1  20.15]
[19.22 18.45 18.93 17.65 14.5 ]
[22.49 20.82 21.08 19.9  14.65]
[19.81 18.24 17.   17.31 14.52]
[18.22 16.17 16.19 16.27 14.47]
[16.03 15.92 17.42 15.52 15.71]
[16.97 17.93 16.72 16.48 14.47]
[14.87 14.71 15.95 14.58 14.79]
[16.23 1

In [26]:

fname = "D:\\2023-2024\\RNN\\Sokoban\\Results\\Chapter 0\\Level 4\\MultiProcessing\\CPU 0 MC_first_40000_replay.bin"
with open(fname, 'rb') as handle:
    finished_history = pickle.load(handle)

In [27]:

for replay_eps in finished_history:
    action_list = {}
    for state,action in replay_eps:
        action_list[state] = action
    time.sleep(1.01)
    _ = env.reset()
    state = env.unwrapped.serialize_state()
    st_time = time.time()
    done = False
    t= 0
    last_reward = 0
    last_time = time.time()
    last_state = state
    rendered_frame = 0
    while True:
        if time.time() - st_time < .1:
            env.render()
            rendered_frame += 1
            time.sleep(1/144)
            if (time.time() - last_time) > 1:
                print(f'\rFPS: {rendered_frame}',end='')
                rendered_frame = 0
                last_time = time.time()
            continue
        st_time = time.time()
        action_time = time.time()
        action = action_list[state]
        
        state, reward, done, info = env.step(action)
        state = env.unwrapped.serialize_state()
        #print(f'{ACTION_LOOKUP[action]} state change: {last_state != state}',end=' ')
        last_state = state
        last_reward = reward
        t += 1
        if done or t > 150:
            env.render()
            break
        env.render()

FPS: 18