In [1]:
import gym
import gym_sokoban
import time
import datetime as dt
import matplotlib.pyplot as plt
import math
from collections import deque
import random
import numpy as np
import pickle
import sys
import tqdm
import os
from collections import namedtuple,defaultdict,deque 

In [2]:
chapter = 0
level = 5


env_name = 'Sokoban-v1'
env = gym.make(env_name)
ACTION_LOOKUP = env.unwrapped.get_action_lookup()
env.unwrapped.set_level(chapter,level)
env.seed(0)
env.reset()
print("Created environment: {}".format(env_name))


#create Results/Chapter [chapter]/Level [level] folder
if not os.path.exists('Results/Chapter '+str(chapter)+'/Level '+str(level)):
    os.makedirs('Results/Chapter '+str(chapter)+'/Level '+str(level))

Created environment: Sokoban-v1


In [3]:
env.render()

True

In [11]:
def map( x,  in_min,  in_max,  out_min,  out_max) :
  return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min;

def greedy_policy(Q,s,eps_comp=1e-8):
    #.9 prob of greedy action
    #.1 prob of random action
    if s not in Q:
        Q[s] = np.random.rand(env.action_space.n)
    r_choice = 1 -map(eps_comp,0,1,0.1,.6)
    if np.random.random() < r_choice:
        return np.random.choice(np.arange(env.action_space.n))
    else:
        max_val = np.max(Q[s])
         #find all actions that have the max value and choose one at random
        max_actions = np.argwhere(Q[s] == max_val).flatten()
        return np.random.choice(max_actions)

In [14]:
alpha = 0.99
gamma = 0.8
num_episodes = 1000
num_timesteps = 50
Q = {}
pbar = tqdm.tqdm(range(num_episodes),)
finished = 0
max_reward = 0
for episode in pbar:
    state = env.reset()
    last_reward = 0
    for step in range(num_timesteps):
        action = greedy_policy(Q,state,episode/num_episodes)
        next_state, reward, done, info = env.step(action)
        if next_state not in Q:
            Q[next_state] = np.random.rand(env.action_space.n)
        a_ = greedy_policy(Q,next_state,episode/num_episodes)
        Q[state][action] += alpha * ((reward-last_reward) + gamma * Q[next_state][a_] - Q[state][action])
        state = next_state
        last_reward = reward
        if reward > max_reward:
            max_reward = reward
        
        if done:
            if step < num_timesteps-1:
                finished +=1
            break
    pbar.set_description(f'Training SARSA, finished={finished};Max reward gained={round(max_reward,3)}')
    if episode % 1000 == 0:
        fname = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/SARSA_'+str(num_episodes)+'_episodes_temp.bin'
        with open(fname, 'wb') as handle:
            pickle.dump(Q, handle, protocol=pickle.HIGHEST_PROTOCOL)


            
#save the value function
fname = 'Results/Chapter '+str(chapter)+'/Level '+str(level)+'/SARSA_'+str(num_episodes)+'_episodes.bin'
with open(fname, 'wb') as handle:
    pickle.dump(Q, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Training SARSA, finished=239;Max reward gained=41.969: 100%|██████████| 1000/1000 [00:28<00:00, 35.34it/s]


In [18]:
def greedy_policy_pi(V,s):
    #.9 prob of greedy action
    #.1 prob of random action

    if s not in V:
        V[s] = np.zeros(env.action_space.n)
    r_choice = .2
    if np.random.random() < r_choice:
        return np.random.choice(np.arange(env.action_space.n))
    else:
        max_val = np.max(V[s])
         #find all actions that have the max value and choose one at random
        max_actions = np.argwhere(V[s] == max_val).flatten()
        return np.random.choice(max_actions)

for i in range(1000):
    time.sleep(1)
    _ = env.reset()
    state = env.unwrapped.serialize_state()
    st_time = time.time()
    done = False
    t= 0
    last_reward = 0
    last_time = time.time()
    last_state = state
    rendered_frame = 0
    while True:
        if time.time() - st_time < .2:
            env.render()
            rendered_frame += 1
            time.sleep(1/64)
            if (time.time() - last_time) > 1:
                print(f'\rFPS: {rendered_frame}',end='')
                rendered_frame = 0
                last_time = time.time()
            continue
        st_time = time.time()
        action_time = time.time()
        action = greedy_policy_pi(Q,state)
        
        state, reward, done, info = env.step(action)
        state = env.unwrapped.serialize_state()
        #print(f'{ACTION_LOOKUP[action]} state change: {last_state != state}',end=' ')
        last_state = state
        last_reward = reward
        t += 1
        if done or t > 150:
            env.render()
            break
        env.render()

FPS: 16

Exception ignored in: <function Texture.__del__ at 0x000001889C1473A0>
Traceback (most recent call last):
  File "d:\2023-2024\RNN\Sokoban\SokobanEnv\lib\site-packages\pyglet\image\__init__.py", line 1225, in __del__
    self._context.delete_texture(self.id)
  File "d:\2023-2024\RNN\Sokoban\SokobanEnv\lib\site-packages\pyglet\gl\base.py", line 321, in delete_texture
    gl.glDeleteTextures(1, gl.GLuint(texture_id))
  File "d:\2023-2024\RNN\Sokoban\SokobanEnv\lib\site-packages\pyglet\gl\lib.py", line 52, in errcheck
    def errcheck(result, func, arguments):
KeyboardInterrupt: 


FPS: 23