In [1]:
import gym
import gym_sokoban
import time
import datetime as dt
import matplotlib.pyplot as plt
import math
from collections import deque
import random
import numpy as np
import pickle
import sys
import tqdm
import os
from collections import namedtuple,defaultdict,deque 

In [11]:
# Level selection: which chapter/level of the Sokoban environment to load.
chapter = 0
level = 5

# Build the environment, load the selected level and put it in its start state.
env_name = 'Sokoban-v1'
env = gym.make(env_name)
ACTION_LOOKUP = env.unwrapped.get_action_lookup()
env.unwrapped.set_level(chapter, level)
# NOTE(review): only the environment is seeded here; the policy functions in
# this notebook draw from np.random, which is never seeded, so training runs
# are not reproducible as written.
env.seed(0)
env.reset()
print("Created environment: {}".format(env_name))

# Create Results/Chapter [chapter]/Level [level].
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test followed by os.makedirs().
results_dir = os.path.join('Results', 'Chapter ' + str(chapter), 'Level ' + str(level))
os.makedirs(results_dir, exist_ok=True)

Created environment: Sokoban-v1


In [None]:

# Watch an agent that takes randomly sampled actions: 1000 episodes, one
# action about every 0.1 s, with the environment re-rendered continuously in
# between so a frames-per-second figure can be printed.
for _episode in range(1000):
    time.sleep(1)
    env.reset()
    # time.monotonic() is used for every interval below: unlike time.time()
    # it cannot jump when the system clock is adjusted.
    st_time = time.monotonic()   # when the last action was taken
    last_time = st_time          # when the FPS line was last printed
    t = 0                        # actions taken in this episode
    rendered_frame = 0           # frames drawn since the last FPS print
    while True:
        if time.monotonic() - st_time < .1:
            # Not yet time for the next action: keep rendering and report FPS
            # once more than a second has passed since the previous report.
            env.render()
            rendered_frame += 1
            if (time.monotonic() - last_time) > 1:
                print(f'\rFPS: {rendered_frame}',end='')
                rendered_frame = 0
                last_time = time.monotonic()
            continue
        st_time = time.monotonic()
        action = env.action_space.sample()
        # env.step returns (state, reward, done, info); only `done` is needed.
        done = env.step(action)[2]
        t += 1
        # The frame after the step is drawn whether or not the episode ends.
        env.render()
        if done or t > 15000:
            break

In [12]:
# Render the environment once outside any episode loop; the recorded cell
# output below is this call's return value.
env.render()

True

In [13]:
def map(x, in_min, in_max, out_min, out_max):
    """Linearly rescale x from the range [in_min, in_max] to [out_min, out_max].

    x is not clamped: values outside the input range are extrapolated.
    With plain Python numbers, in_min == in_max raises ZeroDivisionError.

    Note: once this cell has run, the name shadows Python's built-in map
    in the notebook namespace.
    """
    offset = x - in_min
    out_span = out_max - out_min
    in_span = in_max - in_min
    return offset * out_span / in_span + out_min

def greedy_policy(V,s,eps_comp=1e-8):
    """Choose an action for state s, exploring less as eps_comp grows.

    V maps a state to an array of per-action values. If s is missing it is
    first inserted as a zero array of length env.action_space.n, so V is
    mutated by this call.

    A random action is taken with probability
    1 - map(eps_comp, 0, 1, 0.1, .6): 0.9 at eps_comp=0, falling linearly to
    0.4 at eps_comp=1. Otherwise one of the actions with the highest value
    in V[s] is returned, ties being broken at random.
    """
    if s not in V:
        V[s] = np.zeros(env.action_space.n)

    explore_prob = 1 - map(eps_comp, 0, 1, 0.1, .6)
    if np.random.random() < explore_prob:
        # Explore: any action, regardless of its current value.
        return np.random.choice(np.arange(env.action_space.n))

    # Exploit: collect every action that attains the best value and pick one.
    action_values = V[s]
    best_value = np.max(action_values)
    best_actions = np.argwhere(action_values == best_value).flatten()
    return np.random.choice(best_actions)

In [19]:
# --- Tabular Q-learning on the selected level --------------------------------
alpha = 0.99           # learning rate
gamma = 0.8            # discount factor
epsilon = 0.8          # NOTE(review): never read; the exploration rate comes from greedy_policy's eps_comp schedule
num_episodes = 10**5
num_timesteps = 50     # maximum number of steps per episode
checkpoint_every = 1000  # episodes between checkpoints

# Output locations. The directory is (re)created here so this cell does not
# depend on another cell having made it.
results_dir = os.path.join('Results', 'Chapter ' + str(chapter), 'Level ' + str(level))
os.makedirs(results_dir, exist_ok=True)
checkpoint_fname = os.path.join(results_dir, 'Q_' + str(num_episodes) + '_episodes_temp.bin')
fname = os.path.join(results_dir, 'Q_' + str(num_episodes) + '_episodes.bin')


def _save_q_table(q_table, path):
    """Pickle q_table to path without ever leaving a half-written file there.

    The data is written to path + '.part' first and then moved over path with
    os.replace, so an interrupt during the dump cannot corrupt an existing file.
    """
    partial_path = path + '.part'
    with open(partial_path, 'wb') as handle:
        pickle.dump(q_table, handle, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(partial_path, path)


Q = {}
pbar = tqdm.tqdm(range(num_episodes))
finished = 0      # episodes that reported done before the last allowed step
max_reward = 0    # largest single reward value returned by env.step so far
try:
    for episode in pbar:
        state = env.reset()
        last_reward = 0
        for step in range(num_timesteps):
            # greedy_policy also inserts `state` into Q when it is new, which
            # is why Q[state] can be indexed directly below.
            action = greedy_policy(Q, state, episode / num_episodes)
            next_state, reward, done, info = env.step(action)
            if next_state not in Q:
                Q[next_state] = np.zeros(env.action_space.n)
            # The learning signal is the change in reward since the previous
            # step, plus the discounted best value of the next state.
            td_target = (reward - last_reward) + gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])
            state = next_state
            last_reward = reward
            if reward > max_reward:
                max_reward = reward

            if done:
                if step < num_timesteps - 1:
                    finished += 1
                break
        pbar.set_description(f'Training Q, finished={finished};Max reward gained={round(max_reward,3)}')
        # Checkpoint after every `checkpoint_every` completed episodes. Counting
        # completed episodes (episode + 1) avoids overwriting an earlier
        # checkpoint with a near-empty table at episode 0.
        if (episode + 1) % checkpoint_every == 0:
            _save_q_table(Q, checkpoint_fname)
except KeyboardInterrupt:
    # Keep whatever has been learned so far, then let the interrupt propagate
    # so the cell still stops as the user asked.
    _save_q_table(Q, checkpoint_fname)
    raise

# Save the final table; reached only when every episode has run.
_save_q_table(Q, fname)
    

  0%|          | 0/20000 [00:00<?, ?it/s]

Training Q, finished=2;Max reward gained=49.039:   2%|▏         | 446/20000 [00:14<10:40, 30.53it/s]


KeyboardInterrupt: 

In [15]:
# Show the size of the table and only its first few rows: Q holds one row per
# visited state, so printing all of it floods the notebook output.
preview_rows = 20
print(f'{len(Q)} states in Q; showing the first {min(preview_rows, len(Q))}')
for row_idx, q_values in enumerate(Q.values()):
    if row_idx >= preview_rows:
        break
    print(q_values)

[4.36707234 4.53044477 4.47776244 4.55202548 4.55253855]
[5.46009041 4.31709512 4.31709137 7.1515481  4.31809973]
[6.82636301 5.41009041 5.41009041 6.47662744 5.41109041]
[5.77636301 5.77636301 7.28420376 4.66009041 5.77736301]
[5.82636301 4.61009041 4.61009041 3.72707232 4.61109041]
[9.1065047  5.82636301 7.23420376 7.23420376 7.23520376]
[ 9.0565047   9.0565047   9.0565047   7.28420376 11.38438087]
[4.66009041 3.67707232 3.67707232 2.98065786 3.67807232]
[3.72707232 2.93065786 2.93065786 2.93065786 2.93165786]
[7.98172609 6.33438087 6.33438087 5.1065047  6.33538087]
[6.38438087 4.08420376 5.0565047  5.0565047  5.0575047 ]
[9.97840762 6.38438087 7.93172609 6.38438087 7.93272609]
[12.47425952  7.98172609  9.92840762  7.98172609  9.92940762]
[4.03420376 4.03420376 5.1065047  3.26636301 4.03520376]
[15.5865744  12.41825952 12.41825952  9.97360762 12.41925952]
[12.46825952  7.98425896  9.92360762  7.97788609  9.92460762]
[9.9815737  7.93425896 9.97360762 6.38640717 7.93525896]
[ 3.2836732

In [None]:
def greedy_policy_pi(V,s):
    """Choose an action for state s with a fixed 0.3 chance of acting randomly.

    V maps a state to an array of per-action values. If s is missing it is
    first inserted as a zero array of length env.action_space.n, so V is
    mutated by this call.

    With probability 0.3 a random action is returned; otherwise one of the
    actions with the highest value in V[s], ties being broken at random.
    """
    if s not in V:
        V[s] = np.zeros(env.action_space.n)

    random_action_prob = .3
    if np.random.random() < random_action_prob:
        # Random move: any action, regardless of its current value.
        return np.random.choice(np.arange(env.action_space.n))

    # Greedy move: collect every action that attains the best value and pick one.
    action_values = V[s]
    best_value = np.max(action_values)
    best_actions = np.argwhere(action_values == best_value).flatten()
    return np.random.choice(best_actions)


In [18]:

# Watch the learned table play: 1000 episodes of at most 51 actions, one
# action about every 0.1 s, with the environment re-rendered continuously in
# between so a frames-per-second figure can be printed.
#
# NOTE(review): the training cell keys Q by the values returned from
# env.reset()/env.step(), while this loop looks Q up by
# env.unwrapped.serialize_state(). Confirm both are the same representation;
# if not, every lookup misses and greedy_policy_pi falls back to zero rows.
# greedy_policy_pi also inserts unseen states into Q, so this cell mutates Q.
for _episode in range(1000):
    time.sleep(.1)
    env.reset()
    state = env.unwrapped.serialize_state()
    # time.monotonic() is used for every interval below: unlike time.time()
    # it cannot jump when the system clock is adjusted.
    st_time = time.monotonic()   # when the last action was taken
    last_time = st_time          # when the FPS line was last printed
    t = 0                        # actions taken in this episode
    rendered_frame = 0           # frames drawn since the last FPS print
    while True:
        if time.monotonic() - st_time < .1:
            # Not yet time for the next action: keep rendering and report FPS
            # once more than a second has passed since the previous report.
            env.render()
            rendered_frame += 1
            if (time.monotonic() - last_time) > 1:
                print(f'\rFPS: {rendered_frame}',end='')
                rendered_frame = 0
                last_time = time.monotonic()
            continue
        st_time = time.monotonic()
        action = greedy_policy_pi(Q,state)
        # env.step returns (state, reward, done, info); only `done` is needed
        # because the lookup key is re-read from serialize_state() below.
        done = env.step(action)[2]
        state = env.unwrapped.serialize_state()
        t += 1
        # The frame after the step is drawn whether or not the episode ends.
        env.render()
        if done or t > 50:
            break

FPS: 43

KeyboardInterrupt: 