In [1]:
!git clone https://github.com/ghonest-jung/rlenv.git

Cloning into 'rlenv'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 43 (delta 13), reused 36 (delta 10), pack-reused 0[K
Unpacking objects: 100% (43/43), 8.54 KiB | 795.00 KiB/s, done.


In [2]:
import random
import copy
from tqdm import tqdm

from rlenv.gridtsp import GridTSP

In [3]:
class QFunc:
    """Tabular action-value (Q) store keyed by the string form of a state.

    Every known state maps to a ``{action: value}`` dict. A state that is
    registered for the first time starts with all actions valued at 0.
    """

    def __init__(self, actions=None):
        """Create an empty table.

        Args:
            actions: Optional iterable of action identifiers to track for
                every state. When omitted, the four GridTSP moves
                (UP, DOWN, LEFT, RIGHT) are used.
        """
        if actions is None:
            actions = [GridTSP.UP, GridTSP.DOWN, GridTSP.LEFT, GridTSP.RIGHT]

        self.q_func = dict()
        self.actions = list(actions)

    def _ensure(self, key):
        """Return the value row stored under ``key``, creating a zeroed row if absent."""
        row = self.q_func.get(key)
        if row is None:
            row = {a: 0 for a in self.actions}
            self.q_func[key] = row
        return row

    def set(self, state, action, value):
        """Store ``value`` for the ``(state, action)`` pair.

        A state that has not been registered yet is registered first, so this
        call does not fail with ``KeyError`` when ``add`` was skipped.
        """
        self._ensure(str(state))[action] = value

    def add(self, state):
        """Register ``state`` with zeroed action values; no-op if already known."""
        self._ensure(str(state))

    def get(self, state, action=None):
        """Look up stored values for ``state``.

        Args:
            state: State whose ``str()`` form is the table key.
            action: When given, return only that action's value.

        Returns:
            The live ``{action: value}`` dict of the state (not a copy) when
            ``action`` is None, otherwise the single value for ``action``.

        Raises:
            KeyError: If the state (or the action) is not in the table.
        """
        key = str(state)

        if action is None:
            return self.q_func[key]

        return self.q_func[key][action]

    def print(self):
        """Print every stored state key with its action values, sorted by key."""
        for key in sorted(self.q_func.keys()):
            print(key, (self.q_func[key]))

In [4]:
def print_board(board):
    """Print each row of ``board`` on its own line, in order."""
    rendered_rows = [str(row) for row in board]
    # Skip the print entirely for an empty board so no blank line is emitted.
    if rendered_rows:
        print("\n".join(rendered_rows))


def epsilon_greedy_action(epsilon, q_func, state, env):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        epsilon: Probability of picking uniformly at random instead of greedily.
        q_func: Object whose ``get(state_key, action)`` returns the stored value.
        state: Current state; its ``str()`` form is used as the lookup key.
        env: Object whose ``sample_action()`` result is used here as the
            collection of candidate actions.

    Returns:
        One element of the candidates returned by ``env.sample_action()``.
    """
    state_key = str(state)
    candidates = env.sample_action()

    # Draw once; explore whenever the draw is not below (1 - epsilon).
    explore = not (random.random() < 1 - epsilon)
    if explore:
        return random.choice(candidates)

    # Greedy branch: highest stored value wins, first candidate wins ties.
    return max(candidates, key=lambda act: q_func.get(state_key, act))

In [5]:
# GridTSP environment with size (4, 4), at most 5 tasks, and a default
# reward of -0.01, plus the empty Q-table that later cells fill and read.
env = GridTSP(
    size=(4, 4),
    max_num_tasks=5,
    reward_default=-0.01,
)
q_func = QFunc()

In [6]:
# Training hyperparameters, named so the loop below carries no magic numbers.
NUM_EPISODES = 200000
EPSILON_START = 0.5   # exploration probability at the first episode
EPSILON_DECAY = 0.0   # total linear decrease over training; 0.0 keeps it constant
ALPHA = 0.1           # learning rate of the tabular update
GAMMA = 0.9           # discount applied to the next state-action value

# On-policy (SARSA-style) training: the action used in the update target is
# the same epsilon-greedy action that is executed on the following step.
for episode in tqdm(range(NUM_EPISODES)):
    env.reset()
    # Linear schedule; with EPSILON_DECAY == 0.0 this is always EPSILON_START.
    epsilon = EPSILON_START - episode * EPSILON_DECAY / NUM_EPISODES

    # Snapshot the board so the stored state is independent of the env's own
    # object (presumably the env mutates its board in place; confirm in GridTSP).
    state = copy.deepcopy(env.board)
    q_func.add(state)
    action = epsilon_greedy_action(epsilon, q_func, state, env)

    while True:
        next_state, reward, done, info = env.step(action)
        q_func.add(next_state)
        next_action = epsilon_greedy_action(epsilon, q_func, next_state, env)

        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * Q(s', a'))
        value = (1 - ALPHA) * q_func.get(state, action) + ALPHA * (
            reward + GAMMA * q_func.get(next_state, next_action)
        )
        q_func.set(state, action, value)

        state = copy.deepcopy(next_state)
        action = next_action

        if done:
            break

# q_func.print()

100%|██████████| 200000/200000 [01:47<00:00, 1852.79it/s]


In [7]:
# Greedy rollout (epsilon = 0.0) of the learned Q-table on one fixed layout.
# After each move the board is printed, followed by the chosen action and the
# stored action values of that board.

# Upper bound on rollout length. The policy is fully greedy and the table is
# not updated here, so a policy that cycles between cells would otherwise
# never reach `done` and the cell would run forever.
MAX_EVAL_STEPS = 100

env.reset(start=(3, 1), tasks=[(2, 0), (2, 2), (0, 0), (0, 3)])

state = copy.deepcopy(env.board)
q_func.add(state)  # make sure the lookup below succeeds for an unseen layout
action = epsilon_greedy_action(0.0, q_func, state, env)
print_board(state)
print(action, q_func.get(state))

for _ in range(MAX_EVAL_STEPS):
    next_state, reward, done, info = env.step(action)
    q_func.add(next_state)
    action = next_action = epsilon_greedy_action(0.0, q_func, next_state, env)

    print_board(next_state)
    print(next_action, q_func.get(next_state))

    if done:
        break
else:
    # Reached only when the loop ran out of steps without hitting `done`.
    print("Stopped after", MAX_EVAL_STEPS, "steps without reaching a terminal state.")


[2, 0, 0, 2]
[0, 0, 0, 0]
[2, 0, 2, 0]
[0, 1, 0, 0]
0 {0: 0.008777314576344708, 1: 0, 2: -0.001, 3: -0.001179829}
[2, 0, 0, 2]
[0, 0, 0, 0]
[2, 1, 2, 0]
[0, 0, 0, 0]
3 {0: 0.0011127096909894741, 1: 0, 2: 0, 3: 0.6381625861920439}
[2, 0, 0, 2]
[0, 0, 0, 0]
[2, 0, 1, 0]
[0, 0, 0, 0]
2 {0: 0.42708459830580187, 1: 0.17841867921367865, 2: 1.2097170059272737, 3: 0.15189941936243806}
[2, 0, 0, 2]
[0, 0, 0, 0]
[2, 1, 0, 0]
[0, 0, 0, 0]
2 {0: 0.39773251601393117, 1: 0.6000097754418099, 2: 1.9689702727970197, 3: 0.46033845204659096}
[2, 0, 0, 2]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
0 {0: 1.2135401185400732, 1: 0.7371357353209711, 2: 0, 3: 0.837677815850703}
[2, 0, 0, 2]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
0 {0: 1.4818338832994427, 1: 0.9463488230759436, 2: 0, 3: 0.9968248652868408}
[1, 0, 0, 2]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
3 {0: 0, 1: 0.39476147177584, 2: 0, 3: 0.6104583299856454}
[0, 1, 0, 2]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
3 {0: 0, 1: 0.484429301354492, 2: 0.4637319350540