In [1]:
# Fetch the rlenv package only when it is not already checked out, so that
# re-running this cell (or "Restart & Run All") does not fail with
# "destination path 'rlenv' already exists". Assumes a POSIX shell.
![ -d rlenv ] || git clone https://github.com/ghonest-jung/rlenv.git

fatal: destination path 'rlenv' already exists and is not an empty directory.


In [2]:
import random
import copy
from tqdm import tqdm

from rlenv.gridtsp import GridTSP

In [3]:
class QFunc:
    """Tabular action-value function.

    Values are stored in ``self.q_func`` as ``{str(state): {action: value}}``.
    A state that has not been seen before is registered on first access with
    a value of 0 for every action in ``self.actions``, so ``set`` and ``get``
    never raise ``KeyError`` merely because ``add`` was not called first.
    """

    def __init__(self):
        self.q_func = dict()
        self.actions = [GridTSP.UP, GridTSP.DOWN, GridTSP.LEFT, GridTSP.RIGHT]

    def _entry(self, state):
        """Return the per-action value dict for ``state``, creating it if absent."""
        key = str(state)

        if key not in self.q_func:
            self.q_func[key] = {a: 0 for a in self.actions}

        return self.q_func[key]

    def set(self, state, action, value):
        """Store ``value`` as the estimate for taking ``action`` in ``state``."""
        self._entry(state)[action] = value

    def add(self, state):
        """Register ``state`` with zero-initialised values; no-op if already known."""
        self._entry(state)

    def get(self, state, action=None):
        """Look up stored values for ``state``.

        Returns the value for ``action`` when one is given; otherwise returns
        the whole ``{action: value}`` dict for the state (the stored object
        itself, not a copy).
        """
        entry = self._entry(state)

        if action is None:
            return entry

        return entry[action]

    def print(self):
        """Print every stored state key with its action values, sorted by key."""
        for key in sorted(self.q_func):
            print(key, self.q_func[key])

In [4]:
def print_board(board):
    """Print each row of ``board`` on its own line."""
    for row in board:
        print(row)


def get_avail_action(board, r, c):
    """Return the moves that keep position ``(r, c)`` inside ``board``.

    A move is left out when the position is already on the matching edge:
    UP on the first row, DOWN on the last row, LEFT on the first column and
    RIGHT on the last column. The result is ordered UP, DOWN, LEFT, RIGHT.
    """
    last_row = len(board) - 1
    last_col = len(board[0]) - 1

    # (is the move allowed?, move constant) in the order the caller expects.
    candidates = (
        (r != 0, GridTSP.UP),
        (r != last_row, GridTSP.DOWN),
        (c != 0, GridTSP.LEFT),
        (c != last_col, GridTSP.RIGHT),
    )

    return [move for allowed, move in candidates if allowed]


def epsilon_greedy_action(epsilon, q_func, state, r, c):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Only the actions returned by ``get_avail_action(state, r, c)`` are
    considered. With probability ``1 - epsilon`` the available action with
    the highest stored value is returned (the first such action wins ties);
    otherwise an available action is drawn uniformly at random.
    """
    state_key = str(state)
    candidates = get_avail_action(state, r, c)

    # One draw decides between exploiting and exploring.
    exploit = random.random() < 1 - epsilon
    if not exploit:
        return random.choice(candidates)

    scores = {move: q_func.get(state_key, move) for move in candidates}
    return max(scores, key=scores.get)

In [5]:
# --- Training configuration -------------------------------------------------
SEED = 0             # fixed seed so a fresh "Restart & Run All" rebuilds the same table
NUM_EPISODES = 1000  # number of training episodes
EPSILON = 0.5        # exploration probability of the epsilon-greedy behaviour policy
ALPHA = 0.1          # step size of the value update
GAMMA = 0.9          # discount applied to the next state-action value

random.seed(SEED)

q_func = QFunc()

board_size = (3, 3)
max_num_task = 3

# Every (row, col) cell of the board, in row-major order.
coord_all = [(r, c) for r in range(board_size[0]) for c in range(board_size[1])]

# --- On-policy (SARSA-style) training loop ----------------------------------
for episode in tqdm(range(NUM_EPISODES)):
    # Draw a random layout: one start cell plus 1..max_num_task distinct task cells.
    num_task = random.randint(1, max_num_task)
    coord_selected = random.sample(coord_all, k=num_task + 1)

    start = coord_selected[0]
    coord_tasks = coord_selected[1:]

    env = GridTSP(board_size, start, coord_tasks, reward_default=0)

    # Snapshot the board so the stored state cannot alias env.board.
    state = copy.deepcopy(env.board)
    q_func.add(state)
    # env.r / env.c are presumably the agent's current row/column - confirm in rlenv.
    action = epsilon_greedy_action(EPSILON, q_func, state, env.r, env.c)

    while True:
        next_state, reward, done, info = env.step(action)
        q_func.add(next_state)
        next_action = epsilon_greedy_action(EPSILON, q_func, next_state, env.r, env.c)

        if done is True:
            # Terminal transition: the stored value is overwritten with the reward.
            value = reward
        else:
            # Blend the old estimate with the one-step bootstrapped target.
            value = (1 - ALPHA) * q_func.get(state, action) + ALPHA * (reward + GAMMA * q_func.get(next_state, next_action))

        q_func.set(state, action, value)

        state = copy.deepcopy(next_state)
        action = next_action

        if done is True:
            break

q_func.print()

100%|██████████| 1000/1000 [00:00<00:00, 2985.03it/s]


[[0, 0, 0], [0, 0, 0], [0, 0, 1]] {1: 0, 2: 0, 3: 0, 4: 0}
[[0, 0, 0], [0, 0, 0], [0, 1, 0]] {1: 0, 2: 0, 3: 0, 4: 0}
[[0, 0, 0], [0, 0, 0], [0, 1, 2]] {1: 0.23188953800955286, 2: 0, 3: 0.08004601223486307, 4: 1}
[[0, 0, 0], [0, 0, 0], [0, 2, 1]] {1: 0.06696405093249343, 2: 0, 3: 1, 4: 0}
[[0, 0, 0], [0, 0, 0], [1, 0, 0]] {1: 0, 2: 0, 3: 0, 4: 0}
[[0, 0, 0], [0, 0, 0], [1, 0, 2]] {1: 0.03710299188521255, 2: 0, 3: 0, 4: 0.5125795110000001}
[[0, 0, 0], [0, 0, 0], [1, 2, 0]] {1: 0.08823592925394802, 2: 0, 3: 0, 4: 1}
[[0, 0, 0], [0, 0, 0], [1, 2, 2]] {1: 0.003222180000000001, 2: 0, 3: 0, 4: 0.8617600683175906}
[[0, 0, 0], [0, 0, 0], [2, 0, 1]] {1: 0.165143314646846, 2: 0, 3: 0.7264020441166331, 4: 0}
[[0, 0, 0], [0, 0, 0], [2, 1, 0]] {1: 0.19593510750991952, 2: 0, 3: 1, 4: 0.2451905162882543}
[[0, 0, 0], [0, 0, 0], [2, 1, 2]] {1: 0.0, 2: 0, 3: 0, 4: 0.15311240299515133}
[[0, 0, 0], [0, 0, 0], [2, 2, 1]] {1: 0.029813946647192304, 2: 0, 3: 0.5126280339316509, 4: 0}
[[0, 0, 0], [0, 0, 1], [0

In [6]:
# Upper bound on rollout length. A purely greedy policy over a partially
# trained table can bounce between states forever, so the loop must be capped.
MAX_EVAL_STEPS = 100

# Fixed layout used to inspect the learned greedy policy (epsilon = 0.0).
env = GridTSP(board_size, (2, 2), [(0, 1), (2, 0)], reward_default=0)

state = copy.deepcopy(env.board)
q_func.add(state)
action = epsilon_greedy_action(0.0, q_func, state, env.r, env.c)
print_board(state)
print(action, q_func.get(state))

for _ in range(MAX_EVAL_STEPS):
    next_state, reward, done, info = env.step(action)
    # Register the state first: it may never have been visited during training.
    q_func.add(next_state)
    next_action = epsilon_greedy_action(0.0, q_func, next_state, env.r, env.c)
    action = next_action

    print_board(next_state)
    print(next_action, q_func.get(next_state))

    if done is True:
        break
else:
    # Only reached when the step budget ran out before the episode finished.
    print(f"Stopped after {MAX_EVAL_STEPS} steps without finishing the episode.")


[0, 2, 0]
[0, 0, 0]
[2, 0, 1]
1 {1: 0.0, 2: 0, 3: 0, 4: 0}
[0, 2, 0]
[0, 0, 1]
[2, 0, 0]
3 {1: 0.0, 2: 0.0, 3: 0.033153512598544314, 4: 0}
[0, 2, 0]
[0, 1, 0]
[2, 0, 0]
1 {1: 0.7329403545747718, 2: 0.0, 3: 0, 4: 0.0}
[0, 1, 0]
[0, 0, 0]
[2, 0, 0]
2 {1: 0, 2: 0.495039392131768, 3: 0.3102271016133657, 4: 0.18199719502515016}
[0, 0, 0]
[0, 1, 0]
[2, 0, 0]
3 {1: 0.25079994935245864, 2: 0.599892913740696, 3: 0.7284796956907637, 4: 0.24429078578511168}
[0, 0, 0]
[1, 0, 0]
[2, 0, 0]
2 {1: 0.42989589778873544, 2: 1, 3: 0, 4: 0.4525560970043889}
[0, 0, 0]
[0, 0, 0]
[1, 0, 0]
1 {1: 0, 2: 0, 3: 0, 4: 0}
