In [1]:
# Fetch the rlenv repository into ./rlenv (the clone output names that directory).
# NOTE(review): the later `from gridtsp import GridTSP` cell has a recorded
# ModuleNotFoundError, so cloning alone may not make the module importable --
# presumably rlenv/ must be on sys.path or be the working directory; confirm
# against the repository layout.
!git clone https://github.com/ghonest-jung/rlenv.git

Cloning into 'rlenv'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 16 (delta 3), reused 16 (delta 3), pack-reused 0[K
Unpacking objects: 100% (16/16), done.


In [8]:
import random
import copy
from tqdm import tqdm

from gridtsp import GridTSP

ModuleNotFoundError: ignored

In [9]:
class QFunc:
    """Tabular action-value function keyed by the string form of a state.

    Every registered state maps to a dict ``{action: value}`` that holds one
    entry per action in ``self.actions``, each starting at 0.
    """

    def __init__(self):
        # str(state) -> {action: estimated value}
        self.q_func = dict()
        # Actions that receive a zero-initialised entry when a state is added.
        self.actions = [GridTSP.UP, GridTSP.DOWN, GridTSP.LEFT, GridTSP.RIGHT]

    def set(self, state, action, value):
        """Store ``value`` as the estimate for ``(state, action)``.

        A state that has not been registered yet is registered first, so
        writing to an unseen state no longer raises ``KeyError``.
        """
        self.add(state)
        self.q_func[str(state)][action] = value

    def add(self, state):
        """Register ``state`` with all action values at 0.

        Does nothing when the state is already known, so values that were
        learned earlier are never reset.
        """
        key = str(state)

        if key in self.q_func:
            return

        self.q_func[key] = {a: 0 for a in self.actions}

    def get(self, state, action=None):
        """Return the stored value(s) for ``state``.

        With ``action`` omitted (``None``) the state's whole
        ``{action: value}`` dict is returned (the stored object, not a copy);
        otherwise only the value of that action.

        Raises:
            KeyError: if the state (or the requested action) is not stored.
        """
        action_values = self.q_func[str(state)]

        if action is None:
            return action_values

        return action_values[action]

    def print(self):
        """Print every stored state key with its action values, sorted by key."""
        for key in sorted(self.q_func):
            print(key, self.q_func[key])

In [10]:
def print_board(board):
    """Print the board one row per line, using each row's default repr."""
    for row in board:
        print(row)


def get_avail_action(board, r, c):
    """Return the moves that keep position (r, c) inside ``board``.

    The result is ordered UP, DOWN, LEFT, RIGHT; a move is left out when the
    position already sits on the corresponding edge of the board.
    """
    last_row = len(board) - 1
    last_col = len(board[0]) - 1

    # Conditional expressions keep each GridTSP constant lookup lazy: it is
    # only evaluated when that move is actually legal.
    vertical = ([GridTSP.UP] if r != 0 else []) + ([GridTSP.DOWN] if r != last_row else [])
    horizontal = ([GridTSP.LEFT] if c != 0 else []) + ([GridTSP.RIGHT] if c != last_col else [])

    return vertical + horizontal


def epsilon_greedy_action(epsilon, q_func, state, r, c):
    """Choose a move for the agent at (r, c) with an epsilon-greedy rule.

    With probability ``1 - epsilon`` the legal action with the largest stored
    value is returned (the earliest one in ``get_avail_action`` order wins
    ties); otherwise one of the legal actions is drawn with ``random.choice``.
    """
    state_key = str(state)
    candidates = get_avail_action(state, r, c)

    exploit = random.random() < 1 - epsilon
    if not exploit:
        return random.choice(candidates)

    # max() keeps the first maximal element, matching insertion-order lookup.
    return max(candidates, key=lambda act: q_func.get(state_key, act))

In [None]:
# On-policy (SARSA-style) training of the tabular Q-function on randomly
# generated GridTSP instances.

# Hyperparameters -- same values that were previously written inline.
NUM_EPISODES = 1000
EPSILON = 0.5  # exploration probability handed to epsilon_greedy_action
ALPHA = 0.1    # step size of the incremental value update
GAMMA = 0.9    # discount applied to the next state-action value

q_func = QFunc()

board_size = (3, 3)
max_num_task = 3
# Every (row, col) cell of the board; start and task cells are drawn from it.
coord_all = [(r, c) for r in range(board_size[0]) for c in range(board_size[1])]

for episode in tqdm(range(NUM_EPISODES)):
    num_task = random.randint(1, max_num_task)

    # random.sample draws without replacement, so the start cell and the task
    # cells are all distinct.
    coord_selected = random.sample(coord_all, k=num_task + 1)

    start = coord_selected[0]
    coord_tasks = coord_selected[1:]

    env = GridTSP(board_size, start, coord_tasks, reward_default=-0.01)

    # Snapshot the board before stepping -- presumably env.step mutates
    # env.board in place; confirm against GridTSP.
    state = copy.deepcopy(env.board)
    q_func.add(state)
    action = epsilon_greedy_action(EPSILON, q_func, state, env.r, env.c)

    while True:
        next_state, reward, done, info = env.step(action)
        q_func.add(next_state)

        if done is True:
            # Terminal transition: the final reward is stored directly as the
            # value, and no follow-up action has to be selected.
            q_func.set(state, action, reward)
            break

        next_action = epsilon_greedy_action(EPSILON, q_func, next_state, env.r, env.c)

        value = (1 - ALPHA) * q_func.get(state, action) + ALPHA * (reward + GAMMA * q_func.get(next_state, next_action))
        q_func.set(state, action, value)

        state = copy.deepcopy(next_state)
        action = next_action

q_func.print()

In [13]:
# Greedy rollout (epsilon = 0.0) of the learned Q-function on one fixed
# instance, printing the board plus the chosen action and the state's action
# values after every step.

# Safety cap: with deterministic tie-breaking, a greedy policy over states
# whose values are all equal can bounce between cells and never finish.
MAX_EVAL_STEPS = 100

env = GridTSP(board_size, (2, 2), [(0, 1), (2, 0)], reward_default=0)

state = copy.deepcopy(env.board)
q_func.add(state)
action = epsilon_greedy_action(0.0, q_func, state, env.r, env.c)
print_board(state)
print(action, q_func.get(state))

for _ in range(MAX_EVAL_STEPS):
    next_state, reward, done, info = env.step(action)
    # States never met during training still need a table entry before lookup.
    q_func.add(next_state)
    next_action = epsilon_greedy_action(0.0, q_func, next_state, env.r, env.c)
    action = next_action

    print_board(next_state)
    print(next_action, q_func.get(next_state))

    if done is True:
        break
else:
    # Only reached when the loop ran out of steps without a terminal state.
    print("Stopped after", MAX_EVAL_STEPS, "steps without reaching a terminal state")


[0, 2, 0]
[0, 0, 0]
[2, 0, 1]
3 {1: -0.00109, 2: 0, 3: 0.011880500275421, 4: 0}
[0, 2, 0]
[0, 0, 0]
[2, 1, 0]
3 {1: -0.00199, 2: 0, 3: 0.3791057016754034, 4: -0.00109}
[0, 2, 0]
[0, 0, 0]
[1, 0, 0]
1 {1: 0.3912743265913458, 2: 0, 3: 0, 4: 0.0741353757395386}
[0, 2, 0]
[1, 0, 0]
[0, 0, 0]
1 {1: 0.5896766092531455, 2: 0.024661957058714785, 3: 0, 4: 0.34319569275096157}
[1, 2, 0]
[0, 0, 0]
[0, 0, 0]
4 {1: 0, 2: 0.14519364992594705, 3: 0, 4: 1}
[0, 1, 0]
[0, 0, 0]
[0, 0, 0]
2 {1: 0, 2: 0, 3: 0, 4: 0}
