In [1]:
def code_to_move(code):
    """Translate an action code into a human-readable direction name.

    Args:
        code: A numeric action code (0, 1, 2 or 3), or an ``enum.Enum``
            member whose ``value`` is such a code.

    Returns:
        ``"left"``, ``"up"``, ``"right"`` or ``"down"`` for codes 0, 1, 2
        and 3 respectively. Any other input is returned unchanged.
    """
    from enum import Enum

    # Members of a plain Enum never compare equal to ints, so a bare
    # ``code == 0`` check silently falls through for them; compare on the
    # underlying value instead.
    key = code.value if isinstance(code, Enum) else code

    for value, name in enumerate(("left", "up", "right", "down")):
        if key == value:
            return name
    return code

In [2]:
import os
from numpy import genfromtxt

def read_mazes(directory='./mazes/', num_mazes=10):
    """Load mazes from the ``.csv`` files found in a directory.

    Directory entries are visited in sorted filename order so that repeated
    runs load the same mazes. Only ``.csv`` files count toward ``num_mazes``;
    other entries are skipped.

    Args:
        directory: Path of the directory to scan.
        num_mazes: Stop once this many mazes have been loaded. If the
            directory holds fewer ``.csv`` files, all of them are loaded.

    Returns:
        A list of ``MazeMap`` objects, one per loaded file, in sorted
        filename order.
    """
    mazes = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        print(filepath)

        # Read csv into np array, then convert to MazeMap
        cur_maze = genfromtxt(filepath, delimiter=',')
        mazes.append(MazeMap(cur_maze))

        # Count loaded mazes, not directory entries.
        if len(mazes) == num_mazes:
            break

    return mazes

In [3]:
import copy
class MazeSet:
    """Circular collection of training mazes with a per-maze history log.

    One maze is "current" at any time. ``next`` advances to the following
    maze and wraps around after the last one. Mazes are always handed out as
    deep copies, so callers can mutate them without touching the stored
    originals.
    """

    def __init__(self, maze_list):
        """Create a set positioned on the first maze.

        Args:
            maze_list: Non-empty list of mazes. The list object itself is
                stored (not copied), so ``add`` appends to the caller's list.

        Raises:
            IndexError: If ``maze_list`` is empty.
        """
        self.maze_list = maze_list
        # Number of mazes; kept in step with maze_list by add().
        self.len = len(maze_list)
        # Position of the current maze within maze_list.
        self.index = 0
        self.current = maze_list[0]
        # map_history[i] collects the records logged while maze i was current.
        self.map_history = [[] for _ in range(self.len)]

    # Cycle to next maze in the training set
    def next(self):
        """Advance to the next maze (wrapping around) and return a deep copy of it."""
        # Circular array
        self.index = (self.index + 1) % self.len

        # Change current maze. Read from the instance's own list rather than
        # a module-level ``maze_list`` that may not exist or may differ.
        self.current = self.maze_list[self.index]

        # Return new maze
        return copy.deepcopy(self.current)

    # Return current maze
    def get_maze(self):
        """Return a deep copy of the current maze."""
        return copy.deepcopy(self.current)

    # Add another maze to the training set
    def add(self, new_maze):
        """Append ``new_maze`` to the set and give it an empty history."""
        self.maze_list.append(new_maze)
        self.len += 1
        self.map_history.append([])

    def record_history(self, num_episodes):
        """Append ``num_episodes`` to the history of the current maze."""
        self.map_history[self.index].append(num_episodes)

    def get_map_hist(self):
        """Return the history list of the current maze (the list itself, not a copy)."""
        return self.map_history[self.index]

In [None]:
from utils import build_model
from replay import Episode, ReplyBuffer
import numpy as np
from mazemap import Action, MazeMap, Mode
import tensorflowjs as tfjs

# Hand-written 8x8 grid of 0/1 cells, stored as floats.
# NOTE(review): which value marks a wall is decided by MazeMap - confirm there.
maze_test2 = np.array(
    [
        [0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 0, 1, 0],
        [1, 1, 1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1, 1, 0, 0],
        [0, 1, 1, 1, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 0],
    ],
    dtype=float,
)

# All-zero 8x8 float grid.
maze_test = np.zeros(shape=(8, 8))


def _save_model(model, save_path):
    """Save the model weights to ``<save_path>.h5`` and run the tfjs Keras export.

    The tfjs export always targets ``'./'``, independent of ``save_path``.
    """
    h5file = save_path + ".h5"
    model.save_weights(h5file, overwrite=True)
    tfjs.converters.save_keras_model(model, './')
    print(f'Saved model in {save_path}')


def start_train(model,
                maze_set: MazeSet, 
                num_epoch = 15000, 
                max_buffer = 1000, 
                sample_size = 50,
                gamma = 0.9,
                history_size = None,
                print_steps = False,
                load_path = None,
                save_path = None):
    """Train ``model`` on the mazes of ``maze_set`` using a replay buffer.

    Each epoch plays one run of the current maze until it reports
    ``Mode.END`` or ``Mode.TERMINATED`` (or no valid action remains). Every
    step is logged to the replay buffer and followed by a short ``model.fit``
    on a sample drawn from that buffer.

    Step selection: with probability about ``1 - epsilon`` the action with
    the highest predicted value is taken. Otherwise, with probability
    ``AT_rate``, the step towards the first cell of ``maze.path_to_end()`` is
    taken ("accelerated training"); failing that, a random valid action.

    Side effects: reads and rewrites the module-level globals ``epsilon`` and
    ``AT_rate`` after every epoch, prints progress, and saves the model every
    15 epochs and once more at the end.

    Args:
        model: Model to train; must provide ``fit``, ``load_weights`` and
            ``save_weights``.
        maze_set: Source of training mazes. Training moves to the next maze
            once the win rate on the current one reaches 1.0.
        num_epoch: Number of epochs (maze runs) to play.
        max_buffer: Capacity handed to the replay buffer.
        sample_size: Sample size requested from the replay buffer per step.
        gamma: Discount factor handed to the replay buffer.
        history_size: Number of most recent finished runs used for the win
            rate. Defaults to half of the maze's state size.
        print_steps: When true, print every step and the maze after it.
        load_path: Optional weights file to load before training.
        save_path: Base path for saved weights (``".h5"`` is appended).
            Defaults to ``'maze_model'``.
    """
    global epsilon
    global AT_rate

    if save_path is None:
        save_path = 'maze_model'

    if load_path is not None:
        print(f'Load weight from {load_path}')
        model.load_weights(load_path)

    maze = maze_set.get_maze()

    replay_buf: ReplyBuffer = ReplyBuffer(model, maze.get_state_size(), max_buffer, gamma)

    # 1 for every run that reached the end, 0 for every terminated run.
    history = []
    loss = 0.0
    if history_size:
        hsize = history_size
    else:
        hsize = maze.get_state_size() // 2
    
    print("Initialization complete, begin training")
    # Run training epoch
    for epoch in range(num_epoch):
        loss = 0.
        is_over = False

        curr_state = maze.observe()
        print(curr_state.shape)
        # Counts the steps taken in this epoch.
        num_episode = 0
        
        while not is_over:
            valid_actions = maze.get_valid_actions()
            if len(valid_actions) == 0:
                break

            # Explore: start from a random valid action, then possibly override it
            explore = True
            accelerated_training = False
            action = np.random.choice(valid_actions)
            if np.random.rand() > epsilon:
                # Exploit: take the action the model currently rates highest
                explore = False
                action = np.argmax(replay_buf.predict(curr_state))
            elif np.random.rand() < AT_rate:
                # Accelerated training: step towards the first cell of the
                # path reported by the maze
                explore = False
                accelerated_training = True
                path, _distance = maze.path_to_end()
                opt_row, opt_col = path[0]
                cur_row, cur_col = maze.curr_loc
                if opt_row > cur_row:
                    action = Action.DOWN
                elif opt_row < cur_row:
                    action = Action.UP
                elif opt_col > cur_col:
                    action = Action.RIGHT
                elif opt_col < cur_col:
                    action = Action.LEFT
                else:
                    # Path start equals the current cell; the random action
                    # chosen above is kept.
                    print("AT ERROR")
            action = Action(action)
            if print_steps:
                print("Episode num:",num_episode)
                print("Old loc:",maze.curr_loc)
            prev_state = curr_state
            curr_state, reward, mode = maze.act(action)
            mode = Mode(mode)
            if print_steps:
                print(mode)
                print("New loc:",maze.curr_loc)
                print("Chosen action:",code_to_move(action),"\treward:",reward)
                if explore:
                    print("Randomly picked with explore")
                if accelerated_training:
                    print("Optimal pick with accelerated training")
                maze.print_maze(mouse_char=':>')
                print()
            if mode == Mode.END:
                history.append(1)
                is_over = True
            elif mode == Mode.TERMINATED:
                history.append(0)
                is_over = True
            else:
                is_over = False

            episode = Episode(prev_state, curr_state, action, reward, mode)
            replay_buf.log(episode)
            num_episode += 1

            inputs, outputs = replay_buf.sampling(sample_size)
            train_history = model.fit(inputs, outputs, epochs=8, batch_size=16, verbose=0)
            loss = train_history.history['loss'][-1]
        
        # The win rate stays 0.0 until at least hsize runs have finished.
        win_rate = 0.0 if len(history) < hsize else np.sum(np.array(history[-hsize:])) / hsize

        print(f'Epoch {epoch}/{num_epoch} | Loss: {loss:.2f} | Episodes: {num_episode} | Win Count: {np.sum(np.array(history))} | Win Rate: {win_rate}')
    
        # Record number of episodes for the model for this epoch on the current map
        maze_set.record_history((num_episode, int(is_over)))

        # Reset maze after epoch ends
        maze = maze_set.get_maze()

        # Adapt exploration to the last loss: the higher the loss, the more
        # exploration and the more of it is guided by the known path.
        # A loss of exactly 1.5 leaves both rates unchanged.
        if loss > 5:
            epsilon = .5
            AT_rate = .95
        elif loss > 1.5:
            epsilon = .25
            AT_rate = .8
        elif loss < .5:
            epsilon = .1
            AT_rate = .1
        elif loss < 1:
            epsilon = .15
            AT_rate = .25
        elif loss < 1.5:
            epsilon = .2
            AT_rate = .5
            
        # Prevent the model getting stuck in a local minimum where it loops
        if num_episode > 60:
            epsilon = .2
            AT_rate = .3
            
        if win_rate > 0.9:
            epsilon = 0.05
            AT_rate = 0
        
        if win_rate == 1.0:
            print('Reach 100% win rate')
            # Print the model's history on the map
            print(maze_set.get_map_hist())
            # Change to next maze in training set
            maze = maze_set.next()
            history = []
            
        if epoch % 15 == 0:
            _save_model(model, save_path)
            
        if print_steps:
            maze.print_maze(mouse_char=':>')

    _save_model(model, save_path)




# Exploration rate: start_train takes the model's best-rated action only when
# a uniform random draw exceeds this value.
epsilon = 0.1
# Accelerated-training rate: among the remaining (exploratory) steps, the
# chance of following the maze's own path_to_end() instead of a random valid action.
AT_rate = 0.5

# Blank reference maze; used below to size the replay buffer.
maze_map = MazeMap(maze_test)
# Alternative hand-built training set:
#maze_list = [maze_map, MazeMap(maze_test2)]
maze_list = read_mazes(num_mazes=5)

training_mazes = MazeSet(maze_list)
model = build_model(maze_test)
start_train(
    model,
    training_mazes,
    num_epoch=300,
    max_buffer=8 * maze_map.get_state_size(),
    history_size=5,
    print_steps=True,
)


./mazes/m1.csv
./mazes/m10.csv
./mazes/m2.csv
./mazes/m3.csv
./mazes/m4.csv
Initialization complete, begin training
(1, 64)
Episode num: 0
Old loc: (0, 0)
Mode.INVALID
New loc: (0, 0)
Chosen action: Action.LEFT 	reward: -10
░░░░░░░░░░░░░░░░░░
░:>██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 1
Old loc: (0, 0)
Mode.INVALID
New loc: (0, 0)
Chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░:>██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (0, 0)
Mode.VALID
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░:>      ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episod

  return h5py.File(h5file)


Episode num: 1
Old loc: (1, 0)
Mode.INVALID
New loc: (1, 0)
Chosen action: Action.LEFT 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░:>      ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (1, 0)
Mode.PREVIOUS
New loc: (0, 0)
Chosen action: Action.UP 	reward: -5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░:>██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (0, 0)
Mode.PREVIOUS
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░:>      ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 0)
Mode.INVALID
New loc: (1, 0)
C

Epoch 1/300 | Loss: 2.49 | Episodes: 28 | Win Count: 0 | Win Rate: 0.0
░░░░░░░░░░░░░░░░░░
░:>██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░
(1, 64)
Episode num: 0
Old loc: (0, 0)
Mode.VALID
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -0.5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░:>      ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 1
Old loc: (1, 0)
Mode.VALID
New loc: (1, 1)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░  :>    ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░    :>  ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░      :>██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██:>██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (2, 3)
Mode.VALID
New loc: (3, 3)
Chosen action: Action.DOWN 	reward: -0.5
Optimal pick with accelerated

Episode num: 1
Old loc: (1, 0)
Mode.VALID
New loc: (1, 1)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░  :>    ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░    :>  ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░      :>██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██    

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░    :>  ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░      :>██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██:>██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (2, 3)
Mode.PREVIOUS
New loc: (1, 3)
Chosen action: Action.UP 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██      

Episode num: 6
Old loc: (3, 3)
Mode.PREVIOUS
New loc: (2, 3)
Chosen action: Action.UP 	reward: -5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██:>██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 7
Old loc: (2, 3)
Mode.PREVIOUS
New loc: (3, 3)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██:>  ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 8
Old loc: (3, 3)
Mode.VALID
New loc: (3, 4)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██  :>██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 9
Old loc: (3, 4)
Mode.VALID
New loc: (4, 4)
Chosen action: Action.DOWN 	reward: -0.5
░

Episode num: 17
Old loc: (5, 6)
Mode.VALID
New loc: (6, 6)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████    :>██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 18
Old loc: (6, 6)
Mode.VALID
New loc: (7, 6)
Chosen action: Action.DOWN 	reward: -0.5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██:>EE░
░░░░░░░░░░░░░░░░░░

Episode num: 19
Old loc: (7, 6)
Mode.END
New loc: (7, 7)
Chosen action: Action.RIGHT 	reward: 10
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Epoch 7/300 | Loss: 3.13 | Episodes: 20 | Win Count:

Episode num: 24
Old loc: (7, 2)
Mode.INVALID
New loc: (7, 2)
Chosen action: Action.RIGHT 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░    :>██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 25
Old loc: (7, 2)
Mode.INVALID
New loc: (7, 2)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░    :>██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 26
Old loc: (7, 2)
Mode.INVALID
New loc: (7, 2)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░    :>██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 27
Old loc: (7, 2)
Mode.TERMINATED
New loc: (7, 1)
Chosen action: Action.LEFT 	reward: -5
Optimal pick with ac

Episode num: 6
Old loc: (3, 3)
Mode.VALID
New loc: (3, 4)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██  :>██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 7
Old loc: (3, 4)
Mode.VALID
New loc: (4, 4)
Chosen action: Action.DOWN 	reward: -0.5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██:>    ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 8
Old loc: (4, 4)
Mode.VALID
New loc: (4, 5)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██  :>  ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 9
Old loc: (4, 5)
Mode.VALID
New loc: (4, 6)
Chosen action: Action.RIGHT 	re

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░    :>  ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░      :>██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██:>██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (2, 3)
Mode.VALID
New loc: (3, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██     

Episode num: 1
Old loc: (1, 0)
Mode.VALID
New loc: (2, 0)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░:>  ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (2, 0)
Mode.INVALID
New loc: (2, 0)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░:>  ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (2, 0)
Mode.INVALID
New loc: (2, 0)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░:>  ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (2, 0)
Mode.INVALID
New loc: (2, 0)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░        

Episode num: 10
Old loc: (2, 0)
Mode.VALID
New loc: (2, 1)
Chosen action: Action.RIGHT 	reward: -0.5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░  :>██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 11
Old loc: (2, 1)
Mode.PREVIOUS
New loc: (2, 0)
Chosen action: Action.LEFT 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░:>  ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 12
Old loc: (2, 0)
Mode.INVALID
New loc: (2, 0)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░:>  ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 13
Old loc: (2, 0)
Mode.INVALID
New loc: (2, 0)
Chosen action: Action.DO

Epoch 16/300 | Loss: 1.56 | Episodes: 18 | Win Count: 0 | Win Rate: 0.0
░░░░░░░░░░░░░░░░░░
░:>      ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░
(1, 64)
Episode num: 0
Old loc: (0, 0)
Mode.VALID
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░:>  ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 1
Old loc: (1, 0)
Mode.PREVIOUS
New loc: (0, 0)
Chosen action: Action.UP 	reward: -5
░░░░░░░░░░░░░░░░░░
░:>      ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (0, 0)
Mode.PREVIOUS
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░:>  ██████  

Episode num: 4
Old loc: (1, 1)
Mode.INVALID
New loc: (1, 1)
Chosen action: Action.RIGHT 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░  :>██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (1, 1)
Mode.INVALID
New loc: (1, 1)
Chosen action: Action.RIGHT 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░  :>██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 6
Old loc: (1, 1)
Mode.VALID
New loc: (0, 1)
Chosen action: Action.UP 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  :>    ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 7
Old loc: (0, 1)
Mode.INVALID
New loc: (0, 1)
Chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░  :>    ██

Episode num: 32
Old loc: (3, 3)
Mode.INVALID
New loc: (3, 3)
Chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██    :>██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 33
Old loc: (3, 3)
Mode.INVALID
New loc: (3, 3)
Chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██    :>██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 34
Old loc: (3, 3)
Mode.INVALID
New loc: (3, 3)
Chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██    :>██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 35
Old loc: (3, 3)
Mode.TERMINATED
New loc: (4, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░     

Episode num: 1
Old loc: (1, 0)
Mode.VALID
New loc: (1, 1)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░  :>██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (2, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░  :>██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (2, 1)
Mode.VALID
New loc: (3, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██:>    ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (3, 1)
Mode.PREVIOUS
New loc: (2, 1)
Chosen action: Action.UP 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██ 

Episode num: 3
Old loc: (2, 1)
Mode.VALID
New loc: (3, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██:>    ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (3, 1)
Mode.PREVIOUS
New loc: (2, 1)
Chosen action: Action.UP 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░  :>██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (2, 1)
Mode.PREVIOUS
New loc: (3, 1)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██:>    ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 6
Old loc: (3, 1)
Mode.PREVIOUS
New loc: (2, 1)
Chosen action: Action.UP 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  

Episode num: 12
Old loc: (5, 1)
Mode.PREVIOUS
New loc: (4, 1)
Chosen action: Action.UP 	reward: -5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░  :>██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 13
Old loc: (4, 1)
Mode.PREVIOUS
New loc: (5, 1)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░  :>████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 14
Old loc: (5, 1)
Mode.VALID
New loc: (6, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██:>████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 15
Old loc: (6, 1)
Mode.PREVIOUS
New loc: (5, 1)
Chosen action: Action.UP 	

Episode num: 12
Old loc: (4, 4)
Mode.VALID
New loc: (4, 5)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██    :>██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 13
Old loc: (4, 5)
Mode.PREVIOUS
New loc: (4, 4)
Chosen action: Action.LEFT 	reward: -5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██  :>  ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 14
Old loc: (4, 4)
Mode.PREVIOUS
New loc: (4, 5)
Chosen action: Action.RIGHT 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██    :>██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 15
Old loc: (4, 5)
Mode.PREVIOUS
New loc: (4, 4)
Chosen action: Action.

Episode num: 6
Old loc: (5, 1)
Mode.PREVIOUS
New loc: (4, 1)
Chosen action: Action.UP 	reward: -5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░  :>██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 7
Old loc: (4, 1)
Mode.PREVIOUS
New loc: (5, 1)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░  :>████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 8
Old loc: (5, 1)
Mode.VALID
New loc: (6, 1)
Chosen action: Action.DOWN 	reward: -0.5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██:>████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 9
Old loc: (6, 1)
Mode.PREVIOUS
New loc: (5, 1)
Chosen acti

Episode num: 3
Old loc: (2, 1)
Mode.VALID
New loc: (3, 1)
Chosen action: Action.DOWN 	reward: -0.5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██:>    ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (3, 1)
Mode.VALID
New loc: (3, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██  :>  ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (3, 2)
Mode.VALID
New loc: (3, 3)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██    :>██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 6
Old loc: (3, 3)
Mode.VALID
New loc: (4, 3)
Chosen action: Action.DOWN 	rew

Episode num: 3
Old loc: (2, 1)
Mode.VALID
New loc: (3, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██:>    ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (3, 1)
Mode.VALID
New loc: (3, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██  :>  ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (3, 2)
Mode.VALID
New loc: (3, 3)
Chosen action: Action.RIGHT 	reward: -0.5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██    :>██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 6
Old loc: (3, 3)
Mode.VALID
New loc: (4, 3)
Chosen action: Action.DOWN 	reward: -0.5


Episode num: 14
Old loc: (4, 4)
Mode.PREVIOUS
New loc: (4, 5)
Chosen action: Action.RIGHT 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██    :>██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 15
Old loc: (4, 5)
Mode.PREVIOUS
New loc: (4, 4)
Chosen action: Action.LEFT 	reward: -5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██  :>  ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 16
Old loc: (4, 4)
Mode.VALID
New loc: (5, 4)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████:>██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 17
Old loc: (5, 4)
Mode.VALID
New loc: (6, 4)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░    

Episode num: 1
Old loc: (1, 0)
Mode.VALID
New loc: (1, 1)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░  :>██████      ░
░    ██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (2, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░  :>██████  ████░
░██      ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (2, 1)
Mode.VALID
New loc: (3, 1)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        ██  ██  ░
░    ██████      ░
░    ██████  ████░
░██:>    ██  ██  ░
░    ██      ██  ░
░    ████  ██    ░
░██  ████  ██  ██░
░    ██        EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (3, 1)
Mode.VALID
New loc: (3, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░        █

Episode num: 1
Old loc: (1, 0)
Mode.INVALID
New loc: (1, 0)
Chosen action: Action.LEFT 	reward: -10
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░:>      ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 2
Old loc: (1, 0)
Mode.PREVIOUS
New loc: (0, 0)
Chosen action: Action.UP 	reward: -5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░:>  ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (0, 0)
Mode.PREVIOUS
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░:>      ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 0)
Mode.INVALID
New loc: (1, 0)
Chosen action: Action.LEFT 	reward: -10


Episode num: 28
Old loc: (5, 3)
Mode.PREVIOUS
New loc: (6, 3)
Chosen action: Action.DOWN 	reward: -5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████:>████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 29
Old loc: (6, 3)
Mode.PREVIOUS
New loc: (5, 3)
Chosen action: Action.UP 	reward: -5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████:>██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 30
Old loc: (5, 3)
Mode.TERMINATED
New loc: (6, 3)
Chosen action: Action.DOWN 	reward: -5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████:>████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Epoch 31/300 | Loss: 6.53 | Episodes: 31 | Win Count: 0 

Episode num: 2
Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░    :>  ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 3
Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
Chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░      :>██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 4
Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
Chosen action: Action.DOWN 	reward: -0.5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██:>██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 5
Old loc: (2, 3)
Mode.VALID
New loc: (3, 3)
Chosen action: Action.DOWN 	reward: -0.5


Episode num: 7
Old loc: (4, 3)
Mode.PREVIOUS
New loc: (3, 3)
Chosen action: Action.UP 	reward: -5
Optimal pick with accelerated training
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██:>    ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 8
Old loc: (3, 3)
Mode.VALID
New loc: (3, 4)
Chosen action: Action.RIGHT 	reward: -0.5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██  :>  ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 9
Old loc: (3, 4)
Mode.INVALID
New loc: (3, 4)
Chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██  :>  ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 10
Old loc: (3, 4)
Mode.INVALID
New loc: (3, 4)


Episode num: 12
Old loc: (3, 3)
Mode.VALID
New loc: (4, 3)
Chosen action: Action.DOWN 	reward: -0.5
Randomly picked with explore
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████:>██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 13
Old loc: (4, 3)
Mode.INVALID
New loc: (4, 3)
Chosen action: Action.RIGHT 	reward: -10
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████:>██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 14
Old loc: (4, 3)
Mode.VALID
New loc: (5, 3)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████:>██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Episode num: 15
Old loc: (5, 3)
Mode.PREVIOUS
New loc: (4, 3)
Chosen action: Action.UP 	reward: 

Episode num: 16
Old loc: (6, 6)
Mode.VALID
New loc: (7, 6)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██:>EE░
░░░░░░░░░░░░░░░░░░

Episode num: 17
Old loc: (7, 6)
Mode.END
New loc: (7, 7)
Chosen action: Action.RIGHT 	reward: 10
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░

Epoch 36/300 | Loss: 0.70 | Episodes: 18 | Win Count: 3 | Win Rate: 0.6
░░░░░░░░░░░░░░░░░░
░:>  ██    ██  ██░
░        ██      ░
░██  ██  ██  ██  ░
░    ██      ████░
░██████  ██  ██  ░
░  ████  ██      ░
░  ████  ████  ██░
░          ██  EE░
░░░░░░░░░░░░░░░░░░
(1, 64)
Episode num: 0
Old loc: (0, 0)
Mode.VALID
New loc: (1, 0)
Chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░    ██    ██  ██░
░:>      ██   

Implement accelerated training:
some percentage of the time, the explore step picks the optimal action instead of a random one.
  
At higher loss values this happens more often; at lower loss values it does not happen at all.

In [None]:
from utils import build_model
from replay import Episode, ReplyBuffer
import numpy as np
from mazemap import Action, MazeMap, Mode
import tensorflowjs as tfjs

# Hand-written 8x8 grid used as the second test maze.
# NOTE(review): presumably 1 marks a wall and 0 an open cell — confirm against MazeMap.
maze_test2 = np.array(
    [
        [0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 0, 1, 0],
        [1, 1, 1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1, 1, 0, 0],
        [0, 1, 1, 1, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 0],
    ],
    dtype=float,
)

# All-zero 8x8 grid used as the first test maze.
maze_test = np.zeros(shape=(8, 8))

# Wrap each grid in a MazeMap, ask it for its path to the end, and print the
# result. The all-zero grid goes first, then the hand-written one, so that
# `maze_map`, `path` and `distance` end up holding the values for `maze_test2`.
for _grid in (maze_test, maze_test2):
    maze_map = MazeMap(_grid)
    path, distance = maze_map.path_to_end()
    print(distance, path)
