In [1]:
def code_to_move(code):
    if code==0:
        return "left"
    elif code==1:
        return "up"
    elif code==2:
        return "right"
    elif code==3:
        return "down"
    else:
        return code

In [2]:
import os
from numpy import genfromtxt

# Read mazes from .csv
def read_mazes(directory='./mazes/', num_mazes=10):
    mazes = []
    # Iterate over all csv files in the specified directory
    cur_count = 0
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            print(filepath)

            # Read csv into np array, then convert to MazeMap
            cur_maze = genfromtxt(filepath, delimiter=',')
            mazes.append(MazeMap(cur_maze))
        cur_count += 1
        if cur_count == num_mazes:
            break
            
        return mazes

In [3]:
import copy
class MazeSet:
    def __init__(self, maze_list):
        self.maze_list = maze_list
        self.len = len(maze_list)
        self.index = 0
        self.current = maze_list[0]
        self.map_history = [[] for i in range(self.len)]
    
    # Cycle to next maze in the training set
    def next(self):
        # Circular array
        self.index += 1
        if self.index == self.len:
            self.index = 0
        
        # Change current maze
        self.current = maze_list[self.index]
        
        # Return new maze
        return copy.deepcopy(self.current)
    
    # Return current maze
    def get_maze(self):
        return copy.deepcopy(self.current)
    
    # Add another maze to the training set
    def add(self, new_maze):
        self.maze_list.append(new_maze)
        self.len += 1
        self.map_history.append([])
    
    def record_history(self, num_episodes):
        self.map_history[self.index].append(num_episodes)

    def get_map_hist(self):
        return self.map_history[self.index]

In [None]:
from utils import build_model
from replay import Episode, ReplyBuffer
import numpy as np
from mazemap import Action, MazeMap, Mode
import tensorflowjs as tfjs

maze_test2 = np.array([
    [ 0., 1., 0., 0., 0., 0., 0., 0. ],
    [ 0., 0., 0., 1., 1., 0., 1., 0. ],
    [ 1., 1., 1., 0., 0., 0., 1., 0. ],
    [ 0., 0., 0., 0., 1., 1., 0., 0. ],
    [ 0., 1., 1., 1., 0., 0., 0., 1. ],
    [ 0., 1., 0., 0., 0., 0., 0., 1. ],
    [ 0., 0., 0., 1., 0., 0., 0., 1. ],
    [ 0., 0., 0., 1., 0., 0., 0., 0. ],
])

maze_test = np.zeros((8,8))


def start_train(model,
                maze_set: MazeSet, 
                num_epoch = 15000, 
                max_buffer = 1000, 
                sample_size = 50,
                gamma = 0.9,
                history_size = None,
                print_steps = False,
                load_path = None,
                save_path = None):
    global epsilon

    if save_path == None:
        save_path = 'maze_model'

    if load_path != None:
        print(f'Load weight from {load_path}')
        model.load_weights(load_path)

    maze = maze_set.get_maze()
    maze_map = maze

    replay_buf: ReplyBuffer = ReplyBuffer(model, maze_map.get_state_size(), max_buffer, gamma)

    history = []
    loss = 0.0
    if history_size:
        hsize = history_size
    else:
        hsize = maze.get_state_size() // 2
    
    print("Initialization complete, begin training")
    # Run training epoch
    for epoch in range(num_epoch):
        loss = 0.
        is_over = False

        curr_state = maze.observe()
        print(curr_state.shape)
        num_episode = 0
        if print_steps:
            mode = 'init'
        
        while not is_over:
            valid_actions = maze.get_valid_actions()
            #print("valid_actions:", valid_actions)
            if len(valid_actions) == 0:
                break

            # Explore
            action = np.random.choice(valid_actions)
            if np.random.rand() > epsilon:
                # Exploit
                action = np.argmax(replay_buf.predict(curr_state))
            action = Action(action)
            if print_steps:
                print("Old loc:",maze.curr_loc)
            prev_state = curr_state
            curr_state, reward, mode = maze.act(action)
            mode = Mode(mode)
            if print_steps:
                print(mode)
                print("New loc:",maze.curr_loc)
                print("chosen action:",code_to_move(action),"\treward:",reward)
                maze.print_maze()
                print()
            if mode == Mode.END:
                history.append(1)
                is_over = True
            elif mode == Mode.TERMINATED:
                history.append(0)
                is_over = True
            else:
                is_over = False

            episode = Episode(prev_state, curr_state, action, reward, mode)
            replay_buf.log(episode)
            num_episode += 1

            inputs, outputs = replay_buf.sampling(sample_size)
            train_history = model.fit(inputs, outputs, epochs=8, batch_size=16, verbose=0)
            loss = train_history.history['loss'][-1]
        
        win_rate = 0.0 if len(history) < hsize else np.sum(np.array(history[-hsize:])) / hsize

        print(f'Epoch {epoch}/{num_epoch} | Loss: {loss:.2f} | Episodes: {num_episode} | Win Count: {np.sum(np.array(history))} | Win Rate: {win_rate}')
    
        # Record number of episodes for the model for this epoch on the current map
        maze_set.record_history(num_episode)
    
        
        # Reset maze after epoch ends
        maze = maze_set.get_maze()
        
        if win_rate > 0.9:
            epsilon = 0.05
        
        if win_rate == 1.0:
            print('Reach 100% win rate')
            # Print the model's history on the map
            print(maze_set.get_map_hist())
            # Change to next maze in training set
            maze = maze_set.next()
            history = []
            
        if epoch % 15 == 0:
            h5file = save_path + ".h5"
            model.save_weights(h5file, overwrite=True)
            tfjs.converters.save_keras_model(model, './')
            
            print(f'Saved model in {save_path}')
            
        if print_steps:
            maze.print_maze()

    h5file = save_path + ".h5"
    model.save_weights(h5file, overwrite=True)        
    tfjs.converters.save_keras_model(model, './')
    print(f'Saved model in {save_path}')




# This hyperparamter is used to control the ratio of exploration and exploitation
epsilon = 0.1
maze_map = MazeMap(maze_test)
#maze_list = [maze_map, MazeMap(maze_test2)]
maze_list = read_mazes(num_mazes=5)

training_mazes = MazeSet(maze_list)
model = build_model(maze_test)
start_train(model, training_mazes, 300, 8 * maze_map.get_state_size(), history_size=5, print_steps=True)


['m1.csv', 'm10.csv', 'm2.csv', 'm3.csv', 'm4.csv', 'm5.csv', 'm6.csv', 'm7.csv', 'm8.csv', 'm9.csv']
./mazes/m1.csv
Initialization complete, begin training
(1, 64)
Old loc: (0, 0)
Mode.VALID
New loc: (1, 0)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░mm      ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (1, 0)
Mode.INVALID
New loc: (1, 0)
chosen action: Action.LEFT 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░mm      ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (1, 0)
Mode.VALID
New loc: (1, 1)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░  mm    ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc:

  return h5py.File(h5file)


Old loc: (0, 0)
Mode.INVALID
New loc: (0, 0)
chosen action: Action.LEFT 	reward: -10
░░░░░░░░░░░░░░░░░░
░mm██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (0, 0)
Mode.INVALID
New loc: (0, 0)
chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░mm██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (0, 0)
Mode.INVALID
New loc: (0, 0)
chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░mm██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (0, 0)
Mode.INVALID
New loc: (0, 0)
chosen action: Action.UP 	reward: -10
░░░░░░░░░░░░░░░░░░
░mm██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    

Old loc: (3, 4)
Mode.VALID
New loc: (4, 4)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██mm    ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (4, 4)
Mode.INVALID
New loc: (4, 4)
chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██mm    ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (4, 4)
Mode.VALID
New loc: (4, 5)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██  mm  ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (4, 5)
Mode.VALID
New loc: (4, 6)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    █

Old loc: (7, 2)
Mode.PREVIOUS
New loc: (7, 1)
chosen action: Action.LEFT 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░  mm  ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (7, 1)
Mode.PREVIOUS
New loc: (7, 2)
chosen action: Action.RIGHT 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░    mm██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (7, 2)
Mode.PREVIOUS
New loc: (7, 1)
chosen action: Action.LEFT 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░  mm  ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (7, 1)
Mode.TERMINATED
New loc: (7, 2)
chosen action: Action.RIGHT 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██

Old loc: (1, 2)
Mode.INVALID
New loc: (1, 2)
chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░    mm  ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░      mm██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██mm██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (2, 3)
Mode.VALID
New loc: (3, 3)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██mm  ██

Old loc: (0, 6)
Mode.INVALID
New loc: (0, 6)
chosen action: Action.RIGHT 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██        mm██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (0, 6)
Mode.INVALID
New loc: (0, 6)
chosen action: Action.DOWN 	reward: -10
░░░░░░░░░░░░░░░░░░
░  ██        mm██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (0, 6)
Mode.PREVIOUS
New loc: (0, 5)
chosen action: Action.LEFT 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██      mm  ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (0, 5)
Mode.PREVIOUS
New loc: (0, 6)
chosen action: Action.RIGHT 	reward: -5
░░░░░░░░░░░░░░░░░░
░  ██        mm██░
░        ██████  ░
░██  ██  ██████  ░
░    ██  

Old loc: (1, 1)
Mode.VALID
New loc: (1, 2)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░    mm  ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (1, 2)
Mode.VALID
New loc: (1, 3)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░      mm██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (1, 3)
Mode.VALID
New loc: (2, 3)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██mm██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (2, 3)
Mode.VALID
New loc: (3, 3)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██mm  ██

Old loc: (4, 5)
Mode.VALID
New loc: (4, 6)
chosen action: Action.RIGHT 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██    mm██░
░    ████████    ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (4, 6)
Mode.VALID
New loc: (5, 6)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████mm  ░
░██  ████      ██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (5, 6)
Mode.VALID
New loc: (6, 6)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██    ░
░██    ██      ██░
░    ████████    ░
░██  ████    mm██░
░      ██  ██  EE░
░░░░░░░░░░░░░░░░░░

Old loc: (6, 6)
Mode.VALID
New loc: (7, 6)
chosen action: Action.DOWN 	reward: -0.5
░░░░░░░░░░░░░░░░░░
░  ██          ██░
░        ██████  ░
░██  ██  ██████  ░
░    ██    ██ 