# Install needed libraries

In [1]:
!pip install tensorflow==2.3.0
!pip install gym
!pip install -e . #setup project

#import needed libraries
import gym 
import random
import sys
import numpy as np
import math

#import game environement

import gym_MLAA
env = gym.make('maze-random-5x5-portals-v0')

states = env.observation_space
actions = env.action_space
print(states)
print(actions)

Obtaining file:///Users/Diego/Documents/DATAAI/X-INF581/gym_MLAA
Installing collected packages: gym-MLAA
  Attempting uninstall: gym-MLAA
    Found existing installation: gym-MLAA 0.1
    Uninstalling gym-MLAA-0.1:
      Successfully uninstalled gym-MLAA-0.1
  Running setup.py develop for gym-MLAA
Successfully installed gym-MLAA
pygame 2.1.2 (SDL 2.0.18, Python 3.8.3)
Hello from the pygame community. https://www.pygame.org/contribute.html
Box([0 0], [4 4], (2,), int64)
Discrete(4)


# Setting Env and Training Constants

In [2]:
'''
Defining the environment related constants
'''
# Number of discrete states (bucket) per state dimension
MAZE_SIZE = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int))
NUM_BUCKETS = MAZE_SIZE  # one bucket per grid

# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # ["N", "S", "E", "W"]
# Bounds for each discrete state
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))

'''
Learning related constants
'''
MIN_EXPLORE_RATE = 0.001
MIN_LEARNING_RATE = 0.2
DECAY_FACTOR = np.prod(MAZE_SIZE, dtype=float) / 10.0

'''
Defining the simulation related constants
'''
NUM_EPISODES = 50000
MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100
STREAK_TO_END = 100
SOLVED_T = np.prod(MAZE_SIZE, dtype=int)
#DEBUG_MODE = 0
RENDER_MAZE = True
ENABLE_RECORDING = True

'''
Creating a Q-Table for each state-action pair
'''
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)

'''
Begin simulation
'''
recording_folder = "/tmp/maze_q_learning"


# Define Helper Functions

In [3]:
def select_action(state, explore_rate,q_table):
    # Select a random action
    if random.random() < explore_rate:
        action = env.action_space.sample()
    # Select the action with the highest q
    else:
        action = int(np.argmax(q_table[state]))
    return action


def get_explore_rate(t):
    return max(MIN_EXPLORE_RATE, min(0.8, 1.0 - math.log10((t+1)/DECAY_FACTOR)))


def get_learning_rate(t):
    return max(MIN_LEARNING_RATE, min(0.8, 1.0 - math.log10((t+1)/DECAY_FACTOR)))


def state_to_bucket(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            bucket_index = 0
        elif state[i] >= STATE_BOUNDS[i][1]:
            bucket_index = NUM_BUCKETS[i] - 1
        else:
            # Mapping the state bounds to the bucket array
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            offset = (NUM_BUCKETS[i]-1)*STATE_BOUNDS[i][0]/bound_width
            scaling = (NUM_BUCKETS[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)

In [4]:
#different goals
CLOSE_GOAL = np.array((1,1))
MID_GOAL = (np.array(env.maze_view.maze_size)/2).astype(int)
FAR_GOAL = np.array((env.maze_view.maze_size[0]-1,env.maze_view.maze_size[1]-1)) #Far away

#Store all results
Goals = [CLOSE_GOAL,MID_GOAL,FAR_GOAL]

# Q Learning

In [5]:
Results = []
for goal in Goals:
    #to store the results for current goal
    Q_timeSteps_list = []
    Q_TotalReward_list = []
    Q_Episodes_list = []
    #restarting the q_table
    q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)
    #changing goals
    env.maze_view.change_goal(goal)
    # Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.99

    num_streaks = 0
    # Render tha maze
    env.render()
    
    for episode in range(NUM_EPISODES):

        # Reset the environment
        obv = env.reset()

        # the initial state
        state_0 = state_to_bucket(obv)
        action = select_action(state_0, explore_rate,q_table)
        total_reward = 0

        for t in range(MAX_T):

            # execute the action
            obv, reward, done, _ = env.step(action)

            # Observe the result
            state = state_to_bucket(obv)
            total_reward += reward

            # Select an action
            action = select_action(state_0, explore_rate,q_table)

            # Update the Q based on the result
            best_q = np.amax(q_table[state])
            q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * (best_q) - q_table[state_0 + (action,)])
            #q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * q_table[state_0 + (action,)] - q_table[state_0 + (action,)])

            # Setting up for the next iteration
            state_0 = state

            # Render tha maze
#             if episode==0:
            if RENDER_MAZE:
                env.render()

            if env.is_game_over():
                sys.exit()

            if done:
                print("Episode %d finished after %f time steps with total reward = %f (streak %d)."
                      % (episode, t, total_reward, num_streaks))
                Q_Episodes_list.append(episode)
                Q_timeSteps_list.append(t)
                Q_TotalReward_list.append(total_reward)

                if t <= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

            elif t >= MAX_T - 1:
                print("Episode %d timed out at %d with total reward = %f."
                      % (episode, t, total_reward))

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)
    Results.append(( Q_timeSteps_list,Q_TotalReward_list,Q_Episodes_list ))

Episode 0 finished after 188.000000 time steps with total reward = 0.248000 (streak 0).
Episode 1 finished after 395.000000 time steps with total reward = -0.580000 (streak 0).
Episode 2 finished after 108.000000 time steps with total reward = 0.568000 (streak 0).
Episode 3 finished after 253.000000 time steps with total reward = -0.012000 (streak 0).
Episode 4 finished after 118.000000 time steps with total reward = 0.528000 (streak 0).
Episode 5 finished after 117.000000 time steps with total reward = 0.532000 (streak 0).
Episode 6 finished after 248.000000 time steps with total reward = 0.008000 (streak 0).
Episode 7 finished after 158.000000 time steps with total reward = 0.368000 (streak 0).
Episode 8 finished after 177.000000 time steps with total reward = 0.292000 (streak 0).
Episode 9 finished after 87.000000 time steps with total reward = 0.652000 (streak 0).
Episode 10 finished after 51.000000 time steps with total reward = 0.796000 (streak 0).
Episode 11 finished after 125.0

Episode 93 finished after 16.000000 time steps with total reward = 0.936000 (streak 63).
Episode 94 finished after 16.000000 time steps with total reward = 0.936000 (streak 64).
Episode 95 finished after 16.000000 time steps with total reward = 0.936000 (streak 65).
Episode 96 finished after 16.000000 time steps with total reward = 0.936000 (streak 66).
Episode 97 finished after 16.000000 time steps with total reward = 0.936000 (streak 67).
Episode 98 finished after 16.000000 time steps with total reward = 0.936000 (streak 68).
Episode 99 finished after 16.000000 time steps with total reward = 0.936000 (streak 69).
Episode 100 finished after 16.000000 time steps with total reward = 0.936000 (streak 70).
Episode 101 finished after 16.000000 time steps with total reward = 0.936000 (streak 71).
Episode 102 finished after 16.000000 time steps with total reward = 0.936000 (streak 72).
Episode 103 finished after 16.000000 time steps with total reward = 0.936000 (streak 73).
Episode 104 finis

Episode 186 finished after 10.000000 time steps with total reward = 0.960000 (streak 56).
Episode 187 finished after 10.000000 time steps with total reward = 0.960000 (streak 57).
Episode 188 finished after 10.000000 time steps with total reward = 0.960000 (streak 58).
Episode 189 finished after 10.000000 time steps with total reward = 0.960000 (streak 59).
Episode 190 finished after 10.000000 time steps with total reward = 0.960000 (streak 60).
Episode 191 finished after 10.000000 time steps with total reward = 0.960000 (streak 61).
Episode 192 finished after 10.000000 time steps with total reward = 0.960000 (streak 62).
Episode 193 finished after 10.000000 time steps with total reward = 0.960000 (streak 63).
Episode 194 finished after 10.000000 time steps with total reward = 0.960000 (streak 64).
Episode 195 finished after 10.000000 time steps with total reward = 0.960000 (streak 65).
Episode 196 finished after 10.000000 time steps with total reward = 0.960000 (streak 66).
Episode 19

Episode 49 finished after 10.000000 time steps with total reward = 0.960000 (streak 11).
Episode 50 finished after 10.000000 time steps with total reward = 0.960000 (streak 12).
Episode 51 finished after 10.000000 time steps with total reward = 0.960000 (streak 13).
Episode 52 finished after 10.000000 time steps with total reward = 0.960000 (streak 14).
Episode 53 finished after 10.000000 time steps with total reward = 0.960000 (streak 15).
Episode 54 finished after 10.000000 time steps with total reward = 0.960000 (streak 16).
Episode 55 finished after 10.000000 time steps with total reward = 0.960000 (streak 17).
Episode 56 finished after 10.000000 time steps with total reward = 0.960000 (streak 18).
Episode 57 finished after 10.000000 time steps with total reward = 0.960000 (streak 19).
Episode 58 finished after 10.000000 time steps with total reward = 0.960000 (streak 20).
Episode 59 finished after 10.000000 time steps with total reward = 0.960000 (streak 21).
Episode 60 finished a

Episode 2 finished after 102.000000 time steps with total reward = 0.592000 (streak 0).
Episode 3 finished after 232.000000 time steps with total reward = 0.072000 (streak 0).
Episode 4 finished after 241.000000 time steps with total reward = 0.036000 (streak 0).
Episode 5 finished after 247.000000 time steps with total reward = 0.012000 (streak 0).
Episode 6 finished after 216.000000 time steps with total reward = 0.136000 (streak 0).
Episode 7 finished after 223.000000 time steps with total reward = 0.108000 (streak 0).
Episode 8 finished after 352.000000 time steps with total reward = -0.408000 (streak 0).
Episode 9 finished after 13.000000 time steps with total reward = 0.948000 (streak 0).
Episode 10 finished after 261.000000 time steps with total reward = -0.044000 (streak 1).
Episode 11 finished after 45.000000 time steps with total reward = 0.820000 (streak 0).
Episode 12 finished after 177.000000 time steps with total reward = 0.292000 (streak 0).
Episode 13 finished after 53.

Episode 98 finished after 8.000000 time steps with total reward = 0.968000 (streak 31).
Episode 99 finished after 8.000000 time steps with total reward = 0.968000 (streak 32).
Episode 100 finished after 8.000000 time steps with total reward = 0.968000 (streak 33).
Episode 101 finished after 8.000000 time steps with total reward = 0.968000 (streak 34).
Episode 102 finished after 8.000000 time steps with total reward = 0.968000 (streak 35).
Episode 103 finished after 8.000000 time steps with total reward = 0.968000 (streak 36).
Episode 104 finished after 8.000000 time steps with total reward = 0.968000 (streak 37).
Episode 105 finished after 8.000000 time steps with total reward = 0.968000 (streak 38).
Episode 106 finished after 8.000000 time steps with total reward = 0.968000 (streak 39).
Episode 107 finished after 8.000000 time steps with total reward = 0.968000 (streak 40).
Episode 108 finished after 8.000000 time steps with total reward = 0.968000 (streak 41).
Episode 109 finished af

# Plot Q-Learning Results

In [6]:
import os
titles = ['Close','Mid','Far']
path = "data/Q_learning/Portals/"
# path = "data/Q_learning/NoPortals/"
i=0
for tuple_ in Results:
    #save into files
    if not os.path.exists(path):
        os.makedirs(path)
    np.save(path+f"{titles[i]}",tuple_)
    i+=1

# SARSA

In [7]:
Results = []
for goal in Goals:
    #to store the results for current goal
    S_timeSteps_list = []
    S_TotalReward_list = []
    S_Episodes_list = []
    #restarting the q_table
    q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)
    #changing goals
    env.maze_view.change_goal(goal)
    # Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.99

    num_streaks = 0

    # Render tha maze
    env.render()

    for episode in range(NUM_EPISODES):

        # Reset the environment
        obv = env.reset()

        # the initial state
        state_0 = state_to_bucket(obv)
        total_reward = 0

        for t in range(MAX_T):

            # execute the action
            obv, reward, done, _ = env.step(action)

            # Observe the result
            state = state_to_bucket(obv)
            total_reward += reward

            # Select an action
            action_new = select_action(state, explore_rate,q_table)

            # Update the Q based on the result
            #best_q = np.amax(q_table[state])
            #q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * (best_q) - q_table[state_0 + (action,)])
            q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * q_table[state + (action_new,)] - q_table[state_0 + (action,)])

            # Setting up for the next iteration
            state_0 = state
            action = action_new

            # Render tha maze
            if RENDER_MAZE:
                env.render()

            if env.is_game_over():
                sys.exit()

            if done:
                print("Episode %d finished after %f time steps with total reward = %f (streak %d)."
                      % (episode, t, total_reward, num_streaks))
                S_Episodes_list.append(episode)
                S_timeSteps_list.append(t)
                S_TotalReward_list.append(total_reward)

                if t <= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

            elif t >= MAX_T - 1:
                print("Episode %d timed out at %d with total reward = %f."
                      % (episode, t, total_reward))

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)
    Results.append(( S_timeSteps_list,S_TotalReward_list,S_Episodes_list ))

Episode 0 finished after 139.000000 time steps with total reward = 0.444000 (streak 0).
Episode 1 finished after 203.000000 time steps with total reward = 0.188000 (streak 0).
Episode 2 finished after 81.000000 time steps with total reward = 0.676000 (streak 0).
Episode 3 finished after 73.000000 time steps with total reward = 0.708000 (streak 0).
Episode 4 finished after 21.000000 time steps with total reward = 0.916000 (streak 0).
Episode 5 finished after 43.000000 time steps with total reward = 0.828000 (streak 1).
Episode 6 finished after 54.000000 time steps with total reward = 0.784000 (streak 0).
Episode 7 finished after 9.000000 time steps with total reward = 0.964000 (streak 0).
Episode 8 finished after 16.000000 time steps with total reward = 0.936000 (streak 1).
Episode 9 finished after 9.000000 time steps with total reward = 0.964000 (streak 2).
Episode 10 finished after 11.000000 time steps with total reward = 0.956000 (streak 3).
Episode 11 finished after 68.000000 time s

Episode 95 finished after 8.000000 time steps with total reward = 0.968000 (streak 83).
Episode 96 finished after 8.000000 time steps with total reward = 0.968000 (streak 84).
Episode 97 finished after 8.000000 time steps with total reward = 0.968000 (streak 85).
Episode 98 finished after 8.000000 time steps with total reward = 0.968000 (streak 86).
Episode 99 finished after 8.000000 time steps with total reward = 0.968000 (streak 87).
Episode 100 finished after 8.000000 time steps with total reward = 0.968000 (streak 88).
Episode 101 finished after 8.000000 time steps with total reward = 0.968000 (streak 89).
Episode 102 finished after 8.000000 time steps with total reward = 0.968000 (streak 90).
Episode 103 finished after 8.000000 time steps with total reward = 0.968000 (streak 91).
Episode 104 finished after 8.000000 time steps with total reward = 0.968000 (streak 92).
Episode 105 finished after 8.000000 time steps with total reward = 0.968000 (streak 93).
Episode 106 finished after

Episode 75 finished after 9.000000 time steps with total reward = 0.964000 (streak 67).
Episode 76 finished after 9.000000 time steps with total reward = 0.964000 (streak 68).
Episode 77 finished after 9.000000 time steps with total reward = 0.964000 (streak 69).
Episode 78 finished after 9.000000 time steps with total reward = 0.964000 (streak 70).
Episode 79 finished after 9.000000 time steps with total reward = 0.964000 (streak 71).
Episode 80 finished after 9.000000 time steps with total reward = 0.964000 (streak 72).
Episode 81 finished after 9.000000 time steps with total reward = 0.964000 (streak 73).
Episode 82 finished after 9.000000 time steps with total reward = 0.964000 (streak 74).
Episode 83 finished after 9.000000 time steps with total reward = 0.964000 (streak 75).
Episode 84 finished after 9.000000 time steps with total reward = 0.964000 (streak 76).
Episode 85 finished after 9.000000 time steps with total reward = 0.964000 (streak 77).
Episode 86 finished after 9.0000

Episode 60 finished after 69.000000 time steps with total reward = 0.724000 (streak 51).
Episode 61 finished after 7.000000 time steps with total reward = 0.972000 (streak 0).
Episode 62 finished after 7.000000 time steps with total reward = 0.972000 (streak 1).
Episode 63 finished after 7.000000 time steps with total reward = 0.972000 (streak 2).
Episode 64 finished after 7.000000 time steps with total reward = 0.972000 (streak 3).
Episode 65 finished after 7.000000 time steps with total reward = 0.972000 (streak 4).
Episode 66 finished after 7.000000 time steps with total reward = 0.972000 (streak 5).
Episode 67 finished after 7.000000 time steps with total reward = 0.972000 (streak 6).
Episode 68 finished after 7.000000 time steps with total reward = 0.972000 (streak 7).
Episode 69 finished after 7.000000 time steps with total reward = 0.972000 (streak 8).
Episode 70 finished after 7.000000 time steps with total reward = 0.972000 (streak 9).
Episode 71 finished after 7.000000 time s

Episode 153 finished after 7.000000 time steps with total reward = 0.972000 (streak 92).
Episode 154 finished after 7.000000 time steps with total reward = 0.972000 (streak 93).
Episode 155 finished after 7.000000 time steps with total reward = 0.972000 (streak 94).
Episode 156 finished after 7.000000 time steps with total reward = 0.972000 (streak 95).
Episode 157 finished after 7.000000 time steps with total reward = 0.972000 (streak 96).
Episode 158 finished after 7.000000 time steps with total reward = 0.972000 (streak 97).
Episode 159 finished after 8.000000 time steps with total reward = 0.968000 (streak 98).
Episode 160 finished after 7.000000 time steps with total reward = 0.972000 (streak 99).
Episode 161 finished after 7.000000 time steps with total reward = 0.972000 (streak 100).


In [8]:
#saving results
titles = ['Close','Mid','Far']
path = "data/SARSA/Portals/"
# path = "data/SARSA/NoPortals/"
i=0
for tuple_ in Results:
    #save into files
    if not os.path.exists(path):
        os.makedirs(path)
    np.save(path+f"{titles[i]}",tuple_)
    i+=1

# DQN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
class Replay(object):
    #memory for training the DQN
    def __init__(self):
        self.max_size = 100
        self.transitions = []  # (old state, new state, action, reward)

    def store(self, s0, s, a, r):
        self.transitions.append((s0, s, a, r))
        if len(self.transitions) > self.max_size:
            self.transitions.pop(0)

    def sample(self):
        return self.transitions[random.randint(0, len(self.transitions) - 1)]


class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.layer1 = nn.LSTM(2, 4, num_layers=32)
        self.layer2 = nn.LSTM(4, 8, num_layers=64)
        self.layer3 = nn.LSTM(8, 16, num_layers=64)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(16, 4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x, h = self.layer1(x)
        x=self.relu(x)
        x,h=self.layer2(x)
        x=self.relu(x)
        x,h=self.layer3(x)
        x = self.relu(x)
        x = self.fc(x)
        # x = self.sigmoid(x)
        return x

    def test(self):
        pass

    def select_action(self, state, er, device):
        # Select a random action
        if random.random() < er:
            action = env.action_space.sample()
        # Select the action with the highest q
        else:
            with torch.no_grad():
                state = torch.FloatTensor([state[0], state[1]]).view(1, 1, len(state)).to(device)
                val, action = self.forward(state)[0][0].max(0)
                action = action.item()
        return action

In [None]:

# Instantiating the learning related parameters
discount_factor = 0.99
update_target = 5

num_streaks = 0

# Render the maze
env.render()
dqn = DQN()
device = torch.device('cpu')
dqn = dqn.to(device)
target = DQN()
target = target.to(device)
dqn_opt = optim.Adam(dqn.parameters())
dqn_opt.zero_grad()
replay = Replay()
loss = nn.MSELoss()

for episode in range(NUM_EPISODES):
    # Reset the environment
    obv = env.reset()

    # the initial state
    s0 = state_to_bucket(obv)
    total_reward = 0
    total_loss = 0

    # Update parameters
    explore_rate = get_explore_rate(episode)
    learning_rate = get_learning_rate(episode)

    for t in range(MAX_T):
        # Select an action
        action = dqn.select_action(s0, explore_rate, device)

        # execute the action
        obv, reward, done, _ = env.step(action)

        # Observe the result
        s = state_to_bucket(obv)
        total_reward += reward

        # store the transition
        replay.store(s0, s, action, reward)

        # update the dqn
        r_s0, r_s, r_a, r_r = replay.sample()
        r_s0 = torch.FloatTensor([r_s0[0], r_s0[1]]).view(1, 1, len(r_s0)).to(device)
        r_s = torch.FloatTensor([r_s[0], r_s[1]]).view(1, 1, len(r_s)).to(device)
        dqn_output =dqn.forward(r_s0)
        target_output=target.forward(r_s)
        dqn_opt.zero_grad()
        l = loss(dqn_output, target_output)
        l.backward()
        dqn_opt.step()
        total_loss+=l.item()

        # update the target network
        if episode % update_target == 0:
            target.load_state_dict(dqn.state_dict())

        # Setting up for the next iteration
        s0 = s

        # Render tha maze
        if RENDER_MAZE:
            env.render()

        if env.is_game_over():
            sys.exit()

        if done:
            print("Episode %d finished after %f time steps with total reward = %f, loss = %f (streak %d)."
                  % (episode, t, total_reward, total_loss, num_streaks))

            if t <= SOLVED_T:
                num_streaks += 1
            else:
                num_streaks = 0
            break

        elif t >= MAX_T - 1:
            print("Episode %d timed out at %d with total reward = %f."
                  % (episode, t, total_reward))

    # It's considered done when it's solved over 120 times consecutively
    if num_streaks > STREAK_TO_END:
        break
