In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

In [None]:
class GridWorld(object):
    """Rectangular grid world containing one gold cell, one bomb cell and an agent.

    Cells are addressed by a single index ``row * nCols + col`` with row 0 at
    the top. Stepping onto the gold cell yields +10, onto the bomb -10, and
    every step additionally costs 1.

    Attributes:
        nCols, nRows: Grid dimensions (each at least 2).
        nCells: Total number of cells (``nCols * nRows``).
        random_move_prob: Probability that a requested action is replaced by
            one of the other actions.
        bomb, gold: Cell indices of the two terminal cells.
        reward: Array of length ``nCells`` holding the per-cell reward.
        agent: Cell index of the agent's current position.
        actions: List of action names; an action index refers into this list.
        nActions: Number of available actions.
    """

    def __init__(self, nCols=5, nRows=5, random_move_prob=0.2):
        """Create a grid with random bomb, gold and agent positions.

        Args:
            nCols: Number of columns.
            nRows: Number of rows.
            random_move_prob: Probability of a random action substitution in
                ``makeStep``.

        Raises:
            ValueError: If ``nCols`` or ``nRows`` is smaller than 2.
        """
        #Initialise values that define the grid and some other essential parameters
        self.nCols = nCols
        self.nRows = nRows
        if self.nRows < 2 or self.nCols < 2:
            raise ValueError("Number of columns and rows may not be smaller than 2")
        self.nCells = nCols * nRows
        self.random_move_prob = random_move_prob

        #Randomly choose bomb and gold position in the bottom half of the grid.
        #Integer division keeps the bounds integral instead of relying on
        #NumPy truncating a float start value.
        bomb_gold_array = np.arange(self.nCells // 2, self.nCells, 1, dtype=int)
        self.bomb = np.random.choice(bomb_gold_array)
        # The gold may not share a cell with the bomb.
        self.gold = np.random.choice(bomb_gold_array[bomb_gold_array != self.bomb])

        #Set reward array
        self.reward = np.zeros(self.nCells)
        self.reward[self.gold] = 10
        self.reward[self.bomb] = -10

        #Set initial position of the agent randomly in the upper quadrant
        self.reset()

        #Define possible actions
        self.actions = ["NORTH", "SOUTH", "WEST", "EAST"]
        self.nActions = len(self.actions)

    def get_available_actions(self):
        """Return the list of action names."""
        return self.actions

    def get_avaiable_actions(self):
        """Misspelled original name, kept so existing callers continue to work."""
        return self.get_available_actions()

    def isFinalPosition(self):
        """Return True if the agent currently stands on the gold or the bomb."""
        return self.agent in (self.gold, self.bomb)

    def reset(self):
        """Place the agent on a random cell in the upper-left quadrant."""
        ri = np.random.randint(0, self.nRows // 2)
        rj = np.random.randint(0, self.nCols // 2)
        self.agent = ri * self.nCols + rj

    def makeStep(self, action_index):
        """Move the agent one cell and return the outcome.

        With probability ``random_move_prob`` the requested action is replaced
        by one of the other actions. A move that would leave the grid keeps
        the agent where it is.

        Args:
            action_index: Index into ``self.actions``.

        Returns:
            Tuple ``(new_pos, reward)``: the agent's cell index after the move
            and the reward of that cell minus a step cost of 1.

        Raises:
            ValueError: If the selected action name is not one of NORTH,
                SOUTH, WEST or EAST.
        """
        if np.random.uniform(0, 1) < self.random_move_prob:
            # Pick uniformly among all actions except the requested one.
            action_indices = np.delete(np.arange(0, self.nActions, 1, dtype=int), action_index)
            action_index = np.random.choice(action_indices)
        action = self.actions[action_index]

        #test if proposed step would lead the agent outside of the grid. If yes, stay at current position.
        #If not, update agent position according to action selected.
        new_pos = self.agent

        if action == "NORTH":
            if self.agent - self.nCols >= 0:
                new_pos = self.agent - self.nCols
        elif action == "SOUTH":
            if self.agent + self.nCols < self.nCells:
                new_pos = self.agent + self.nCols
        elif action == "WEST":
            # Column 0 is the left edge; stepping west from there would wrap
            # into the previous row.
            if self.agent % self.nCols != 0:
                new_pos = self.agent - 1
        elif action == "EAST":
            # The last column is the right edge.
            if self.agent % self.nCols != self.nCols - 1:
                new_pos = self.agent + 1
        else:
            raise ValueError("Invalid action selected! Actions need to be NORTH, SOUTH, WEST or EAST")

        self.agent = new_pos

        #Calculate reward of the agent's new position (cell reward minus step cost)
        reward = self.reward[self.agent] - 1

        return new_pos, reward

    def internalStateASCII(self, agent_pos):
        """Render the grid as a multi-line string.

        Args:
            agent_pos: Cell index at which the agent marker is drawn.

        Returns:
            String with a dashed top and bottom border and one line per row.
            ``G`` marks the gold and ``B`` the bomb; the agent marker takes
            precedence over both.
        """
        border = "-" * (self.nCols + 2)
        lines = [border]
        for i in range(self.nRows):
            cells = []
            for j in range(self.nCols):
                index = i * self.nCols + j
                if index == agent_pos:
                    # NOTE: the agent is drawn with the same character as the side walls.
                    cells.append("|")
                elif index == self.gold:
                    cells.append("G")
                elif index == self.bomb:
                    cells.append("B")
                else:
                    cells.append(" ")
            lines.append("|" + "".join(cells) + "|")
        lines.append(border)
        return "\n".join(lines)
                
        

In [None]:
class RandomAgent(object):
    """Agent that picks its actions at random, without looking at the environment."""

    def chooseAction(self, nActions):
        """Return a randomly drawn action index in the range 0..nActions-1.

        Args:
            nActions: Number of actions to choose from.
        """
        candidates = np.arange(nActions, dtype=int)
        return np.random.choice(candidates)

In [None]:
# Demo: animate a random agent wandering over a 10x10 grid for 1000 steps.
gw = GridWorld(10,10)
ra = RandomAgent()
nActions = gw.nActions
for i in range(1000):
    actionIndex = ra.chooseAction(nActions)
    new_pos, rew = gw.makeStep(actionIndex)
    # NOTE(review): this loop never checks gw.isFinalPosition(), so the agent
    # keeps walking after reaching the gold or the bomb.
    print(gw.internalStateASCII(new_pos))
    # Pause for a tenth of a second per frame, then clear the cell output so
    # the next frame replaces this one.
    time.sleep(1/10)
    clear_output(wait=True)

In [None]:
class QAgent(object):
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Attributes:
        env: The environment the agent acts in; its ``agent`` attribute is
            read to find the current position.
        gamma: Discount factor applied to the best next-state value.
        alpha: Learning rate.
        eps: Probability of picking a random action instead of a greedy one.
        nActions: Number of actions, taken from ``env.nActions``.
        qTable: Array of shape ``(env.nCells, nActions)``, initialised to zero.
    """

    def __init__(self, env, gamma=1, alpha=0.1, eps=0.05):
        """Create an agent bound to ``env`` with an all-zero Q-table.

        Args:
            env: Environment exposing ``nActions``, ``nCells`` and ``agent``.
            gamma: Discount factor.
            alpha: Learning rate.
            eps: Exploration probability.
        """
        # Keep a reference to the environment so chooseAction reads the
        # position from it rather than from a global variable named `env`.
        self.env = env
        self.gamma = gamma
        self.alpha = alpha
        self.eps = eps
        self.nActions = env.nActions
        self.qTable = np.zeros(shape=(env.nCells, self.nActions))

    def chooseAction(self):
        """Return an action index for the environment's current position.

        With probability ``eps`` a random action is drawn; otherwise one of
        the actions with the highest Q-value is returned, ties being broken
        at random.
        """
        if np.random.uniform() < self.eps:
            return np.random.choice(np.arange(0, self.nActions, 1, dtype=int))

        q_values = self.qTable[self.env.agent]
        best_actions = np.where(q_values == max(q_values))[0]
        return np.random.choice(best_actions)

    def learn(self, old_pos, reward, new_pos, action_index):
        """Apply one Q-learning update for an observed transition.

        Args:
            old_pos: Cell index before the step.
            reward: Reward received for the step.
            new_pos: Cell index after the step.
            action_index: Index of the action that was taken.
        """
        td_target = reward + self.gamma * max(self.qTable[new_pos])
        td_error = td_target - self.qTable[old_pos, action_index]
        self.qTable[old_pos, action_index] += self.alpha * td_error

In [None]:
def play(env, agent, nGames=1000, maxSteps=1000):
    """Let ``agent`` play and learn for a number of games in ``env``.

    A game ends when the environment reports a final position or after
    ``maxSteps`` steps. The environment is reset after every game, so a game
    that ran out of steps does not leak its position into the next one.

    Args:
        env: Environment providing ``agent``, ``makeStep``, ``isFinalPosition``
            and ``reset``.
        agent: Learner providing ``chooseAction()`` and
            ``learn(old_pos, reward, new_pos, action_index)``.
        nGames: Number of games to play.
        maxSteps: Maximum number of steps per game.

    Returns:
        Array of length ``nGames`` with the summed reward of each game.
    """
    rewards = np.zeros(nGames)
    for i in range(nGames):
        for j in range(maxSteps):
            old_pos = env.agent
            action_index = agent.chooseAction()
            new_pos, reward = env.makeStep(action_index)
            agent.learn(old_pos, reward, new_pos, action_index)
            rewards[i] += reward
            if env.isFinalPosition():
                break
        # Start the next game from a fresh position, whether this one ended
        # on a terminal cell or hit the step limit.
        env.reset()
    return rewards

In [None]:
# Build two 10x10 grids that share the same bomb and gold layout: `env` for
# the Q-learning agent and `env2` for a random agent to compare against.
env = GridWorld(10, 10)
env2 = GridWorld(10, 10)
# Copy the reward array so the two environments do not share one array object.
env2.reward = env.reward.copy()
env2.gold = env.gold
env2.bomb = env.bomb

# Train the Q-learning agent and plot the summed reward of every game.
qa = QAgent(env)
rewards = play(env, qa)
plt.plot(rewards)
plt.xlabel("Game")
plt.ylabel("Total reward")
plt.title("Q-learning agent: total reward per game")
plt.show()

In [None]:
# Demo: animate the Q-learning agent (top grid, `env`) next to the random
# agent (bottom grid, `env2`) for 1000 steps. agent.learn is not called here,
# so the Q-table is not updated during this animation.
for i in range(1000):
    actionIndexQ = qa.chooseAction()
    actionIndexR = ra.chooseAction(env2.nActions)
    new_posQ, _ = env.makeStep(actionIndexQ)
    new_posR, _ = env2.makeStep(actionIndexR)
    # Send an agent back to a start position once it reaches the gold or bomb.
    # The frame below still draws the position returned by makeStep, i.e. the
    # terminal cell, not the position after the reset.
    if env.isFinalPosition():
        env.reset()
    if env2.isFinalPosition():
        env2.reset()
    string=''
    string+=env.internalStateASCII(new_posQ)+"\n"
    string+=env2.internalStateASCII(new_posR)
    print(string)
    # Pause for a fifth of a second per frame, then clear the cell output so
    # the next frame replaces this one.
    time.sleep(1/5)
    clear_output(wait=True)