In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

In [None]:
class GridWorld(object):
    """Small grid world containing one bomb cell and one gold cell.

    Positions are ``[x, y]`` pairs: ``x`` runs along the columns
    (``0 .. cols-1``) and ``y`` along the rows (``0 .. rows-1``).
    "UP" increases ``y`` and "DOWN" decreases it; a move that would leave the
    grid keeps the agent where it is. Every step costs 1 on top of the reward
    stored for the cell the agent ends up in (-10 for the bomb, +10 for the
    gold, 0 elsewhere).

    Parameters
    ----------
    rows, cols : int
        Grid dimensions (both default to 5).
    rand_move_prob : float
        Probability that a step executes a random *other* action instead of
        the requested one (default 0.2).
    bomb_position, gold_position : sequence of two ints
        ``(x, y)`` cells of the bomb and the gold.

    Raises
    ------
    ValueError
        If the grid is empty or the bomb/gold cell lies outside the grid.
    """

    #(dx, dy) displacement applied to the agent for each action name
    _MOVES = {"LEFT": (-1, 0), "RIGHT": (1, 0), "UP": (0, 1), "DOWN": (0, -1)}

    def __init__(self, rows=5, cols=5, rand_move_prob=0.2,
                 bomb_position=(3, 3), gold_position=(3, 4)):
        #basic properties of the grid
        self.rows = int(rows)
        self.cols = int(cols)
        if self.rows < 1 or self.cols < 1:
            raise ValueError("rows and cols must both be at least 1")
        self.num_cells = self.rows * self.cols
        self.rand_move_prob = rand_move_prob

        #choose initial position of the agent randomly within the first quadrant
        #(integer bounds; max(1, ...) keeps the range non-empty on grids smaller than 4)
        self.init_position = [np.random.randint(0, max(1, self.cols // 4)),
                              np.random.randint(0, max(1, self.rows // 4))]
        self.agent_position = np.asarray(self.init_position)

        #choose position of the bomb and of the gold
        self.bomb_position = np.asarray(bomb_position)
        self.gold_position = np.asarray(gold_position)
        for label, position in (("bomb", self.bomb_position), ("gold", self.gold_position)):
            if position.shape != (2,) or not self._in_bounds(position):
                raise ValueError("%s position must be an (x, y) cell inside the grid" % label)

        #set up reward matrix, indexed by (x, y) like the positions
        self.rewards = np.zeros(shape=(self.cols, self.rows))
        self.rewards[tuple(self.bomb_position)] = -10
        self.rewards[tuple(self.gold_position)] = 10

        #define possible actions
        self.actions = ["LEFT", "RIGHT", "UP", "DOWN"]
        self.num_actions = len(self.actions)

    def _in_bounds(self, position):
        """Return True if the ``(x, y)`` position lies inside the grid."""
        x, y = position
        return 0 <= x < self.cols and 0 <= y < self.rows

    def get_available_actions(self):
        """Return the list of action names; an action index refers to this list."""
        return self.actions

    def draw_internal_state(self):
        """Return a multi-line string picture of the grid.

        'O' marks the agent, 'B' the bomb, 'G' the gold and '#' any other
        cell. The row with ``y == 0`` is printed first, and the agent symbol
        wins if it shares a cell with the bomb or the gold.
        """
        string = ''
        for j in range(self.rows):
            for i in range(self.cols):
                pos = (i, j)
                if np.array_equal(pos, self.agent_position):
                    string += "O"
                elif np.array_equal(pos, self.bomb_position):
                    string += 'B'
                elif np.array_equal(pos, self.gold_position):
                    string += 'G'
                else:
                    string += "#"
            string += "\n"

        string += "\n"
        return string

    def make_step(self, action_index):
        """Move the agent and return ``(reward, new_position)``.

        With probability ``rand_move_prob`` the requested action is replaced
        by one of the other actions chosen uniformly at random. The returned
        reward is the reward of the cell reached minus a step cost of 1;
        ``new_position`` is the agent's position array after the move.
        """
        #Roll a dice and see if agent does a random move instead of the intended move
        if np.random.uniform(0, 1) < self.rand_move_prob:
            action_indices = np.arange(self.num_actions, dtype="int")
            action_indices = np.delete(action_indices, action_index)
            action_index = np.random.choice(action_indices, 1)[0]

        action = self.actions[action_index]
        candidate_position = self.agent_position + np.asarray(self._MOVES[action])
        #Hitting a wall leaves the agent where it was
        if self._in_bounds(candidate_position):
            self.agent_position = candidate_position

        reward = self.rewards[tuple(self.agent_position)] - 1
        return reward, self.agent_position

    def reset(self):
        """Put the agent back on the initial position drawn at construction."""
        self.agent_position = np.asarray(self.init_position)

    def isFinalState(self):
        """Return True when the agent stands on the bomb or on the gold."""
        return bool(np.array_equal(self.agent_position, self.bomb_position)
                    or np.array_equal(self.agent_position, self.gold_position))

In [None]:
class RandomAgent():
    """Baseline agent that picks every action uniformly at random."""

    def choose_action(self, available_actions):
        """Return a uniformly random index into ``available_actions``."""
        return np.random.randint(0, len(available_actions))

In [None]:
# Baseline: play 500 episodes with the random agent and record the total
# reward collected in each episode.
env = GridWorld()
agr = RandomAgent()
available_actions = env.actions
rand_reward_array = []
value = 0  # reward accumulated in the episode currently being played
k = 0      # number of finished episodes
while k < 500:
    action_index = agr.choose_action(available_actions)
    rew, pos = env.make_step(action_index)
    value += rew
    if not env.isFinalState():
        continue
    # The agent reached the bomb or the gold: store the return and start over.
    rand_reward_array.append(value)
    env.reset()
    value = 0
    k += 1

In [None]:
class QAgent(object):
    """Agent that selects actions epsilon-greedily from a Q-table."""

    def choose_action(self, available_actions, Q_table, state_index, eps):
        """Return the index of the action to take in ``state_index``.

        ``state_index`` is unpacked into two indices and the action values
        are read from ``Q_table[first, second]``. With probability ``eps`` a
        uniformly random action index is returned; otherwise one of the
        actions with the highest Q-value, with ties broken at random.
        """
        action_count = len(available_actions)
        first, second = state_index

        # Exploration branch: ignore the table altogether.
        if np.random.uniform(0, 1) < eps:
            return np.random.randint(0, action_count)

        # Exploitation branch: choose among all maximisers of the Q-values.
        q_values = Q_table[first, second]
        is_best = q_values == np.max(q_values)
        best_actions = np.nonzero(is_best)[0]
        return np.random.choice(best_actions)

In [None]:
# Tabular Q-learning on the grid world with an epsilon-greedy policy.
NUM_EPISODES = 500
MAX_STEPS_PER_EPISODE = 1000  # safety cap so one episode cannot run forever
EPSILON = 0.05  # probability of taking a random exploratory action

env = GridWorld()
agQ = QAgent()
available_actions = env.actions
# One Q-value per (x, y, action); sized from the environment instead of hard-coded.
Q_table = np.zeros((env.cols, env.rows, env.num_actions))
alpha = 0.1 #learning rate
gamma = 1 #discounted return factor
reward_array = []  # total reward collected in each episode

for episode in range(NUM_EPISODES):
    value = 0
    step = 0
    game_over = False
    while step < MAX_STEPS_PER_EPISODE and not game_over:
        statex, statey = env.agent_position
        # Pass the current (x, y) state; the original referenced an undefined `state`.
        action_index = agQ.choose_action(available_actions, Q_table, (statex, statey), EPSILON)
        rew, [new_statex, new_statey] = env.make_step(action_index)
        value += rew
        step += 1
        if env.isFinalState():
            game_over = True
        # Q-learning update: move Q(s, a) towards rew + gamma * max_a' Q(s', a').
        current_q_value = Q_table[statex, statey, action_index]
        max_q_value_in_new_state = np.max(Q_table[new_statex, new_statey])
        Q_table[statex, statey, action_index] = (1 - alpha) * current_q_value + alpha * (rew + gamma * max_q_value_in_new_state)
    reward_array.append(value)
    env.reset()

In [None]:
# Learning curve: total reward per episode for the Q-learning agent.
fig, ax = plt.subplots()
ax.plot(reward_array, label="Q-learning agent")
# Uncomment to overlay the random-agent baseline recorded earlier:
#ax.plot(rand_reward_array, label="Random agent")
ax.set(xlabel="Episode", ylabel="Total reward", title="Reward per episode")
ax.legend()
plt.show()