In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()

# Grid dimensions; State allocates its board array with this shape.
BOARD_ROWS = 3
BOARD_COLS = 4
# Terminal cells as (row, col). State.giveReward pays REWARD on WIN_STATE and
# PENALTY on LOSE_STATE; State.isEndFunc flags either one as the episode end.
WIN_STATE = (0, 3)
LOSE_STATE = (2, 3)

# Payoffs returned by State.giveReward for the two terminal cells.
REWARD = 1
PENALTY = -1000


class State:
    """Agent position on the grid world plus the stochastic movement rules.

    ``board`` is a ``BOARD_ROWS`` x ``BOARD_COLS`` array of zeros in which the
    blocked cell is marked with ``-1``.  Movement legality is derived from
    that array (its shape and the ``-1`` marker) rather than from hard-coded
    bounds, so the grid size is defined in exactly one place.
    """

    # Row/column offset applied by each action.
    _MOVES = {
        "up": (-1, 0),
        "down": (1, 0),
        "left": (0, -1),
        "right": (0, 1),
    }
    # Perpendicular actions the environment may slip to instead.
    _SLIPS = {
        "up": ("left", "right"),
        "down": ("left", "right"),
        "left": ("up", "down"),
        "right": ("up", "down"),
    }

    def __init__(self, state=None):
        """Create a state at ``state``; draw a random start cell when it is None."""
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[1, 1] = -1  # blocked cell
        if state is None:
            self.state = self.random_START()
        else:
            self.state = state
        self.isEnd = False

    def random_START(self):
        """Return a random (row, col) that is neither blocked nor terminal."""
        while True:
            i = random.randrange(BOARD_ROWS)
            j = random.randrange(BOARD_COLS)
            if self.board[i, j] == -1:
                continue
            if (i, j) not in (WIN_STATE, LOSE_STATE):
                return (i, j)

    def giveReward(self):
        """Return the payoff of the current cell, announcing terminal hits.

        Prints ``reward!`` / ``penalty!`` and returns ``REWARD`` / ``PENALTY``
        on the terminal cells; returns 0 everywhere else.
        """
        if self.state == WIN_STATE:
            print("reward!")
            return REWARD
        if self.state == LOSE_STATE:
            print("penalty!")
            return PENALTY
        return 0

    def isEndFunc(self):
        """Set ``isEnd`` to True when the current cell is a terminal cell."""
        if self.state in (WIN_STATE, LOSE_STATE):
            self.isEnd = True

    def _chooseActionProb(self, action):
        """Return the action actually executed for the intended ``action``.

        The intended action is kept with probability 0.8; each of the two
        perpendicular actions is taken with probability 0.1.

        Raises:
            ValueError: if ``action`` is not one of up/down/left/right.
        """
        if action not in self._SLIPS:
            raise ValueError(f"unknown action: {action!r}")
        outcomes = [action, *self._SLIPS[action]]
        # str(...) turns numpy's string scalar into a plain Python str.
        return str(np.random.choice(outcomes, p=[0.8, 0.1, 0.1]))

    def nxtPosition(self, action):
        """Return the cell reached by attempting ``action`` from the current cell.

        The move is non-deterministic (see ``_chooseActionProb``).  When the
        target lies outside the board or on the blocked cell, the current
        position is returned unchanged.
        """
        # non-deterministic
        action = self._chooseActionProb(action)

        d_row, d_col = self._MOVES[action]
        row, col = self.state[0] + d_row, self.state[1] + d_col

        # if next state is legal
        n_rows, n_cols = self.board.shape
        on_board = 0 <= row < n_rows and 0 <= col < n_cols
        if on_board and self.board[row, col] != -1:
            return (row, col)
        return self.state



class Agent:
    """Tabular Q-learning agent for the grid world defined by ``State``.

    Actions are sampled with probability proportional to ``k ** Q(s, a)``,
    and Q-values are updated with a per-pair learning rate that decays with
    the number of times the (state, action) pair has been tried.
    """

    def __init__(self):
        """Start at a random cell with all Q-values and visit counts at 0."""
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.isEnd = self.State.isEnd
        self.lr = 0  # learning rate of the most recent update

        self.gamma = 0.95  # discount factor
        self.k = 1.5  # base of the exploration weights k ** Q

        # initial Q values: dict of dict, state -> action -> value
        self.Q_values = {}
        self.Q_values_count = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {a: 0 for a in self.actions}
                self.Q_values_count[(i, j)] = {a: 0 for a in self.actions}

    def bestAction(self, state):
        """Return ``(action, value)`` for the highest Q-value at ``state``.

        Ties go to the action listed last in ``self.actions``.  The running
        maximum starts at -inf so that a state whose Q-values are all
        negative still yields its least-bad action instead of failing.
        """
        action = None
        mx_nxt_reward = float("-inf")
        for a in self.actions:
            nxt_reward = self.Q_values[state][a]
            if nxt_reward >= mx_nxt_reward:
                action = a
                mx_nxt_reward = nxt_reward
        return action, mx_nxt_reward

    def chooseAction(self):
        """Sample an action with probability proportional to ``k ** Q(s, a)``."""
        current_position = self.State.state
        q_row = np.array(
            [self.Q_values[current_position][a] for a in self.actions],
            dtype=float,
        )
        # k ** Q == exp(Q * ln k).  Shifting by the largest exponent leaves
        # the normalised probabilities unchanged but keeps the largest weight
        # at exactly 1, so the sum can never underflow to 0 or overflow.
        logits = q_row * np.log(self.k)
        weights = np.exp(logits - logits.max())
        Prob_a_givenState = weights / weights.sum()

        return random.choices(
            population=self.actions, weights=Prob_a_givenState, k=1
        )[0]

    def takeAction(self, action):
        """Return a new ``State`` at the cell reached by attempting ``action``."""
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        """Begin a new episode from a fresh ``State``; learned values are kept."""
        self.State = State()
        self.isEnd = self.State.isEnd

    def showValues(self):
        """Draw a heatmap of the greedy value of every cell.

        The two terminal cells are shown with their payoff (``REWARD`` /
        ``PENALTY``) instead of their Q-values.
        """
        State_matrix = np.zeros((BOARD_ROWS, BOARD_COLS))
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                _, V = self.bestAction((i, j))
                State_matrix[i, j] = V

        State_matrix[WIN_STATE[0], WIN_STATE[1]] = REWARD
        State_matrix[LOSE_STATE[0], LOSE_STATE[1]] = PENALTY
        sns.heatmap(State_matrix, annot=True)

    def play(self, rounds=300):
        """Run ``rounds`` environment steps, updating Q-values after each one.

        Each step samples an action, moves, and blends the old Q-value with
        the target ``reward`` (terminal step) or ``reward + gamma * max Q``
        (non-terminal step).  Reaching a terminal cell starts a new episode.
        """
        for _ in range(rounds):
            current_state = self.State.state
            action = self.chooseAction()
            self.State = self.takeAction(action)

            self.Q_values_count[current_state][action] += 1

            reward = self.State.giveReward()
            next_state = self.State.state

            self.State.isEndFunc()
            self.isEnd = self.State.isEnd
            terminal = self.isEnd

            # The learning rate is refreshed on every step, so a terminal
            # update never reuses the rate of an earlier, unrelated pair.
            self.lr = 1 / (1 + self.Q_values_count[current_state][action])

            if terminal:
                target = reward
            else:
                _, V = self.bestAction(next_state)
                target = reward + self.gamma * V

            previous = self.Q_values[current_state][action]
            self.Q_values[current_state][action] = (
                (1 - self.lr) * previous + self.lr * target
            )

            if terminal:
                self.reset()
            
            
            

            

if __name__ == "__main__":
    # Train a fresh agent and dump its Q-table before and after the run.
    ag = Agent()
    print("initial Q-values ... \n", ag.Q_values, sep="\n")

    ag.play(rounds=10000)
    print("latest Q-values ... \n", ag.Q_values, sep="\n")

initial Q-values ... 

{(0, 0): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (0, 1): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (0, 2): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (0, 3): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 0): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 1): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 2): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 3): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 0): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 1): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 2): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 3): {'up': 0, 'down': 0, 'left': 0, 'right': 0}}
reward!
penalty!
reward!
reward!
reward!
penalty!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
penalty!
penalty!
reward!
reward!
reward!
reward!
reward!
penalty!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!
reward!