Implementation of policy iteration (Sutton and Barto, section 4.3, page 80)

In [2]:
import pandas as pd
import numpy as np

In [189]:
class Agent:

    '''
    Initializes the agent
    @param rows - The number of rows in the grid world
    @param columns - The number of columns in the grid world
    @param terminal_state - The final state the agent is trying to find the best path to
    '''
    def __init__(self, rows, columns, terminal_state, theta=0.05, gamma=0.05):
        # self.grid = np.random.rand(rows, columns) * -1 # Initializes
        self.rows = rows
        self.columns = columns
        self.values = np.zeros((rows, columns)) # Initializes all state values to zero
        self.policies = self.initializePolicies() # Initializes policies for each cell,
                                                  # policies[row][column] is an array where the value
                                                  # at index 0 is the probability that up is the optimal choice,
                                                  # 1 is right, 2 is down, 3 is left (clockwise around Cartesian)
        self.terminal_state = terminal_state # Where the end state is located on the grid
        self.theta = theta # A value close to zero signifying completion, determines the accuracy of the policy estimation
        self.gamma = gamma # A value signifying by how much to discount future rewards 

    '''
    Initializes the policies for each cell in the grid world
    '''
    def initializePolicies(self):
        policies = np.empty((self.rows, self.columns, 4))

        # Fill the array with random values that sum to 1
        for i in range(self.rows):
            for j in range(self.columns):
                policies[i, j] = np.random.dirichlet(np.ones(4))

        return policies

    def isValidState(self, coords):
        return coords[0] >= 0 and coords[0] < self.rows and coords[1] >= 0 and coords[1] < self.columns

    '''
    Gets the available successor states (necessary because our grid world has walls)
    @param coords - coordinates in tuple form (x, y) of our state
    @return a list of successors
    '''
    def getAvailableSuccessorStates(self, coords):
        successors = []
        
        # if coords[1] != 0:
        successors.append((coords[0], coords[1] - 1))

        # if coords[0] != self.rows - 1:
        successors.append((coords[0] + 1, coords[1]))

        # if coords[1] != self.columns - 1:
        successors.append((coords[0], coords[1] + 1))

        # if coords[0] != 0:
        successors.append((coords[0] - 1, coords[1]))

        return successors

    def getDiscountedValuesForSuccessors(self, state):
        values = np.zeros(4)
        
        successors = self.getAvailableSuccessorStates(state[0]) # Get possible next states
                
        value = 0
        for successor in range(len(successors)):
            if self.isValidState(successors[successor]):
                probability_successor_chosen = self.policies[state[0][0]][state[0][1]][successor]
                successor_value = -1 + self.gamma * self.values[state[0][0]][state[0][1]]
                values[successor] = probability_successor_chosen * successor_value
        
        return values

        
        

    '''
    Performs a policy evaluation step, updates the policy
    '''
    def policyEvaluation(self):
        while True:
            delta = 0 # This is the check to know when to stop evaluation
            for state in np.ndenumerate(self.values): # Iterate over states - in the form ((row, column), value)
                old_value = self.values[state[0]] # Get value of the state
                
                value = np.sum(self.getDiscountedValuesForSuccessors(state))

                self.values[state[0]] = value
                delta = max(delta, np.absolute(old_value - value))
            
            if delta < self.theta:
                break;

    def policyImprovement(self):
        while True:
            stable = True
            for state in np.ndenumerate(self.values):
                old_action = np.argmax(self.policies[state[0]])
                self.policies[state[0]] = self.getDiscountedValuesForSuccessors(state)
                action = np.argmax(self.policies[state[0]])
                if old_action != action:
                    stable = False
            
            if not stable:
                print(self.policies)
                self.policyEvaluation()
            else:
                break

In [190]:
a = Agent(3, 5, (2, 4))

In [191]:
a.values

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [193]:
a.policyImprovement()

[[[ 0.         -0.3029435  -0.19541966  0.        ]
  [-0.34271845 -0.35779106 -0.10351159  0.        ]
  [ 0.54583001  0.22693501  0.01756208  0.        ]
  [ 0.072609    0.72215668  0.0022686   0.        ]
  [ 0.17598584  0.04665875  0.          0.        ]]

 [[ 0.          0.00518371  0.22520556  0.09317663]
  [ 0.46887455  0.24848649  0.19197018  0.03018153]
  [ 0.12635245  0.16251736  0.50760859  0.14303434]
  [ 0.38114494  0.36293034  0.11694381  0.07849364]
  [ 0.19748518  0.69586354  0.          0.0380308 ]]

 [[ 0.          0.          0.13708995  0.33564127]
  [ 0.27488226  0.          0.22516992  0.30761268]
  [ 0.57352992  0.          0.19315201  0.01424366]
  [ 0.52989241  0.          0.28155099  0.00899104]
  [ 0.1922367   0.          0.          0.55162088]]]
[[[ 0.          0.29557824  0.19066856  0.        ]
  [ 0.32947335  0.34396345  0.09951116  0.        ]
  [-0.56828391 -0.23627047 -0.01828454  0.        ]
  [-0.07562232 -0.75212664 -0.00236275  0.        ]
  [-0.

KeyboardInterrupt: 

In [None]:
a.values

array([[-0.81552625, -1.02451344, -0.84228761, -0.9376301 , -0.11189752],
       [-0.56941323, -1.0525    , -1.0525    , -1.0525    , -0.42140923],
       [-0.08181357, -0.67126235, -0.54999137, -0.88176631, -0.14551153]])