# 2 Learning a policy via MC - Policy Iteration

In [None]:
import numpy as np

# inspired by: https://github.com/MJeremy2017/reinforcement-learning-implementation/blob/master/GridWorld/gridWorld.py
BOARD_ROWS = 3
BOARD_COLS = 5
# states = 3x5
WIN_STATE = (0, 2)
LOSE_STATE_I = (1, 0)
LOSE_STATE_II = (1, 4)
START = (2, 2)
WALL = (1, 2)
DETERMINISTIC = True

class GridWorld(): 
    def __init__(self, state=START):
        # Initialize GridWorld board
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])

        self.state = state
        self.det = DETERMINISTIC
        
    def giveReward(self):
        if self.state == WIN_STATE:
            return 1
        elif (self.state == LOSE_STATE_I) or (self.state == LOSE_STATE_II):
            return -1
        else:
            return 0

    def nextPosition(self, action):
        """
        action: up, down, left, right
        return: next position
        """
        if self.det:
            if action == "up":
                nextState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nextState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nextState = (self.state[0], self.state[1] - 1)
            else: 
                nextState = (self.state[0], self.state[1] + 1)
            # if next state legal aka. if field is free
            if (nextState[0] >= 0) and (nextState[0] <= (BOARD_ROWS -1)):  
                if (nextState[1] >= 0) and (nextState[1] <= (BOARD_COLS -1)):
                    if nextState != WALL:
                        return nextState
            return self.state

    def showBoard(self):
        """Show the GridWorld playfield in ASCII art"""
        self.board[self.state] = 1
        # Wall / barrier
        self.board[1,2] = -1
        # Win / Lose states
        self.board[WIN_STATE] = 2
        self.board[LOSE_STATE_I] = -2
        self.board[LOSE_STATE_II] = -2
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                # Player
                if self.board[i, j] == 1:
                    token = '*'
                # Wall / barrier
                if self.board[i, j] == -1:
                    token = 'z'
                # Free fields
                if self.board[i, j] == 0:
                    token = '0'
                # Win state
                if self.board[i, j] == 2:
                    token = 'W'
                # Lose state
                if self.board[i, j] == -2:
                    token = 'L'
                out += token + ' | '
            print(out)
        print('-----------------')

GW = GridWorld()

# 3 Implementing a policy
## 3.1 Implement the basic agent & 3.2 Evaluate the policy

In [None]:
class Agent():
    def __init__(self):
        self.GW = GridWorld()
        self.end = False
        # Set of actions
        self.action_list = ["up", "down", "left", "right"]

        # Add a dictionary to safe all state values for visualization later
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = [0, 0]

    def distanceToWin(self):
        ydistance = self.GW.state[0] - WIN_STATE[0]
        xdistance = self.GW.state[1] - WIN_STATE[1]
        return xdistance, ydistance
    
    def chooseAct(self):
        xd, yd = self.distanceToWin()
        
        if xd==0 and yd>0:
            action = self.action_list[0]
        elif xd==0 and yd<0:
            action = self.action_list[1]
        elif xd>0 and yd==0:
            action = self.action_list[2]
        elif xd<0 and yd==0:
            action = self.action_list[3]
        # If x and y > 0 choose the smaller distance
        elif xd>yd:
            if xd>0:
                action = self.action_list[2]
            if xd<0:
                action = self.action_list[3]
        else:
            if yd>0:
                action = self.action_list[0]
            if yd<0:
                action = self.action_list[1]
        return action
    
    # Just for showcase purposes of one episode with print commands
    def ExamplemakeAct(self):
        if np.random.rand() > 0.3:
            action = self.chooseAct()
        else:
            action = self.action_list[np.random.randint(0,4)]
        print("Current position {} action {}".format(self.GW.state, action))
        self.GW.state = self.GW.nextPosition(action)
        self.reward = self.GW.giveReward()
        print("Next state", self.GW.state)
        print("---------------------")
        if self.reward==1 or self.reward==-1:
            # Reset game/ end episode
            self.end = True
            print("Game end, reward", self.reward)
            print("---------------------")

    def policy(self): 
        if np.random.rand() > 0.1:
          action = self.chooseAct()
        else:
          action = self.action_list[np.random.randint(0,4)]
        return action

    
    def makeAct(self, action):
        # Choose action according to distance to goal in 90% of the time,
        # otherwise choose action random

        self.GW.state = self.GW.nextPosition(action)
        self.reward = self.GW.giveReward()
        if self.reward==1 or self.reward==-1:
            # End game
            self.end = True
        self.state_values[self.GW.state]
    
    def reset(self):
        self.GW = GridWorld()
        self.end = False

    def play(self, policy):
        i = 0
        # Create empty list to safe states where the agent was in this episode
        self.state_list = []
        self.state_list.append(self.GW.state)
        # Create empty dictionary to safe MC-estimates
        self.erg = {}

        while self.end==False:
            action = policy
            self.makeAct(action)
            self.state_list.append(self.GW.state)
            i += 1

        # Remove duplicates from list of states
        self.state_list = list(dict.fromkeys(self.state_list))

        for s in reversed(self.state_list):
            # Add reward and state to dictionary of state values
            self.state_values[s][0] += self.reward
            self.state_values[s][1] += 1
        
        for s in ag.state_values:
            try:
                # Safe MC-estimates per k-many visits in state to result dictionary
                self.erg[s] = round(self.state_values[s][0]/self.state_values[s][1],3)
            except:
                # Prevent division by 0
                self.erg[s] = 0

        # Useful for Bugfixing
        #print('Game end, reward', self.reward)
        #print('Turns:',i)
        #print("---------------------")
        self.reset()

    def showValues(self):
        """Show MC-estimates per state aka. field in GridWorld"""
        for i in range(0, BOARD_ROWS):
            print('----------------------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.erg[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------------------')
        
ag = Agent()
ag.ExamplemakeAct()
ag.play()

Current position (2, 2) action up
Next state (2, 2)
---------------------


# 4 Visualization (Optional)

In [None]:
GW.showBoard()

for k in range(1,10001):
    ag.play()
    # Show MC-estimates per 50, 200, 500, 1000, 10000 
    if k in [50,200,500,1000,10000]:
        print("k =",k)
        ag.showValues()

-----------------
| 0 | 0 | W | 0 | 0 | 
-----------------
| L | 0 | z | 0 | L | 
-----------------
| 0 | 0 | * | 0 | 0 | 
-----------------
k = 50
----------------------------------------------
| 0.762  | 0.959  | 1.0    | 0.966  | 0.784  | 
----------------------------------------------
| -1.0   | 0.775  | 0      | 0.784  | -1.0   | 
----------------------------------------------
| -0.826 | 0.632  | 0.636  | 0.639  | -0.897 | 
----------------------------------------------
k = 200
----------------------------------------------
| 0.765  | 0.959  | 1.0    | 0.966  | 0.785  | 
----------------------------------------------
| -1.0   | 0.775  | 0      | 0.784  | -1.0   | 
----------------------------------------------
| -0.822 | 0.632  | 0.636  | 0.638  | -0.896 | 
----------------------------------------------
k = 500
----------------------------------------------
| 0.764  | 0.959  | 1.0    | 0.966  | 0.787  | 
----------------------------------------------
| -1.0   | 0.776  | 0      | 0

In [1]:
agent = Agent()

def policy_iteration(epsilon = 0.01, agent):
  policy = agent.policy()
  q_table = np.random.rand(5,3,4) # Gridworld x Actions
  returns = dict.fromkeys(agent.state_values.keys(), (0,0))

  gamma = 0.999

  while True: 
    agent.play(policy) 
    G = 0 
    for t in agent.state_list.reverse(): 
      if t == 0:
        G = agent.reward 
      else: 
        G = gamma * G + 0  # reward only at last step of episode 
      returns(t)[0] += G 
      returns(t)[1] += 1 

      q_table(t[0],t[1], "action") = returns(t)[0] / returns(t)[1] #this is not working

      # @ToDo: we don't save the action but we need this now. 

      # policy is now außerhalb des agents, should rather be inside of agent 
      # how can we let agent choose according to highest q-value instead of minimal distance? 


      

      








SyntaxError: ignored