# 2 Learning a policy via MC - Policy Iteration

In [1]:
import numpy as np

# inspired by: https://github.com/MJeremy2017/reinforcement-learning-implementation/blob/master/GridWorld/gridWorld.py
BOARD_ROWS = 3
BOARD_COLS = 5
# states = 3x5
WIN_STATE = (0, 2)
LOSE_STATE_I = (1, 0)
LOSE_STATE_II = (1, 4)
START = (2, 2)
WALL = (1, 2)
DETERMINISTIC = True

class GridWorld(): 
    def __init__(self, state=START):
        # Initialize GridWorld board
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])

        self.state = state
        self.det = DETERMINISTIC
        
    def giveReward(self):
        if self.state == WIN_STATE:
            return 1
        elif (self.state == LOSE_STATE_I) or (self.state == LOSE_STATE_II):
            return -1
        else:
            return 0

    def nextPosition(self, action):
        """
        action: up, down, left, right
        return: next position
        """
        if self.det:
            if action == "up":
                nextState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nextState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nextState = (self.state[0], self.state[1] - 1)
            else: 
                nextState = (self.state[0], self.state[1] + 1)
            # if next state legal aka. if field is free
            if (nextState[0] >= 0) and (nextState[0] <= (BOARD_ROWS -1)):  
                if (nextState[1] >= 0) and (nextState[1] <= (BOARD_COLS -1)):
                    if nextState != WALL:
                        return nextState
            return self.state

    def showBoard(self):
        """Show the GridWorld playfield in ASCII art"""
        self.board[self.state] = 1
        # Wall / barrier
        self.board[1,2] = -1
        # Win / Lose states
        self.board[WIN_STATE] = 2
        self.board[LOSE_STATE_I] = -2
        self.board[LOSE_STATE_II] = -2
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                # Player
                if self.board[i, j] == 1:
                    token = '*'
                # Wall / barrier
                if self.board[i, j] == -1:
                    token = 'z'
                # Free fields
                if self.board[i, j] == 0:
                    token = '0'
                # Win state
                if self.board[i, j] == 2:
                    token = 'W'
                # Lose state
                if self.board[i, j] == -2:
                    token = 'L'
                out += token + ' | '
            print(out)
        print('-----------------')

GW = GridWorld()

# 3 Implementing a policy
## 3.1 Implement the basic agent & 3.2 Evaluate the policy

In [60]:
class Agent():
    def __init__(self):
        self.GW = GridWorld()
        self.end = False
        
        # state-action values
        self.q_table = np.random.rand(3,5,4) # Gridworld x Actions
        
        self.return_table = np.zeros((3,5,4)) # stores the sum of returns
        self.visits_table = np.zeros((3,5,4)) # stores the number of visits
        
        self.epsilon = 1
        self.epsilon_decay = 0.999

    
    # epsilon soft policy
    def epsilon_policy(self, epsilon, state): 
        if np.random.rand() > epsilon:
          action = np.argmax(self.q_table[state]) 
        else:
          action = np.random.randint(0,4)
        return action

    
    def makeAct(self, action):
        self.GW.state = self.GW.nextPosition(action)
        self.reward = self.GW.giveReward()
        if self.reward==1 or self.reward==-1:
            # End game
            self.end = True
        #self.state_values[self.GW.state]
    
    def reset(self):
        self.GW = GridWorld()
        self.end = False

    def play(self):
        i = 0
        
        # Create empty list to safe states where the agent was in this episode
        self.state_action_list = []
        
        # Create empty dictionary to safe MC-estimates
        self.erg = {}

        while self.end==False:
            
            state = self.GW.state
            action = self.epsilon_policy(self.epsilon, state)
            self.epsilon *= self.epsilon_decay
            
            self.makeAct(action)
            
            self.state_action_list.append((self.GW.state, action))
            print(state, action)

        for entry in self.state_action_list:
            self.return_table[entry] += self.reward # sum of returns over all runs
            self.visits_table[entry] += 1 # number of visits over all runs
            
            self.q_table[entry] = self.return_table(entry) / self.visits_table[entry] # average of returns over all runs
        

        # Useful for Bugfixing
        #print('Game end, reward', self.reward)
        #print('Turns:',i)
        #print("---------------------")
        print(self.q_table)
        self.reset()

    def showValues(self):
        """Show MC-estimates per state aka. field in GridWorld"""
        for i in range(0, BOARD_ROWS):
            print('----------------------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.erg[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------------------')
        
ag = Agent()
ag.play()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(2, 4) 0
(

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: 

# 4 Visualization (Optional)

In [None]:
GW.showBoard()

for k in range(1,10001):
    ag.play()
    # Show MC-estimates per 50, 200, 500, 1000, 10000 
    if k in [50,200,500,1000,10000]:
        print("k =",k)
        ag.showValues()

In [None]:
agent = Agent()

def policy_iteration(epsilon = 0.01, agent):
  policy = agent.policy()
  q_table = np.random.rand(5,3,4) # Gridworld x Actions
  returns = dict.fromkeys(agent.state_values.keys(), (0,0))

  gamma = 0.999

  while True: 
    agent.play(policy) 
    G = 0 
    for t in agent.state_list.reverse(): 
      if t == 0:
        G = agent.reward 
      else: 
        G = gamma * G + 0  # reward only at last step of episode 
      returns(t)[0] += G 
      returns(t)[1] += 1 

      q_table(t[0],t[1], "action") = returns(t)[0] / returns(t)[1] #this is not working

      # @ToDo: we don't save the action but we need this now. 

      # policy is now außerhalb des agents, should rather be inside of agent 
      # how can we let agent choose according to highest q-value instead of minimal distance? 

In [56]:
q_table = np.random.rand(5,3,4) # Gridworld x Actions
state = (0,0)
np.argmax(q_table[state])

1

In [19]:
l = []
a = (1, 2)
b = 3
l.append((a,b))
print(l[0][0])

(1, 2)
