In [287]:
from typing import Sequence, Tuple, Dict
import random

import pandas as pd
import numpy as np

A few examples of State Value function in the gridworld settings explained in [Chapter 3](http://incompleteideas.net/book/RLbook2018.pdf)

# Gridworld

Create Gridworld environment

In [429]:
class GridWorldEnv:
    """Manage Gridworld environment"""
    def __init__(self, special_rewards: Dict, special_moves: Dict, dim: int = 5, ):
        """
        Create gridworld
        :param special_rewards: special reward when moving from this place, dict((row, col), reward)
        :param special_rewards: special moves when moving from this place, dict((row, col), (dest_row, dest_col))
        :param dim: grid lateral dimension
        """
        assert len(special_moves) == len(special_rewards)
        assert all(x in special_rewards for x in special_moves)
        
        self.special_reward = special_rewards
        self.special_moves = special_moves
        self.dim = dim
        self.possible_actions = {
            'up': (0, -1),
            'down': (0, 1),
            'left': (-1, 0),
            'right': (1, 0)
        }
        
    def show_full_space(self):
        """Show all possible action/state combinations and their rewards"""
        for i in range(self.dim):
            for k in range(self.dim):
                for action_name, action in self.possible_actions.items():
                    reward, next_pos = self.step(current_position=(i, k), action_name=action_name)
                    print(f"from {(i, k)} move {action_name} --> into {next_pos} with R={reward}")
                    
    def step(self, current_position: Tuple[int, int], action_name: str):
        """Get reward and next position/state"""
        assert action_name in self.possible_actions, f"Action {action_name} not among possible actions"
        
        action_x, action_y = self.possible_actions[action_name]
        i, k = current_position
        next_pos = i+action_x, k+action_y
        if (i, k) in self.special_moves:  # special move get fixed reward and always move to one location
            next_pos = self.special_moves[(i, k)]
            reward = self.special_reward[(i, k)]
        elif next_pos[0] < 0 or next_pos[0] > self.dim-1 or next_pos[1] < 0 or next_pos[1] > self.dim-1:
            next_pos = i, k
            reward = -1
        else:
            reward = 0
        
        return reward, next_pos
    
    def grid_to_vec(self, row: int, col: int) -> int:
        """get index of vector which corresponds to a grid position"""
        assert 0 <= row < self.dim, f"Invalid row number {row}, max is {self.dim - 1}"
        assert 0 <= col < self.dim, f"Invalid column number {col}, max is {self.dim - 1}"
        
        return row * self.dim + col

    def vec_to_grid(self, idx: int) -> Tuple[int, int]:
        """get grid position which corresponds to index of vector"""
        assert 0 <= idx < self.dim * self.dim, f"Invalid index {idx}, max is {self.dim * self.dim - 1}"
        
        return idx // self.dim, idx % self.dim
    
    def dict_to_grid(self, fun_on_grid: dict):
        ans = []
        for row in range(self.dim):
            ans.append({f"col_{col}": fun_on_grid[(row, col)] for col in range(self.dim)})

        return pd.DataFrame(ans)

Use GridWorld with a small grid and display full spectrum of states-actions

In [430]:
env = GridWorldEnv(special_rewards={(1, 1): 10}, special_moves={(1, 1): (0, 0)}, dim=2)
env.show_full_space()

from (0, 0) move up --> into (0, 0) with R=-1
from (0, 0) move down --> into (0, 1) with R=0
from (0, 0) move left --> into (0, 0) with R=-1
from (0, 0) move right --> into (1, 0) with R=0
from (0, 1) move up --> into (0, 0) with R=0
from (0, 1) move down --> into (0, 1) with R=-1
from (0, 1) move left --> into (0, 1) with R=-1
from (0, 1) move right --> into (1, 1) with R=0
from (1, 0) move up --> into (1, 0) with R=-1
from (1, 0) move down --> into (1, 1) with R=0
from (1, 0) move left --> into (0, 0) with R=0
from (1, 0) move right --> into (1, 0) with R=-1
from (1, 1) move up --> into (0, 0) with R=10
from (1, 1) move down --> into (0, 0) with R=10
from (1, 1) move left --> into (0, 0) with R=10
from (1, 1) move right --> into (0, 0) with R=10


For a given policy (e.g. random policy) the Belmlman equation leads to the value function by solving a system of linear equations.

In [443]:
def bellman_eq_random_policy(env: GridWorldEnv, gamma: float):
    """
    Solve bellman uquation under random policy
    :param env: gridworl environment
    :param: gamma: discount factor
    """
    size = env.dim * env.dim  # linear system size
    coeff = 1/4  # probability under random policy
    matrix = np.zeros([size, size])
    known_term = np.zeros(size)
    
    # build matrix
    for idx in range(size):  # loop on equations
        current_position = env.vec_to_grid(idx)
        matrix[idx, idx] = 1
        
        for action in env.possible_actions:  # loop on four possible actions
            reward, next_position = env.step(current_position=current_position, action_name=action)
            known_term[idx] += coeff * reward
            idx_other = env.grid_to_vec(*next_position)
            matrix[idx, idx_other] -= coeff * gamma
            
    # solve linear system and push results to grid
    res = np.linalg.solve(matrix, known_term)
    
    return env.dict_to_grid(fun_on_grid={env.vec_to_grid(x): np.around(res[x], 1) for x in range(size)})

The cell below gives teh value function for each state, under optimal policy.
Result should be teh same as in [figure 3.2](http://incompleteideas.net/book/RLbook2018.pdf)

In [444]:
env = GridWorldEnv(special_rewards={(0, 1): 10, (0, 3): 5}, special_moves={(0, 1): (4, 1), (0, 3): (2, 3)})
res = bellman_eq_random_policy(env=env, gamma=0.9)
res

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,3.3,8.8,4.4,5.3,1.5
1,1.5,3.0,2.3,1.9,0.5
2,0.1,0.7,0.7,0.4,-0.4
3,-1.0,-0.4,-0.4,-0.6,-1.2
4,-1.9,-1.3,-1.2,-1.4,-2.0
