In [1]:
import numpy as np

In [2]:
def print_grid(mat, col_wid=10, additional_format="", header=None):
    """
    Print a 2d matrix as a text table with row indices on the left and
    column indices underneath.

    mat -- sequence of rows (nested list or 2d numpy array), read as mat[row][col]
    col_wid -- width of every cell in characters
    additional_format -- format-spec suffix applied to every cell (e.g. ".2f")
    header -- optional title, centred inside a '=' rule above the table

    An empty matrix prints a degenerate frame instead of raising.
    """
    slot = 1 + col_wid  # one cell plus its leading '|' separator

    def margin(label=""):
        # Left gutter: holds the row index on data lines, blank on rule lines.
        print('{{:^{}}}'.format(slot).format(label), end="")

    def rule(n_cols, fill=".", text=""):
        # Horizontal line spanning n_cols cells plus the closing '|'.
        print("{{:{}^{}}}".format(fill, slot * n_cols + 1).format(text))

    n_rows = len(mat)
    # Widths are taken from the first/last row, exactly as before, but
    # without touching mat[0] or a leaked loop variable when mat is empty.
    first_wid = len(mat[0]) if n_rows else 0
    last_wid = len(mat[-1]) if n_rows else 0
    cell_fmt = '|{{:^{}{}}}'.format(col_wid, additional_format)

    if header is not None:
        margin()
        rule(first_wid, fill="=", text=" {} ".format(header))

    for row in range(n_rows):
        margin()
        rule(len(mat[row]))
        margin(row)
        for cell in mat[row]:
            print(cell_fmt.format(cell), end="")
        print("|")

    # Closing rule under the last row.
    margin()
    rule(last_wid)

    # Column indices.
    margin()
    for col in range(first_wid):
        margin(col)
    print("")

In [3]:
# Actions

# Every action is a (row delta, column delta) pair that a State can be shifted by.
UP = (-1, 0)
DOWN = (1, 0)
RIGHT = (0, 1)
LEFT = (0, -1)
# Placeholder used for cells that have no action; it only has a display symbol.
IMPOSSIBLE_ACTION = (42, 42)

all_actions = [UP, DOWN, RIGHT, LEFT]
act_to_name_d = dict(zip(all_actions, ('up', 'down', 'right', 'left')))
# Inverse lookup, derived from the table above so the two cannot drift apart.
name_to_act_d = {name: act for act, name in act_to_name_d.items()}
act_syms_d = {UP: '🡡', DOWN: '🡣', LEFT: '🡠', RIGHT: '🡢', IMPOSSIBLE_ACTION: '✖'}


def act_by_name(name):
    """Return the action tuple registered under `name` (KeyError if unknown)."""
    action = name_to_act_d[name]
    return action


def name_by_act(act):
    """Return the name of action tuple `act` (KeyError if unknown)."""
    label = act_to_name_d[act]
    return label


def act_to_sym(act):
    """Return the display symbol for `act`; IMPOSSIBLE_ACTION has one too."""
    symbol = act_syms_d[act]
    return symbol


def print_act_names(acts):
    """Print the names of the given actions as a list."""
    print([name_by_act(a) for a in acts])


# Element-wise version of act_to_sym for arrays of actions.
acts_array_to_sym = np.vectorize(act_to_sym)

In [4]:
class State:
    """
    Grid position (row, col), usable as a dict key / set member.

    Calling a state with an action -- a (row delta, column delta) pair --
    returns the shifted state; the original is left untouched.
    """
    def __init__(self, row, col):
        self.row = row
        self.col = col

    def __call__(self, action):
        """Return the state reached by applying `action` to this one."""
        return State(self.row + action[0], self.col + action[1])

    def __str__(self):
        return str((self.row, self.col))

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        # Must stay consistent with __eq__: equal positions hash equally.
        return hash((self.row, self.col))

    def __eq__(self, other):
        # Comparing with a non-State (None, a tuple, ...) used to raise
        # AttributeError; NotImplemented lets Python fall back to "not equal".
        if not isinstance(other, State):
            return NotImplemented
        return (self.row, self.col) == (other.row, other.col)

In [5]:
class World:
    """
    Deterministic grid world built from a reward grid.

    Every cell whose reward is not +inf becomes a State; cells equal to +inf
    are left out of the state set, so no action can lead into them.
    """
    def __init__(self, grid):
        """
        grid -- 2d numpy array of rewards; np.inf marks cells that are not states
        """
        self.grid = grid
        # np.inf rather than the np.infty alias, which NumPy 2.0 removed.
        indices = np.where(grid != np.inf)
        self.states = [State(r, c) for r, c in zip(indices[0], indices[1])]
        # Set copy for O(1) membership tests.
        self.states_set = set(self.states)

    def _is_ok_action(self, state, action):
        """True if applying `action` to `state` lands on a known state."""
        return state(action) in self.states_set

    def _check_state(self, state):
        """Raise RuntimeError unless `state` belongs to this world."""
        if state not in self.states_set:
            raise RuntimeError("Bad state")

    def perform_action(self, start_state, action):
        """
        Apply `action` to `start_state`.

        Returns (new_state, reward of new_state); raises RuntimeError if the
        move leaves the set of states.
        """
        new_state = start_state(action)
        self._check_state(new_state)
        return (new_state, self.get_reward(new_state))

    def get_possible_actions(self, state):
        """Return the actions from all_actions that keep `state` inside the world."""
        self._check_state(state)
        return [a for a in all_actions if self._is_ok_action(state, a)]

    def get_reward(self, state):
        """Return the grid reward stored at `state`; raises RuntimeError for unknown states."""
        self._check_state(state)
        return self.grid[state.row, state.col]

    def get_all_actions(self):
        """Return the module-level list of all actions."""
        return all_actions

    def state_f_to_mat(self, stateF, none_obj=None):
        """
        any dict from states to X transforms to matrix

        The result is an object array shaped like the grid; cells with no
        entry in `stateF` hold `none_obj`.
        """
        mat = np.empty(self.grid.shape, dtype=object)
        mat.fill(none_obj)
        for state, value in stateF.items():
            mat[state.row, state.col] = value
        return mat

In [6]:
# Reward grid. Cells equal to np.inf are not states: World skips them when it
# collects its states. np.inf is used instead of the np.infty alias, which
# NumPy 2.0 removed.
grid = np.array([
        [0, 0, -100, 0, 100],
        [0, 0, 0, 0, np.inf],
        [0, 0, np.inf, 0, 0],
        [0, 0, 0, -100, 0],
        [0, 0, 0, 0, 0]
])
print_grid(grid, col_wid=10, header="GRID WORLD")

           ........................................................
     0     |   0.0    |   0.0    |  -100.0  |   0.0    |  100.0   |
           ........................................................
     1     |   0.0    |   0.0    |   0.0    |   0.0    |   inf    |
           ........................................................
     2     |   0.0    |   0.0    |   inf    |   0.0    |   0.0    |
           ........................................................
     3     |   0.0    |   0.0    |   0.0    |  -100.0  |   0.0    |
           ........................................................
     4     |   0.0    |   0.0    |   0.0    |   0.0    |   0.0    |
           ........................................................
                0          1          2          3          4     


In [7]:
# Notebook-wide world instance; later cells and helper functions read it as a global.
WORLD = World(grid=grid)

In [8]:
# Sanity check: from the top-left corner only moves that stay on the grid are offered.
print_act_names(WORLD.get_possible_actions(State(0, 0)))

['down', 'right']


# State-value function for random walk policy

Bellman equation:

$$
v_{\pi}(s) = \sum_{a \in A}{\pi(a | s) \left( R_{s}^{a} + \gamma \sum_{s' \in S}{P_{ss'}^a v_{\pi}(s')} \right)}
$$

For our grid world the transitions are deterministic: $P_{ss'}^a = 1$ if $s + a = s'$ and $P_{ss'}^a = 0$ otherwise. The expected reward of an action is simply the reward of the cell it leads to: $R_s^a = reward(s + a)$.

Under the random-walk policy every possible action is equally likely, so $\pi(a | s) = \frac{1}{|possibleActions(s)|}$ for $a \in possibleActions(s)$.

So finally we have:

$$
v(s) = \sum_{a \in possibleActions(s)}{\frac{1}{|possibleActions(s)|} \cdot \left(reward(s + a) + \gamma \cdot v(s+a)\right) }
$$

This can be solved as a system of linear equations, with one equation per state $s$.

In [9]:
# Discount factor; read as a global by build_system() and by the iterative evaluation loop.
gamma = 0.9

# Solution with a system of linear equations (SOLE)

In [10]:
# state -> row/column number in the linear system, and the inverse mapping.
state_index = {state: pos for pos, state in enumerate(WORLD.states)}
reverse_index = {pos: state for state, pos in state_index.items()}

In [11]:
def build_system():
    """
    Assemble the linear system for the random-walk state-value function.

    Reads the globals WORLD, state_index and gamma. For every state s the row
    encodes  v(s) - gamma * sum_a p * v(s + a) = sum_a p * reward(s + a)
    with p = 1 / number of possible actions in s.

    Returns (system_mat, free_vec) of shapes (n, n) and (n, 1), where n is the
    number of states.
    """
    n_states = len(WORLD.states)
    system_mat = np.zeros(shape=(n_states, n_states))
    free_vec = np.zeros(shape=(n_states, 1))

    for state, row in state_index.items():
        moves = WORLD.get_possible_actions(state)
        weight = 1.0 / len(moves)
        # Coefficient of v(s) itself.
        system_mat[row, row] = 1
        for move in moves:
            successor = state(move)
            col = state_index[successor]
            free_vec[row] += weight * WORLD.get_reward(successor)
            system_mat[row, col] = -weight * gamma
    return (system_mat, free_vec)

In [12]:
# A is the coefficient matrix, b the right-hand side (column vector) of the Bellman system.
A, b = build_system()

In [13]:
# Solve A v = b and scatter the solution back onto the grid layout;
# cells that are not states keep the initial 0.
solution = np.linalg.solve(A, b)
st_val_mat = np.zeros(shape=WORLD.grid.shape)
for i, value in enumerate(solution[:, 0]):
    state = reverse_index[i]
    st_val_mat[state.row, state.col] = value

In [14]:
# Reward grid again, for side-by-side reading with the value table below.
print_grid(WORLD.grid, header="GRID WORLD")

           ........................................................
     0     |   0.0    |   0.0    |  -100.0  |   0.0    |  100.0   |
           ........................................................
     1     |   0.0    |   0.0    |   0.0    |   0.0    |   inf    |
           ........................................................
     2     |   0.0    |   0.0    |   inf    |   0.0    |   0.0    |
           ........................................................
     3     |   0.0    |   0.0    |   0.0    |  -100.0  |   0.0    |
           ........................................................
     4     |   0.0    |   0.0    |   0.0    |   0.0    |   0.0    |
           ........................................................
                0          1          2          3          4     


In [15]:
# Value table from the linear-system solution, two decimals per cell.
print_grid(st_val_mat, additional_format=".2f", header="STATE VALUE FUNCTION")

           ........................................................
     0     |  -73.22  | -103.66  |  -87.00  |  -75.57  |  -68.01  |
           ........................................................
     1     |  -59.04  |  -74.21  | -110.76  |  -96.88  |   0.00   |
           ........................................................
     2     |  -49.38  |  -56.35  |   0.00   | -136.62  | -126.97  |
           ........................................................
     3     |  -49.22  |  -64.25  | -117.16  | -120.42  | -145.54  |
           ........................................................
     4     |  -50.42  |  -62.83  |  -94.76  | -135.88  | -126.64  |
           ........................................................
                0          1          2          3          4     


# Iterative solution

In [16]:
import copy
# state value function
# Iterative evaluation of the random-walk policy: repeat full synchronous
# sweeps of the Bellman equation until no state changes by eps or more.
svf = np.zeros(shape=grid.shape)
eps = 1e-3
while True:
    delta = 0
    nsvf = np.zeros(shape=WORLD.grid.shape)
    for s in WORLD.states:
        v = svf[s.row, s.col]
        acts = WORLD.get_possible_actions(s)
        prob = 1 / len(acts)  # uniform random-walk policy
        for a in acts:
            ns = s(a)
            nsvf[s.row, s.col] += prob * (WORLD.get_reward(ns) + gamma * svf[ns.row, ns.col])
        delta = max(delta, np.abs(v - nsvf[s.row, s.col]))
    # Keep the newest sweep before testing for convergence; previously the
    # final (most accurate) sweep was thrown away on the last iteration.
    svf = nsvf
    if delta < eps:
        break

In [17]:
# Value table from the iterative evaluation; compare with the linear-system table above.
print_grid(svf, additional_format=".2f", header="STATE VALUE FUNCTION")

           ........................................................
     0     |  -73.21  | -103.65  |  -86.99  |  -75.56  |  -68.01  |
           ........................................................
     1     |  -59.04  |  -74.20  | -110.75  |  -96.88  |   0.00   |
           ........................................................
     2     |  -49.38  |  -56.35  |   0.00   | -136.61  | -126.97  |
           ........................................................
     3     |  -49.21  |  -64.25  | -117.16  | -120.42  | -145.54  |
           ........................................................
     4     |  -50.42  |  -62.83  |  -94.76  | -135.88  | -126.64  |
           ........................................................
                0          1          2          3          4     


# Temporal Difference Learning

In [18]:
class LearningAgent:
    """
    Base class for tabular agents: holds the world, the current state, the
    action-choosing policy and a Q table initialised to zero.
    """
    def __init__(self, world, initial_state, actionChoosePolicy):
        self.world = world
        self.state = initial_state
        self.actionChooser = actionChoosePolicy
        # action value function: Q[state][action], only for possible actions
        self.Q = {}
        for s in world.states:
            self.Q[s] = dict.fromkeys(world.get_possible_actions(s), 0)

    def do_learn_step(self):
        """Perform one learning step; a no-op here, overridden by subclasses."""
        pass

    def get_live_policy(self):
        """Return {state: action with the largest Q value in that state}."""
        policy = {}
        for state, q_row in self.Q.items():
            policy[state] = max(q_row, key=q_row.get)
        return policy

    def get_action_value_fun(self):
        """Return the Q table itself (not a copy)."""
        return self.Q

    def print_action_value(self):
        """Print one grid per action with Q(state, action); -inf where the action is not possible."""
        for a in self.world.get_all_actions():
            qq = {}
            for state in self.Q:
                if a in self.world.get_possible_actions(state):
                    qq[state] = self.Q[state][a]
                else:
                    qq[state] = -np.inf
            mat = self.world.state_f_to_mat(qq, none_obj=0)
            print_grid(mat, header=name_by_act(a), additional_format=".3f")

## Q Learning with eps-greedy walking

In [19]:
def epsGreedyPolicy(eps):
    """
    Build an eps-greedy action chooser.

    The returned function takes (state, possible_actions, Q) and, with
    probability eps, returns a uniformly random possible action; otherwise
    it returns the possible action with the largest Q[state][action].
    """
    def getAction(state, possible_actions, Q):
        explore = np.random.uniform() < eps
        if not explore:
            return max(possible_actions, key=lambda a: Q[state][a])
        return possible_actions[np.random.choice(len(possible_actions))]
    return getAction

In [20]:
class QLearningAgent(LearningAgent):
    """
    Q-learning agent: updates towards reward + gamma * max_a Q(next state, a).

    gamma -- discount factor
    alpha -- learning rate
    """
    def __init__(self, world, initial_state, actionChoosePolicy, gamma, alpha):
        LearningAgent.__init__(self, world, initial_state, actionChoosePolicy)
        self.gamma = gamma
        self.alpha = alpha

    def do_learn_step(self):
        """
        Take one step with the behaviour policy and update Q for the
        (state, action) pair that was just left.

        Returns the absolute change of that Q entry.
        """
        cur = self.state
        # behaviour policy picks among the moves possible from here
        act = self.actionChooser(cur, self.world.get_possible_actions(cur), self.Q)
        nxt = cur(act)
        reward = self.world.get_reward(nxt)

        q_old = self.Q[cur][act]
        # greedy bootstrap: best value available from the next state
        best_next = max(self.Q[nxt][a] for a in self.world.get_possible_actions(nxt))
        q_new = q_old + self.alpha * (reward + self.gamma * best_next - q_old)
        self.Q[cur][act] = q_new

        # the agent now stands in the next state
        self.state = nxt

        return np.abs(q_new - q_old)

In [21]:
import sys
def do_q_learning(eps, gamma, alpha, n_steps=10000):
    """
    Train a Q-learning agent on the global WORLD and return it.

    eps -- exploration probability of the eps-greedy behaviour policy
    gamma -- discount factor
    alpha -- learning rate
    n_steps -- number of learning steps to perform (previously a hard-coded
               cap whose `>` comparison ran one step more than intended)
    """
    # start from a uniformly random state
    random_start_state = WORLD.states[np.random.choice(len(WORLD.states))]
    agent = QLearningAgent(WORLD, random_start_state, epsGreedyPolicy(eps), gamma, alpha)

    for _ in range(n_steps):
        agent.do_learn_step()

    return agent

In [22]:
# Train with Q-learning: a random move is taken with probability 0.7, discount 0.9, learning rate 0.7.
final_agent = do_q_learning(eps=0.7, gamma=0.9, alpha=0.7)

In [23]:
# Greedy policy of the trained agent, drawn as one arrow per cell;
# cells that are not states get the IMPOSSIBLE_ACTION symbol.
opt_policy = final_agent.get_live_policy()
acts = WORLD.state_f_to_mat(opt_policy, none_obj=IMPOSSIBLE_ACTION)
symbolic_acts = acts_array_to_sym(acts)
print_grid(symbolic_acts)

           ........................................................
     0     |    🡢     |    🡣     |    🡢     |    🡢     |    🡠     |
           ........................................................
     1     |    🡢     |    🡢     |    🡢     |    🡡     |    ✖     |
           ........................................................
     2     |    🡢     |    🡡     |    ✖     |    🡡     |    🡠     |
           ........................................................
     3     |    🡡     |    🡡     |    🡠     |    🡡     |    🡡     |
           ........................................................
     4     |    🡡     |    🡡     |    🡠     |    🡢     |    🡡     |
           ........................................................
                0          1          2          3          4     


In [24]:
opt_policy = final_agent.get_live_policy()
import copy
# Iterative evaluation of the greedy (deterministic) policy: every state
# follows exactly one action, so each sweep is a single Bellman backup.
# state value function
state_v_fun = {s: 0 for s in WORLD.states}
# Cell-local names on purpose: this cell used to rebind the notebook-wide
# `gamma` (0.9 -> 0.5) and `eps`, so re-running build_system() or the
# iterative random-walk evaluation afterwards silently used other values.
eval_eps = 1e-3
eval_gamma = 0.5
while True:
    delta = 0
    new_svf = {}
    for s in WORLD.states:
        ns = s(opt_policy[s])
        new_svf[s] = WORLD.get_reward(ns) + eval_gamma * state_v_fun[ns]
        delta = max(delta, np.abs(state_v_fun[s] - new_svf[s]))
    state_v_fun = new_svf
    if delta < eval_eps:
        break

In [25]:
# Value of every state under the greedy policy; cells that are not states show inf.
print_grid(WORLD.state_f_to_mat(state_v_fun, none_obj=np.inf), additional_format=".2f")

           ........................................................
     0     |   4.17   |   8.33   |  66.67   |  133.33  |  66.67   |
           ........................................................
     1     |   8.33   |  16.67   |  33.33   |  66.67   |   inf    |
           ........................................................
     2     |   4.17   |   8.33   |   inf    |  33.33   |  16.67   |
           ........................................................
     3     |   2.08   |   4.17   |   2.08   |  16.67   |   8.33   |
           ........................................................
     4     |   1.04   |   2.08   |   1.04   |   2.08   |   4.17   |
           ........................................................
                0          1          2          3          4     


In [26]:
# Reward grid for reference next to the policy value table above.
print_grid(WORLD.grid)

           ........................................................
     0     |   0.0    |   0.0    |  -100.0  |   0.0    |  100.0   |
           ........................................................
     1     |   0.0    |   0.0    |   0.0    |   0.0    |   inf    |
           ........................................................
     2     |   0.0    |   0.0    |   inf    |   0.0    |   0.0    |
           ........................................................
     3     |   0.0    |   0.0    |   0.0    |  -100.0  |   0.0    |
           ........................................................
     4     |   0.0    |   0.0    |   0.0    |   0.0    |   0.0    |
           ........................................................
                0          1          2          3          4     


# SARSA

In [41]:
class BoltzmannPolicy:
    """
    Softmax (Boltzmann) action chooser with a logarithmically cooling
    temperature: on every call T = initT / log(step), and step grows by one.

    T -- initial temperature scale (initT)
    """
    def __init__(self, T):
        self.initT = T
        self.curT = T
        # starts at 2 so that log(step) is positive on the first call
        self.step = 2

    def __call__(self, state, possible_actions, Q):
        """
        Sample one of `possible_actions` with probability proportional to
        exp(Q[state][a] / current temperature).
        """
        self.curT = self.initT / np.log(self.step)

        scaled = np.array([Q[state][a] for a in possible_actions], dtype=float) / self.curT
        # Subtracting the maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf (which produced NaN probabilities
        # once Q / T grew large as the temperature cooled).
        weights = np.exp(scaled - np.max(scaled))
        probs = weights / np.sum(weights)
        # one multinomial draw -> one-hot vector -> index of the drawn action
        idx = int(np.argmax(np.random.multinomial(1, probs)))

        self.step += 1
        return possible_actions[idx]

In [42]:
class SarsaLearningAgent(LearningAgent):
    """
    SARSA agent: the action for the next state is chosen before the update,
    and its Q value is the bootstrap target.

    gamma -- discount factor
    alpha -- learning rate
    """
    def __init__(self, world, initial_state, actionChoosePolicy, gamma, alpha):
        LearningAgent.__init__(self, world, initial_state, actionChoosePolicy)
        self.gamma = gamma
        self.alpha = alpha
        # choose the first action exactly the way a re-entry does
        self.reenter(initial_state)

    def reenter(self, new_state):
        """Place the agent in `new_state` and choose the action to take from it."""
        self.state = new_state
        moves = self.world.get_possible_actions(self.state)
        self.act = self.actionChooser(self.state, moves, self.Q)

    def do_learn_step(self):
        """
        Execute the pending action, choose the next one, and update Q for the
        pair that was just left. Returns the absolute change of that Q entry.
        """
        cur, cur_act = self.state, self.act
        nxt = cur(cur_act)
        reward = self.world.get_reward(nxt)
        next_moves = self.world.get_possible_actions(nxt)

        # the action the agent will really take next
        nxt_act = self.actionChooser(nxt, next_moves, self.Q)
        q_old = self.Q[cur][cur_act]
        q_new = q_old + self.alpha * (reward + self.gamma * self.Q[nxt][nxt_act] - q_old)
        self.Q[cur][cur_act] = q_new

        # move on: new state together with its already chosen action
        self.state = nxt
        self.act = nxt_act

        return np.abs(q_new - q_old)
        

In [43]:
import sys
from tqdm import tqdm
def do_sarsa_learning(T, gamma, alpha, n_episodes=1000, steps_per_episode=1000):
    """
    Train a SARSA agent with a Boltzmann policy on the global WORLD and return it.

    T -- initial temperature of the Boltzmann policy
    gamma -- discount factor
    alpha -- learning rate
    n_episodes -- number of restarts from a uniformly random state
    steps_per_episode -- learning steps performed after every restart
    """
    policy = BoltzmannPolicy(T)

    # The start state given here is replaced by the first reenter() below.
    agent = SarsaLearningAgent(WORLD, WORLD.states[0], policy, gamma, alpha)

    # The bar is advanced by hand, so it only needs the total, not an iterable.
    with tqdm(total=n_episodes * steps_per_episode) as timer:
        for _ in range(n_episodes):
            random_start_state = WORLD.states[np.random.choice(len(WORLD.states))]
            agent.reenter(random_start_state)
            for _ in range(steps_per_episode):
                agent.do_learn_step()
                timer.update()

    return agent

In [44]:
# Reward grid for reference before the SARSA results.
print_grid(WORLD.grid)

           ........................................................
     0     |   0.0    |   0.0    |  -100.0  |   0.0    |  100.0   |
           ........................................................
     1     |   0.0    |   0.0    |   0.0    |   0.0    |   inf    |
           ........................................................
     2     |   0.0    |   0.0    |   inf    |   0.0    |   0.0    |
           ........................................................
     3     |   0.0    |   0.0    |   0.0    |  -100.0  |   0.0    |
           ........................................................
     4     |   0.0    |   0.0    |   0.0    |   0.0    |   0.0    |
           ........................................................
                0          1          2          3          4     


In [45]:
# Train with SARSA (initial temperature 20, discount 0.7, learning rate 0.2).
# Rebinds final_agent, replacing the Q-learning agent created earlier.
final_agent = do_sarsa_learning(T = 20, gamma=0.7, alpha=0.2)

100%|██████████| 1000000/1000000 [01:34<00:00, 10547.58it/s]


In [47]:
# Greedy policy of the SARSA agent, drawn as one arrow per cell;
# cells that are not states get the IMPOSSIBLE_ACTION symbol.
opt_policy = final_agent.get_live_policy()
acts = WORLD.state_f_to_mat(opt_policy, none_obj=IMPOSSIBLE_ACTION)
symbolic_acts = acts_array_to_sym(acts)
print_grid(symbolic_acts)

           ........................................................
     0     |    🡣     |    🡣     |    🡢     |    🡢     |    🡠     |
           ........................................................
     1     |    🡢     |    🡢     |    🡢     |    🡡     |    ✖     |
           ........................................................
     2     |    🡢     |    🡡     |    ✖     |    🡡     |    🡠     |
           ........................................................
     3     |    🡡     |    🡡     |    🡠     |    🡡     |    🡡     |
           ........................................................
     4     |    🡡     |    🡡     |    🡠     |    🡠     |    🡡     |
           ........................................................
                0          1          2          3          4     


In [48]:
# One table per action with Q(state, action); -inf marks moves that are not possible from a state.
final_agent.print_action_value()

           ........................................................
     0     |   -inf   |   -inf   |   -inf   |   -inf   |   -inf   |
           ........................................................
     1     |  0.000   |  0.000   | -20.000  | 137.255  |  0.000   |
           ........................................................
     2     |  0.654   |  47.078  |  0.000   |  96.078  |   -inf   |
           ........................................................
     3     |  23.068  |  32.955  |   -inf   |  67.254  |  47.078  |
           ........................................................
     4     |  16.144  |  23.068  |  -0.898  | -20.000  |  32.950  |
           ........................................................
                0          1          2          3          4     
           ........................................................
     0     |  32.951  |  47.072  |  0.000   |  0.000   |   -inf   |
           ......................................