In [1]:
from lib.util import *
from lib.policy import *
from lib.mdp import *
from lib.mrp import *
from lib.env import *

# $\varepsilon$-Greedy Policy Improvement

###### Theorem
For any $\varepsilon$-greedy policy $\pi$, the $\varepsilon$-greedy policy $\pi'$ with respect to $q_\pi$ is an improvement: $v_{\pi'}(s) \geq v_{\pi}(s)$ for every state $s$.

###### Proof
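Let $m = |A|$ be the number of actions. Then
\begin{align*}
q_{\pi}(s, \pi'(s)) &= \sum_{a \in A} \pi'(a\mid s)q_{\pi}(s, a) \\
&= \frac{\varepsilon}{m} \sum_{a \in A} q_{\pi}(s, a) + (1-\varepsilon)\max_{a \in A}q_{\pi}(s, a) \\
&\geq  \frac{\varepsilon}{m}\sum_{a \in A} q_{\pi}(s, a) + (1-\varepsilon)\mathbb{E}_{a \sim (\pi(\cdot\mid s) - \varepsilon/m)/(1-\varepsilon)}[q_{\pi}(s, a)] \\
&= \frac{\varepsilon}{m}\sum_{a \in A} q_{\pi}(s, a) + (1-\varepsilon)\sum_{a \in A} \frac{\pi(a\mid s)-\varepsilon/m}{1-\varepsilon}q_{\pi}(s, a) \\
&= \sum_{a \in A} \pi(a\mid s)q_{\pi}(s, a) \\
&= v_\pi(s)
\end{align*}
The inequality holds because the weights $\frac{\pi(a\mid s)-\varepsilon/m}{1-\varepsilon}$ are non-negative (since $\pi$ is $\varepsilon$-greedy, $\pi(a\mid s) \geq \varepsilon/m$) and sum to $1$, so the maximum dominates this weighted average. Since $q_{\pi}(s, \pi'(s)) \geq v_\pi(s)$ for every state $s$, the policy improvement theorem gives $v_{\pi'}(s) \geq v_{\pi}(s)$.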

# GLIE (Greedy in the Limit with Infinite Exploration)

All state-action pairs are explored infinitely many times,
$$ \lim_{k\rightarrow \infty} N_k (s, a) = \infty $$
The policy converges on a greedy policy,
$$ \lim_{k\rightarrow \infty} \pi_k (a\mid s) = \mathbb{1}[a = \text{argmax}_{a' \in A} Q_k (s, a')] $$

# SARSA

In [2]:
from lib.sarsa import *

# Q-learning

In [3]:
from lib.q_learning import *

# Example

In [4]:
# Experiment parameters shared by the cells below.
# n sizes the generated matrices/vector and the length of the MRP list;
# presumably the number of states (and of actions) — TODO confirm.
n = 7
# Episode count passed to both sarsa() and qLearning().
n_episodes = 50
# Passed to MRP, MDP and both learners; presumably the discount factor.
gamma = 0.8

In [5]:
# Build the example problem from generated data.
# NOTE(review): the generate_* helpers are presumably random and no seed is set
# in this notebook, so outputs likely differ between runs — confirm and seed.
P = generate_stochastic_matrix(n)
R = generate_reward_vector(n)
mrp = MRP(P, R, gamma)
# NOTE(review): [mrp]*n repeats the *same* MRP object n times. If the list is
# indexed by action, every action shares identical dynamics and rewards —
# confirm this is intended for the example.
mdp = MDP(gamma, [mrp]*n)
# Q is a generated stochastic matrix used to construct the Policy
# (it is not an action-value table, despite the name).
Q = generate_stochastic_matrix(n)
policy = Policy(Q)

In [6]:
# Wrap the MDP in an Env object; this is what sarsa() and qLearning() receive.
env = Env(mdp)

In [7]:
# Run SARSA for n_episodes episodes and display the returned table: a
# defaultdict(float) keyed by integer pairs, presumably (state, action) -> Q estimate.
sarsa(env, n_episodes, policy, gamma)

defaultdict(float,
            {(5, 2): 1.7441277267635202,
             (6, 3): 1.8671615798147732,
             (6, 4): 1.83150631822924,
             (0, 6): 1.6424485197931444,
             (0, 5): 1.7437418790800172,
             (4, 0): 1.8716324947073513,
             (4, 1): 1.8079877280527397,
             (6, 5): 1.8812020523182533,
             (6, 6): 1.7524346778677926,
             (4, 3): 1.7706479629304468,
             (4, 4): 1.8875995255150522,
             (4, 2): 1.882396837753244,
             (6, 2): 1.8818163573159465,
             (0, 0): 1.7117672795755892,
             (1, 3): 1.8070335338100272,
             (1, 1): 1.7949096460288303,
             (6, 1): 1.8141619056770917,
             (2, 2): 1.8111123207287727,
             (2, 4): 1.7524876112437713,
             (0, 3): 1.709505380053316,
             (0, 1): 1.7861660938639579,
             (2, 6): 1.7953957870824078,
             (5, 5): 1.7512300305230637,
             (5, 4): 1.6628058741943172,
 

In [8]:
# Run Q-learning with the same arguments as the SARSA cell and display the
# returned defaultdict(float), presumably keyed by (state, action).
# NOTE(review): env and policy are the same objects already passed to sarsa();
# if either is mutated there, this run is not independent — verify.
qLearning(env, n_episodes, policy, gamma)

defaultdict(float,
            {(0, 4): 2.12857123580569,
             (5, 0): 2.0043651375315705,
             (5, 1): 2.010649409483529,
             (5, 2): 2.0355436869000605,
             (5, 3): 1.9952125872775457,
             (5, 4): 2.043338621584539,
             (5, 5): 2.066236515306301,
             (5, 6): 1.9549746473271739,
             (0, 0): 2.021206747944144,
             (0, 1): 2.076627374154728,
             (0, 2): 2.0174233780020536,
             (0, 3): 2.065551874637221,
             (0, 5): 1.9987673662530567,
             (0, 6): 2.0259938992832596,
             (3, 0): 2.2813419883741117,
             (3, 1): 2.1699066949377848,
             (3, 2): 2.188374351547453,
             (3, 3): 2.0809138949312866,
             (3, 4): 2.1406393476968995,
             (3, 5): 2.1605017722166555,
             (3, 6): 2.1911327030872743,
             (6, 0): 2.1442360487446654,
             (6, 1): 2.119395557815245,
             (6, 2): 2.202120728752106,
        