In [28]:
!git clone https://github.com/deepmuseum/Algorithms-for-Reinforcement-Learning.git

fatal: destination path 'Algorithms-for-Reinforcement-Learning' already exists and is not an empty directory.


In [27]:
import sys
sys.path.insert(0, './Algorithms-for-Reinforcement-Learning/envs')
import numpy as np
from scipy.special import softmax 
import matplotlib.pyplot as plt
import json
import math
from test_env import ToyEnv1

## The environment: a quick start

In [None]:
env = ToyEnv1(gamma=0.99)

# Useful attributes
print("Set of states:", env.states)
print("Set of actions:", env.actions)
print("Number of states: ", env.Ns)
print("Number of actions: ", env.Na)
print("P has shape: ", env.P.shape)  # P[s, a, s'] = env.P[s, a, s']
print("discount factor: ", env.gamma)
print("")

# Usefult methods
state = env.reset() # get initial state
print("initial state: ", state)
print("reward at (s=1, a=3,s'=2): ", env.reward_func(1,3,2))
print("")

# A random policy
policy = np.random.randint(env.Na, size = (env.Ns,))
print("random policy = ", policy)

# Interacting with the environment
print("(s, a, s', r):")
for time in range(4):
    action = policy[state]
    next_state, reward, done, info = env.step(action)
    print(state, action, next_state, reward)
    if done:
        break
    state = next_state
print("")
print(env.R.shape)

Set of states: [0, 1, 2]
Set of actions: [0, 1]
Number of states:  3
Number of actions:  2
P has shape:  (3, 2, 3)
discount factor:  0.99

initial state:  0
reward at (s=1, a=3,s'=2):  1.0

random policy =  [0 0 1]
(s, a, s', r):
0 0 1 0.0
1 0 2 1.0

(3, 2, 3)


## Tabular TD(0) for estimating $v_π$



```
Input: the policy π to be evaluated
Initialize V (s) arbitrarily (e.g., V (s) = 0, for all s ∈ S + )
Repeat (for each episode):
  Initialize S
  Repeat (for each step of episode):
    A ← action given by π for S
    Take action A, observe R, S'
    V (S) ← V (S) + α (R + γV (S') − V (S))
    S ← S'
  until S is terminal
```



In [None]:
# A random policy
policy = np.random.randint(env.Na, size = (env.Ns,))

V=np.zeros(3)
n_episodes=1000
alpha=0.1
gamma=env.gamma
for episode in range(n_episodes):
  state=env.reset()
  done=False
  while not done:
    action=policy[state]
    next_state, reward, done, info = env.step(action)
    V[state]+=alpha*(reward+gamma*V[next_state]-V[state])
    state=next_state

print('Predicted Value Function: ', V)

Predicted Value Function:  [0.99043148 0.96781512 0.        ]


## Sarsa (on-policy TD control) for estimating $Q ≈ q^∗$



```
Initialize Q(s, a), for all s ∈ S, a ∈ A(s), arbitrarily, and Q(terminal-state, ·) = 0
Repeat (for each episode):
  Initialize S
  Choose A from S using policy derived from Q (e.g., eps-greedy)
  Repeat (for each step of episode):
    Take action A, observe R, S'
    Choose A' from S' using the policy derived from Q (e.g. eps-greedy)
    Q(S, A) ← Q(S, A) + α (R + γQ(S', A') − Q(S, A))
    S ← S' ; A ← A' ;
  until S is terminal
```



In [None]:
def eps_greedy(state, Q, epsilon):
  if np.random.uniform()<=epsilon:
    return np.random.randint(0,env.Na)
  else:
    return np.argmax(Q[state])

Q=np.zeros((env.Ns,env.Na))
n_episodes=100
aplha=0.1
epsilon=0.1
gamma=env.gamma
for episode in range(n_episodes):
  state=env.reset()
  done=False
  while not done:
    action=eps_greedy(state,Q,epsilon)
    next_state, reward, done, info = env.step(action)
    next_action=eps_greedy(next_state,Q,epsilon)
    Q[state,action]+=alpha*(reward+gamma*Q[next_state,next_action]-Q[state,action])
    state,action=next_state,next_action

print('Optimal Q-function: ')
print(Q)

Optimal Q-function: 
[[0.90571261 0.45590375]
 [0.87275019 0.65371894]
 [0.         0.        ]]


## Q-learning (off-policy TD control) for estimating $π ≈ π^∗$


```
Initialize Q(s, a), for all s ∈ S, a ∈ A(s), arbitrarily, and Q(terminal-state, ·) = 0
Repeat (for each episode):
  Initialize S
  Repeat (for each step of episode):
    Choose A from S using policy derived from Q (e.g., eps-greedy)
    Take action A, observe R, S'
    Q(S, A) ← Q(S, A) + α( R + γ max_a Q(S', a) − Q(S, A))
    S ← S'
  until S is terminal
```



In [None]:
Q=np.zeros((env.Ns,env.Na))
n_episodes=100
aplha=0.1
epsilon=0.1
gamma=env.gamma
for episode in range(n_episodes):
  state=env.reset()
  done=False
  while not done:
    action=eps_greedy(state,Q,epsilon)
    next_state, reward, done, info = env.step(action)
    next_action=np.argmax(Q[next_state])
    Q[state,action]+=alpha*(reward+gamma*Q[next_state,next_action]-Q[state,action])
    state=next_state

print('Optimal Q-function: ')
print(Q)

Optimal Q-function: 
[[0.95962607 0.46161132]
 [0.96214882 0.45946134]
 [0.         0.        ]]


## Double Q-learning