In [None]:
# https://lilianweng.github.io/lil-log/2018/02/19/a-long-peek-into-reinforcement-learning.html#value-estimation

In [16]:
#!/usr/bin/env python3
""" Monte Carlo """

""" Epsilon Greedy """
import numpy as np

def f_of_x(x):
    """
    This is the main function we want to integrate over.
    Args:
    - x (float) : input to function; must be in radians
    Return:
    - output of function f(x) (float)
    """
    return (e**(-1*x))/(1+(x-1)**2)


def monte_carlo(env, V, policy, episodes=5000,
                max_steps=100, alpha=0.1, gamma=0.99):
    """
    ***********************************************************
    *************perform the Monte Carlo algorithm*************
    ***********************************************************
    @env: is the openAI environment instance
    @V: is a numpy.ndarray of shape (s,) containing the value estimate
    @policy: is a function that takes in a state and returns the next action to take
    @episodes: is the total number of episodes to train over
    @max_steps: is the maximum number of steps per episode
    @alpha: is the learning rate
    @gamma: is the discount rate
    Returns: V, the updated value estimate
    """

    for ep in range(episodes):
        # Reseting the environment each time as per requirement
        state = env.reset()
        episode = []
        for step in range(max_steps):
            action = policy(state)
            
            # Taking the action and getting the reward and outcome state
            new_state, reward, done, info = env.step(action)
            episode.append([state, action, reward])

            if done:
                break
            state = new_state

        episode = np.array(episode, dtype=int)
        T = len(episode)
        G = 0
        for t in range(T):
            state, action, Returns = episode[t]
            G = gamma**t * G + Returns
            ##G = sum(gamma**k * episode[t][2] for k in range(T-t-1))
            if state not in episode[:ep, 0]:
                V[state] = V[state] + alpha * (G - V[state])
            

    return V




In [137]:
#!/usr/bin/env python3
""" Monte Carlo """

""" Epsilon Greedy """
import numpy as np

# all states
N_STATES = 19

# all states but terminal states
STATES = np.arange(1, N_STATES + 1)

# start from the middle state
START_STATE = 10

# two terminal states
# an action leading to the left terminal state has reward -1
# an action leading to the right terminal state has reward 1
END_STATES = [0, N_STATES + 1]

# true state values from Bellman equation
TRUE_VALUE = np.arange(-20, 22, 2) / 20.0
TRUE_VALUE[0] = TRUE_VALUE[N_STATES + 1] = 0.0

class ValueFunction:
    # @rate: lambda, as it's a keyword in python, so I call it rate
    # @stepSize: alpha, step size for update
    def __init__(self, rate, step_size, V):
        self.rate = rate
        self.step_size = step_size
        self.weights = V# np.zeros(N_STATES + 2)

    # the state value is just the weight
    def value(self, state):
        return self.weights[state]

    # feed the algorithm with new observation
    # derived class should override this function
    def learn(self, state, reward):
        return

    # initialize some variables at the beginning of each episode
    # must be called at the very beginning of each episode
    # derived class should override this function
    def new_episode(self):
        return

class TrueOnlineTemporalDifferenceLambda(ValueFunction):
    def __init__(self, rate, step_size, old_state_value, last_state, s, V):
        ValueFunction.__init__(self, rate, step_size, V)
        # initialize the eligibility trace
        self.eligibility = np.zeros(s)
        # initialize the beginning state
        self.last_state = last_state
        # initialize the old state value
        self.old_state_value = 0.0

    def new_episode(self):
        # initialize the eligibility trace
        self.eligibility = np.zeros(N_STATES + 2)
        # initialize the beginning state
        self.last_state = START_STATE
        # initialize the old state value
        self.old_state_value = 0.0

    def learn(self, state, reward):
        # update the eligibility trace and weights
        last_state_value = self.value(self.last_state)
        state_value = self.value(state)
        dutch = 1 - self.step_size * self.rate * self.eligibility[self.last_state]
        self.eligibility *= self.rate
        self.eligibility[self.last_state] += dutch
        delta = reward + state_value - last_state_value
        self.weights += self.step_size * (delta + last_state_value - self.old_state_value) * self.eligibility
        self.weights[self.last_state] -= self.step_size * (last_state_value - self.old_state_value)
        self.old_state_value = state_value
        self.last_state = state

def monte_carlo(env, V, policy, episodes=5000,
                max_steps=100, alpha=0.1, gamma=0.99):
    """
    ***********************************************************
    *************perform the Monte Carlo algorithm*************
    ***********************************************************
    @env: is the openAI environment instance
    @V: is a numpy.ndarray of shape (s,) containing the value estimate
    @policy: is a function that takes in a state and returns
             the next action to take
    @episodes: is the total number of episodes to train over
    @max_steps: is the maximum number of steps per episode
    @alpha: is the learning rate
    @gamma: is the discount rate
    Returns: V, the updated value estimate
    """

    for ep in range(episodes):
        # Reseting the environment each time as per requirement
        state = env.reset()
        episode = []
        for step in range(max_steps):
            # taking action
            action = policy(state)
            # Taking the action and getting the reward and outcome state
            new_state, reward, done, info = env.step(action)
            # append results for each state of episode
            episode.append([state, action, reward])

            if done:
                break
            state = new_state
        # Cast and turn episode list to np.ndarray
        episode = np.array(episode, dtype=int)
        # initiate needed variabes
        T = len(episode)  # total number of states starting from 0
        G = 0  # empirical return
        for t in range(T):
            state, action, Returns = episode[t]
            # calculate empirical return
            G = gamma**t * G + Returns  # summing returns (rewards)
            # Value Estimation
            if state not in episode[:ep, 0]:
                V[state] = V[state] + alpha * (G - V[state])
    # Returning the updated Value Estimate
    return V



In [3]:
#!/usr/bin/env python3

import gym
import numpy as np
monte_carlo = __import__('0-monte_carlo').monte_carlo

np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
print(V.shape)
np.set_printoptions(precision=2)
env.seed(0)

print(monte_carlo(env, V, policy).reshape((8, 8)))
print(td_lambtha(env, V, policy, 1).reshape((8, 8)))


(64,)
[[ 0.81  0.9   0.48  0.43  0.39  0.43  0.66  0.9 ]
 [ 0.9   0.73  0.59  0.48  0.59  0.28  0.28  0.39]
 [ 1.    0.53  0.73 -1.    1.    0.39  0.28  0.43]
 [ 1.    0.59  0.81  0.9   1.   -1.    0.39  0.66]
 [ 1.    0.66  0.81 -1.    1.    1.    0.73  0.53]
 [ 1.   -1.   -1.    1.    1.    1.   -1.    0.9 ]
 [ 1.   -1.    1.    1.   -1.    1.   -1.    1.  ]
 [ 1.    1.    1.   -1.    1.    1.    1.    1.  ]]
[[ 0.68  0.62  0.19  0.22  0.39  0.43  0.66  0.9 ]
 [ 0.71  0.51  0.24 -0.    0.58  0.28  0.28  0.39]
 [ 0.33  0.41  0.23 -1.    0.91  0.21  0.27  0.38]
 [ 0.38  0.38  0.39  0.6   0.63 -1.    0.39  0.66]
 [ 0.65  0.23  0.19 -1.    1.    1.    0.73  0.53]
 [ 0.62 -1.   -1.    1.    1.    1.   -1.    0.9 ]
 [ 0.56 -1.    1.    1.   -1.    1.   -1.    1.  ]
 [ 0.9   1.    1.   -1.    1.    1.    1.    1.  ]]


In [None]:
[[ 0.81  0.9   0.48  0.43  0.39  0.43  0.66  0.9 ]
 [ 0.9   0.73  0.59  0.48  0.59  0.28  0.28  0.39]
 [ 1.    0.53  0.73 -1.    1.    0.39  0.28  0.43]
 [ 1.    0.59  0.81  0.9   1.   -1.    0.39  0.66]
 [ 1.    0.66  0.81 -1.    1.    1.    0.73  0.53]
 [ 1.   -1.   -1.    1.    1.    1.   -1.    0.9 ]
 [ 1.   -1.    1.    1.   -1.    1.   -1.    1.  ]
 [ 1.    1.    1.   -1.    1.    1.    1.    1.  ]]

In [68]:
import random 
swap_win = no_swap_win = 0 
 
doors = {'a','b','c'} 
 
for trial in range(1, 10001): 
    # Choose which doors win and which lose 
    winning = random.choice(list(doors)) 
    loosing = doors - set(winning) 	  
 
    # Randomly choose which door is picked and which is revealed 
    picked = random.choice(list(doors))   
 
    # Compere never reveals a winning door 
    revealed = random.choice(list(loosing - set(picked))) 
     
    # Randomly choose if the contestant swaps 
    swap = random.choice((True, False)) 
    if swap: 
        # If contestants swaps they choose other than the 
		# one they picked or the compere revealed 
        picked = random.choice(list(doors - set(picked) - set(revealed))) 
 
    # Increment counts 
    if picked == winning: 
        if swap: 
            swap_win += 1 
        else: 
            no_swap_win +=1  
 
print(f'After {trial} trials: Win on Swap {swap_win} - Win when Not swap {no_swap_win}') 

After 10000 trials: Win on Swap 3316 - Win when Not swap 1661


In [49]:
np.random.choice((True, False)) 

False

In [139]:
#!/usr/bin/env python3
""" Monte Carlo """

""" Epsilon Greedy """
import numpy as np

# all states
N_STATES = 19

# all states but terminal states
STATES = np.arange(1, N_STATES + 1)

# start from the middle state
START_STATE = 10

# two terminal states
# an action leading to the left terminal state has reward -1
# an action leading to the right terminal state has reward 1
END_STATES = [0, N_STATES + 1]

# true state values from Bellman equation
TRUE_VALUE = np.arange(-20, 22, 2) / 20.0
TRUE_VALUE[0] = TRUE_VALUE[N_STATES + 1] = 0.0

class ValueFunction:
    # @rate: lambda, as it's a keyword in python, so I call it rate
    # @stepSize: alpha, step size for update
    def __init__(self, rate, step_size, V):
        self.rate = rate
        self.step_size = step_size
        self.weights = V# np.zeros(N_STATES + 2)

    # the state value is just the weight
    def value(self, state):
        return self.weights[state]

    # feed the algorithm with new observation
    # derived class should override this function
    def learn(self, state, reward):
        return

    # initialize some variables at the beginning of each episode
    # must be called at the very beginning of each episode
    # derived class should override this function
    def new_episode(self):
        return

class TrueOnlineTemporalDifferenceLambda(ValueFunction):
    def __init__(self, rate, step_size, old_state_value, last_state, s, V):
        ValueFunction.__init__(self, rate, step_size, V)
        # initialize the eligibility trace
        self.eligibility = np.zeros(s)
        # initialize the beginning state
        self.last_state = last_state
        # initialize the old state value
        self.old_state_value = 0.0

    def new_episode(self):
        # initialize the eligibility trace
        self.eligibility = np.zeros(N_STATES + 2)
        # initialize the beginning state
        self.last_state = START_STATE
        # initialize the old state value
        self.old_state_value = 0.0

    def learn(self, state, reward):
        # update the eligibility trace and weights
        last_state_value = self.value(self.last_state)
        state_value = self.value(state)
        dutch = 1 - self.step_size * self.rate * self.eligibility[self.last_state]
        self.eligibility *= self.rate
        self.eligibility[self.last_state] += dutch
        delta = reward + state_value - last_state_value
        self.weights += self.step_size * (delta + last_state_value - self.old_state_value) * self.eligibility
        self.weights[self.last_state] -= self.step_size * (last_state_value - self.old_state_value)
        self.old_state_value = state_value
        self.last_state = state

#!/usr/bin/env python3
""" TD(λ) """
import numpy as np


def td_lambtha(env, V, policy, lambtha=1, episodes=5000,
               max_steps=100, alpha=0.1, gamma=0.99):
    """
    ************************************************************
    ****************performs the TD(λ) algorithm****************
    ************************************************************
    @env: is the openAI environment instance
    @V: is a numpy.ndarray of shape (s,) containing the value estimate
    @policy: is a function that takes in a state and returns the next
             action to take
    @lambtha: is the eligibility trace factor
    @episodes: is the total number of episodes to train over
    @max_steps: is the maximum number of steps per episode
    @alpha: is the learning rate
    @gamma: is the discount rate
    Returns: V, the updated value estimate
    """
    for ep in range(episodes):
        # Reseting the environment
        state = env.reset()
        episode = []
        for step in range(max_steps):
            # Taking action
            action = policy(state)
            # Getting the reward and outcome state
            new_state, reward, done, info = env.step(action)
            # Appending results for each state of episode
            episode.append([state, action, reward, new_state])
            if done:
                break
            # Incrementing the satete
            state = new_state
        # Cast and turn episode list to np.ndarray
        episode = np.array(episode, dtype=int)
        # initiate needed variabes
        T = len(episode)  # total number of states starting from 0
        G = 0  # empirical return
        n = 1  # number of steps
        for t in range(T):
            state, action, Returns, new_state = episode[t]
            # calculate Gt de n step
            G = gamma**t * G + Returns  # summing returns (rewards)
            Gtn = G + gamma**(n) * V[new_state]  # adding V(s of t+1)
            # calculate Gtn lambda by weights decay:
            #                             a factor λ with n,  λ^(n−1)
            Gtnlamda = (1 - lambtha) * (Gtn + lambtha**(n - 1))  # λ-return
            # Value Estimation
            if state not in episode[:ep, 0]:
                V[state] = (1 - alpha) * (V[state] + Gtnlamda)
                V[state] = V[state] + alpha * (Gtnlamda - V[state])
                V[state] = V[state] + alpha * (Returns + gamma
                                               * V[new_state] - V[state])
            n += 1
    # Returning the updated Value Estimate
    return V


In [1]:
#!/usr/bin/env python3

import gym
import numpy as np
td_lambtha = __import__('1-td_lambtha').td_lambtha

np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64') 
np.set_printoptions(precision=4)
print(td_lambtha(env, V, policy, 0.8, episodes=500, max_steps=25, alpha=0.07, gamma=0.95).reshape((8, 8)))

[[ 0.9137  0.9509  0.8348  0.8438  0.6991  0.875   0.6712  0.7066]
 [ 0.5792  0.7919  0.7782  0.6965  0.7106  0.8752  0.9143  1.    ]
 [ 0.7902  0.6431  0.8612 -1.      0.7434  0.9372  0.9369  1.    ]
 [ 0.6604  0.5058  0.6626  0.6149  0.7395 -1.      0.8198  0.7641]
 [ 0.7059  0.393   0.4348 -1.      0.9327  1.      0.7935  1.    ]
 [ 1.     -1.     -1.      1.      1.      1.     -1.      1.    ]
 [ 1.     -1.      1.      1.     -1.      1.     -1.      1.    ]
 [ 1.      1.      1.     -1.      1.      1.      1.      1.    ]]


In [2]:
#!/usr/bin/env python3

import gym
import numpy as np
#td_lambtha = __import__('1-td_lambtha').td_lambtha

np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64') 
np.set_printoptions(precision=4)
#env.seed(123)
print(td_lambtha(env, V, policy, 0.9).reshape((8, 8)))


[[ 0.8603  0.9059  1.      1.      1.      1.      1.      1.    ]
 [ 0.6211  0.777   0.8545  0.9174  0.9172  0.917   1.      1.    ]
 [ 0.5552  0.5735  0.4333 -1.      1.      0.6374  0.9079  1.    ]
 [ 0.6758  0.6307  0.641   0.3649  0.8372 -1.      1.      1.    ]
 [ 0.6888  0.7199  0.2744 -1.      0.9198  0.7056  1.      1.    ]
 [ 0.4074 -1.     -1.      1.      1.      1.     -1.      1.    ]
 [ 0.9194 -1.      1.      1.     -1.      1.     -1.      1.    ]
 [ 0.7191  0.705   1.     -1.      1.      1.      1.      1.    ]]


In [107]:
def td_lambtha(env, V, policy, lambtha, episodes=5000,
               max_steps=100, alpha=0.1, gamma=0.99):
    """
    ***********************************************************
    *************perform the Monte Carlo algorithm*************
    ***********************************************************
    @env: is the openAI environment instance
    @V: is a numpy.ndarray of shape (s,) containing the value estimate
    @policy: is a function that takes in a state and returns the next action to take
    @lambtha: is the eligibility trace factor
    @episodes: is the total number of episodes to train over
    @max_steps: is the maximum number of steps per episode
    @alpha: is the learning rate
    @gamma: is the discount rate
    Returns: V, the updated value estimate
    """
    s = V.shape[0]
    
    for ep in range(episodes):
        # Reseting the environment each time as per requirement
        state = env.reset()
        episode = []
        R = 0
        t = 0
        z = 0
        states = []
        eligibility = [0] * s
        for step in range(max_steps):
            action = policy(state)
            # Taking the action and getting the reward and outcome state
            new_state, reward, done, info = env.step(action)
            eligibility[state] += 1
            target = reward + gamma * V[new_state] - V[state]
            eligibility[state]  *= gamma * lambtha
            V[state] = alpha * target * eligibility[state] 
            
            if done:
                break
            state = new_state
         
    return V.round(4)

In [74]:
def epsilon_greedy(Q, state, epsilon):
    """
    ******************************************************
    ***********uses epsilon-greedy to determine***********
    *******************the next action********************
    ******************************************************
    @Q: is a numpy.ndarray containing the q-table
    @state: is the current state
    @epsilon: is the epsilon to use for the calculation
    *** You should sample p with numpy.random.uniformn to determine
        if your algorithm should explore or exploit
    *** If exploring, you should pick the next action with
        numpy.random.randint from all possible actions
    Returns:
            the next action index
    """
    p = np.random.uniform()
    if p < epsilon:
        index = np.random.randint(Q.shape[1])
    else:
        index = np.argmax(Q[state])

    return index


def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100,
                  alpha=0.1, gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    
    for i in range(episodes):
        done = False
        state = env.reset()
        action = epsilon_greedy(Q, state, epsilon=epsilon)
        for step in range(max_steps):
            new_state, reward, done, info = env.step(action)
            new_action = epsilon_greedy(Q, new_state, epsilon=epsilon)
            Q[state, action] = (((1-lambtha )* Q[state, action])
                                + alpha * (reward + gamma * Q[new_state, new_action]
                                           - Q[state, action]))
        if done:
            break
        state = new_state
        action = new_action

    return Q

In [75]:
#!/usr/bin/env python3

import gym
import numpy as np
#sarsa_lambtha = __import__('2-sarsa_lambtha').sarsa_lambtha

np.random.seed(0)
env = gym.make('FrozenLake8x8-v0')
Q = np.random.uniform(size=(64, 4))
np.set_printoptions(precision=4)
print(sarsa_lambtha(env, Q, 0.9))

[[0.069  0.7152 0.0575 0.0883]
 [0.4237 0.6459 0.4376 0.8918]
 [0.9637 0.3834 0.7917 0.5289]
 [0.568  0.9256 0.071  0.0871]
 [0.0202 0.8326 0.7782 0.87  ]
 [0.9786 0.7992 0.4615 0.7805]
 [0.1183 0.6399 0.1434 0.9447]
 [0.5218 0.4147 0.2646 0.7742]
 [0.4562 0.5684 0.0188 0.6176]
 [0.6121 0.6169 0.9437 0.6818]
 [0.3595 0.437  0.6976 0.0602]
 [0.6668 0.6706 0.2104 0.1289]
 [0.3154 0.3637 0.5702 0.4386]
 [0.9884 0.102  0.2089 0.1613]
 [0.6531 0.2533 0.4663 0.2444]
 [0.159  0.1104 0.6563 0.1382]
 [0.1966 0.3687 0.821  0.0971]
 [0.8379 0.0961 0.9765 0.4687]
 [0.9768 0.6048 0.7393 0.0392]
 [0.2828 0.1202 0.2961 0.1187]
 [0.318  0.4143 0.0641 0.6925]
 [0.5666 0.2654 0.5232 0.0939]
 [0.5759 0.9293 0.3186 0.6674]
 [0.1318 0.7163 0.2894 0.1832]
 [0.5865 0.0201 0.8289 0.0047]
 [0.6778 0.27   0.7352 0.9622]
 [0.2488 0.5762 0.592  0.5723]
 [0.2231 0.9527 0.4471 0.8464]
 [0.6995 0.2974 0.8138 0.3965]
 [0.8811 0.5813 0.8817 0.6925]
 [0.7253 0.5013 0.9561 0.644 ]
 [0.4239 0.6064 0.0192 0.3016]
 [0.6602