In [1]:
from lib.util import *
from lib.policy import *
from lib.mdp import *
from lib.mrp import *
from lib.env import *
from sklearn.linear_model import LinearRegression

In [2]:
n = 5
gamma = 0.95

In [3]:
P = generate_stochastic_matrix(n)
R = generate_reward_vector(n)
mrp = MRP(P, R, gamma)
mdp = MDP(gamma, [mrp]*n)
Q = generate_stochastic_matrix(n)
policy = Policy(Q)

In [4]:
env = Env(mdp)

# Linear model

In [5]:
from lib.linear_model import *

In [6]:
lm = LinearModel()
state = 9
print(lm.feature_extractor(state))
lm.update(2, 3)
print(lm.weights)
print(lm.predict(6))

[ 1  9 81]
[0.03 0.06 0.12]
4.71


# Monte-Carlo Prediction algorithm with Value Function approximation

- Return $G_t$ is an unbiased, noisy sample of true value $v_\pi(S_t)$
- Can therefore apply supervised learning to “training data”:
$$<S_1, G_1>, <S_2, G_2>, \dots <S_T, G_T>$$
- For example, using linear Monte-Carlo policy evaluation
$$∆w = \alpha(G_t − \hat{v}(S_t, w))\nabla_w \hat{v}(S_t, w)= \alpha(G_t − \hat{v}(S_t, w))x(S_t)$$
- Monte-Carlo evaluation converges to a local optimum
- Even when using non-linear value function approximation

In [7]:
def feature_extractor(state, n_features = 3):
    return np.array([state**k for k in range(n_features)])

Experimented with directly using supervised learning.

In [8]:
def monte_carlo(env, policy, n_episodes=5, gamma = 0.9):
    """
    This is an every visit Monte Carlo
    """
    state_counter = defaultdict(int)
    states = []
    Y = []
    for episode in range(n_episodes):
        memory = []
        state = env.reset()
        done = False
        while not done:
            action = policy.sample_action(state)
            state, reward, done, _ = env.step(state, action)
            memory.append((state, action, reward))
        
        for i in range(len(memory)):
            state, _, reward = memory[i]
            G = discounted_return(memory[i:], gamma)
            states.append(state)
            Y.append(G)
    return states,np.array(Y)

In [9]:
def get_X(states):
    res = []
    for s in states:
        res.append(feature_extractor(s))
    return np.array(res)

In [10]:
states, Y = monte_carlo(env, policy)
X = get_X(states)
model = LinearRegression()
model.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

The model diverges, which is a usual issue when using linear model approximator with RL.

In [11]:
model.predict([[0, 1, 2]])

array([20.25902607])

We now use the previous template of linear models to perform step by step gradient descent updates.

In [12]:
def monte_carlo(env, policy, n_episodes=5, gamma = 0.9):
    """
    This is an every visit Monte Carlo
    """
    state_counter = defaultdict(int)
    lm = LinearModel()
    for episode in range(n_episodes):
        memory = []
        state = env.reset()
        done = False
        while not done:
            action = policy.sample_action(state)
            state, reward, done, _ = env.step(state, action)
            memory.append((state, action, reward))
        
        for i in range(len(memory)):
            state, _, reward = memory[i]
            G = discounted_return(memory[i:], gamma)
            lm.update(state, G)
    return lm

In [13]:
lm = monte_carlo(env, policy)

In [14]:
lm.weights

array([  7.46502395, -19.75451062,   0.5452306 ])

# 1-step TD Prediction algorithm with Value Function approximation

We use another target: $$R_{t+1} + \gamma \hat{v}(S_{t+1}, w)$$

In [15]:
def td_zero(env, policy, n=10, gamma = 0.9, alpha = 0.9):
    lm = LinearModel()
    for _ in range(n):
        state = env.reset()
        done = False
        while not done:
            action = policy.sample_action(state)
            next_state, reward, done, _ = env.step(state, action)
            target = reward + gamma * lm.predict(next_state)
            lm.update(state, target)
            state = next_state
    return lm

# Eligibility-Traces-based TD(lambda) Prediction algorithm with Value Function approximation

We use $G_t^{\lambda}$ as a target here

In [16]:
def forward_td_lambda(env, policy, n_episodes=10, lambd=0.9, gamma=0.9, alpha = 0.9):
    def n_step_return(memory, n, gamma, end_val):
        return sum([gamma**k * memory[k][1] for k in range(n)])+ gamma**n*end_val
    lm = LinearModel()
    for episode in range(n_episodes):
        memory = []
        states = set()
        state = env.reset()
        done = False
        while not done:
            action = policy.sample_action(state)
            state, reward, done, _ = env.step(state, action)
            states.add(state)
            memory.append((state, action, reward))
    
        for i in range(len(memory)):
            s, _, _ = memory[i]
            gt_lambda = (1-lambd)*sum([lambd**(k-1) * n_step_return(memory[i:], k, gamma, state_value[memory[i+k][0]]) for k in range(len(memory)-i-1)])
            lm.update(s, gt_lambda)   
    return state_value

# SARSA with Value Function approximation

In [17]:
from lib.epsilon_greedy_policy import *

In [18]:
def sarsa(env, n_episodes, alpha=0.1):
    policy = EpsilonPolicy(action_number=env.mdp.action_number)
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy.sample_action(state)
            next_state, reward, done, _ = env.step(state, action)
            next_action = policy.sample_action(next_state)
            target = reward + gamma*policy.approximator.predict(next_state, next_action) - policy.approximator.predict(state, action)
            policy.update(state,target,action)
            action, state = next_action, next_state
    return policy

In [None]:
policy = sarsa(env, 5)

In [20]:
policy.approximator.predict(2, 0)

nan

# Q-learning with Value Function approximation

In [21]:
def qLearning(env, n_episodes, alpha=0.1):
    policy = EpsilonPolicy(action_number=env.mdp.action_number)
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy.sample_action(state)
            next_state, reward, done, _ = env.step(state, action)
            next_action = policy.sample_action(next_state)
            target = reward + gamma*max([policy.approximator.predict(next_state, a) for a in range(env.mdp.action_number)]) - policy.approximator.predict(state, action)
            policy.update(state,target,action)
            action, state = next_action, next_state
    return policy

In [None]:
policy = qLearning(env, 100)

In [23]:
policy.approximator.predict(2,0)

nan