In [2]:
"""
Solving FrozenLake8x8 environment using Policy iteration.
Author : Moustafa Alzantot (malzantot@ucla.edu)
"""
import numpy as np
import gym
from gym import wrappers


def run_episode(env, policy, gamma = 1.0, render = False):
    """ Runs an episode and return the total reward """
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        obs, reward, done , _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_idx * reward)
        step_idx += 1
        if done:
            break
    return total_reward


In [3]:
def evaluate_policy(env, policy, gamma = 1.0, n = 100):
    scores = [run_episode(env, policy, gamma, False) for _ in range(n)]
    return np.mean(scores)

In [2]:
def extract_policy(v, gamma = 1.0):
    """ Extract the policy given a value-function """
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        q_sa = np.zeros(env.nA)
        for a in range(env.nA):
            q_sa[a] = sum([p * (r + gamma * v[s_]) for p, s_, r, _ in  env.P[s][a]])
        policy[s] = np.argmax(q_sa)   #贪婪式的提高方法
    return policy

In [5]:
def compute_policy_v(env, policy, gamma=1.0):
    """ Iteratively evaluate the value-function under policy.
    Alternatively, we could formulate a set of linear equations in iterms of v[s] 
    and solve them to find the value function.
    """
    v = np.zeros(env.nS)
    eps = 1e-10
    while True:
        prev_v = np.copy(v)
        for s in range(env.nS):
            policy_a = policy[s]
            v[s] = sum([p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][policy_a]])
        if (np.sum((np.fabs(prev_v - v))) <= eps):
            # value converged
            break
    return v

In [6]:
def policy_iteration(env, gamma = 1.0):
    """ Policy-Iteration algorithm """
    policy = np.random.choice(env.nA, size=(env.nS))  # initialize a random policy
    max_iterations = 200000
    gamma = 1.0
    for i in range(max_iterations):
        old_policy_v = compute_policy_v(env, policy, gamma)
        new_policy = extract_policy(old_policy_v, gamma)
        if (np.all(policy == new_policy)):
            print ('Policy-Iteration converged at step %d.' %(i+1))
            break
        policy = new_policy
    return policy

In [7]:
if __name__ == '__main__':
    env_name  = 'FrozenLake8x8-v0'
    env = gym.make(env_name)
    optimal_policy = policy_iteration(env, gamma = 1.0)
    scores = evaluate_policy(env, optimal_policy, gamma = 1.0)
    print('Average scores = ', np.mean(scores))

Policy-Iteration converged at step 12.
Average scores =  0.9


In [None]:
import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt

def softmax(x):
    e_x = np.exp(x - np.max(x))
    out = e_x / e_x.sum()
    return out


def policy_value():
    with tf.variable_scope("policy_value"):
        state = tf.placeholder("float",[None,4])

        #newvals is future reward
        newvals = tf.placeholder("float",[None,1])

        w1 = tf.get_variable("w1",[4,10])
        b1 = tf.get_variable("b1",[10])

        h1 = tf.nn.relu(tf.matmul(state,w1) + b1)
        w2 = tf.get_variable("w2",[10,2])
        b2 = tf.get_variable("b2",[2])

        w3 = tf.get_variable("w3",[10,1])
        b3 = tf.get_variable("b3",[1])

        #policy gradient
        calculated = tf.matmul(h1,w2) + b2
        probabilities = tf.nn.softmax(calculated)

        actions = tf.placeholder("float",[None,2])
        advantages = tf.placeholder("float",[None,1])

        good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1])
        eligibility = tf.log(good_probabilities) * advantages
        loss1 = -tf.reduce_sum(eligibility)

        #value gradient
        calculated1 = tf.matmul(h1,w3) + b3
        diffs = calculated1 - newvals
        loss2 = tf.nn.l2_loss(diffs)

        #policy loss + value loss
        loss = loss1+loss2

        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)#AdamOptimizer

        return probabilities,calculated1, actions,state,advantages, newvals, optimizer, loss1,loss2

def run_episode(env, policy_value, sess,is_train = True):    
    p_probabilities,v_calculated,p_actions, pv_state, p_advantages, v_newvals, pv_optimizer,loss1,loss2 = policy_value

    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []


    for _ in range(200):
        # calculate policy
        obs_vector = np.expand_dims(observation, axis=0)
        #calculate action according to current state
        probs = sess.run(p_probabilities,feed_dict={pv_state: obs_vector})

        action = 1 if probs[0][0]<probs[0][1] else 0
        #take a random action when training
        if is_train:
            action = 0 if random.uniform(0,1) < probs[0][0] else 1
        # record the transition
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)
        # take the action in the environment
        old_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation, action, reward))
        totalreward += reward

        if done:
            break
    #return totalreward if it is testing
    if not is_train:
        return totalreward

    #training
    for index, trans in enumerate(transitions):
        obs, action, reward = trans

        # calculate discounted monte-carlo return
        future_reward = 0
        future_transitions = len(transitions) - index
        decrease = 1
        for index2 in range(future_transitions):
            future_reward += transitions[(index2) + index][2] * decrease
            decrease = decrease * 0.97
        obs_vector = np.expand_dims(obs, axis=0)
        #value function: calculate max reward under current state 
        currentval = sess.run(v_calculated,feed_dict={pv_state: obs_vector})[0][0]

        # advantage: how much better was this action than normal
        # 根据实际数据得到future_reward比值函数计算出来的reward要好多少
        # 训练到后来,这个currentval:即在当前reward会估计的比较准确,在当前state下能够获得的
        # 最大reward或者平均reward,而有了这个估计,用实际的reward减去这个reward,就可以判断这个
        # action的好坏,即这个currentval是训练时用来评估某个action的好坏
        # 用future_reward减去这个最大reward,就得到了这个action
        # 对应的label,如果比估计的值更大,那说明要根据该参数进行更新,如果比该值小,那说明
        # 达不到平均水平,那么将将该action对应的梯度进行反向更新(相减为负值),使得下次碰到这个
        # 类似的state的时候,不再采取这个action
        advantages.append(future_reward - currentval)

        #advantages.append(future_reward-2.0)

        update_vals.append(future_reward)

    # update value function
    update_vals_vector = np.expand_dims(update_vals, axis=1)

    advantages_vector = np.expand_dims(advantages, axis=1)
    #train network
    _,print_loss1,print_loss2 = sess.run([pv_optimizer,loss1,loss2], feed_dict={pv_state: states,v_newvals: update_vals_vector, p_advantages: advantages_vector, p_actions: actions})

    print("policy loss ",print_loss1)
    print("value loss ",print_loss2)
    return totalreward


env = gym.make('CartPole-v0')

PolicyValue = policy_value()

sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())

for i in range(1500):
    reward = run_episode(env, PolicyValue, sess)

t = 0
for _ in range(1000):
    #env.render()
    reward = run_episode(env, PolicyValue, sess,False)
    t += reward
print(t / 1000)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
policy loss  455.7054
value loss  239.71481
policy loss  729.7158
value loss  364.66998
policy loss  423.19095
value loss  239.75842
policy loss  536.0046
value loss  296.5809
policy loss  2068.702
value loss  909.6444
policy loss  228.42177
value loss  141.64803
policy loss  1276.2365
value loss  591.4498
policy loss  406.60703
value loss  231.26395
policy loss  4734.3013
value loss  1761.7063
policy loss  52711.68
value loss  13684.197
policy loss  2488.8054
value loss  1017.84656
policy loss  36231.516
value loss  9388.404
policy loss  4091.1638
value loss  1567.7795
policy loss  2563.1375
value loss  989.8098
policy loss  655.36816
value loss  338.60156
policy loss  1055.596
value loss  484.67856
policy loss  1422.6884
value loss  659.66394
policy loss  1693.0361
value loss  758.8818
policy loss  13520.119
value loss  4215.286
policy loss  2908.982
value loss  124

policy loss  245426.84
value loss  29143.457
policy loss  288994.62
value loss  31279.285
policy loss  319284.56
value loss  33004.57
policy loss  126710.2
value loss  18089.889
policy loss  35190.6
value loss  6679.211
policy loss  140048.36
value loss  19387.969
policy loss  416833.7
value loss  38538.773
policy loss  142559.1
value loss  18380.648
policy loss  201327.6
value loss  23562.303
policy loss  433.1053
value loss  426.97437
policy loss  167553.9
value loss  20323.54
policy loss  70951.26
value loss  10862.06
policy loss  221483.72
value loss  23020.922
policy loss  931.21985
value loss  649.1178
policy loss  315.0408
value loss  804.76556
policy loss  98940.57
value loss  13229.162
policy loss  412985.75
value loss  35873.973
policy loss  178004.88
value loss  20362.121
policy loss  -1550.1754
value loss  521.87854
policy loss  361983.88
value loss  32894.97
policy loss  673.65497
value loss  899.24316
policy loss  173149.36
value loss  18181.941
policy loss  369503.3
valu

policy loss  61552.797
value loss  5059.647
policy loss  63914.797
value loss  4446.4336
policy loss  34402.293
value loss  3430.6406
policy loss  38616.758
value loss  3574.8936
policy loss  39750.234
value loss  3445.272
policy loss  60547.88
value loss  4230.996
policy loss  42015.953
value loss  3282.9978
policy loss  47577.875
value loss  3029.491
policy loss  31305.172
value loss  2921.6365
policy loss  45486.438
value loss  4022.7593
policy loss  75002.336
value loss  3514.142
policy loss  60548.766
value loss  3264.9358
policy loss  9897.929
value loss  4505.791
policy loss  80043.42
value loss  3474.5352
policy loss  77281.36
value loss  3473.1765
policy loss  -13447.509
value loss  4835.0103
policy loss  34398.457
value loss  3400.121
policy loss  8524.377
value loss  3683.1455
policy loss  63753.734
value loss  3100.051
policy loss  32217.354
value loss  3376.3804
policy loss  45912.008
value loss  3143.0178
policy loss  34284.28
value loss  2552.0852
policy loss  48697.2
va

policy loss  -38337.07
value loss  5538.09
policy loss  -11838.429
value loss  1216.3196
policy loss  -13591.449
value loss  1306.9736
policy loss  -8879.525
value loss  2050.327
policy loss  -6604.8438
value loss  1376.0375
policy loss  -7190.413
value loss  6587.9175
policy loss  10141.177
value loss  1585.7721
policy loss  26996.943
value loss  2225.5627
policy loss  36455.074
value loss  1433.0994
policy loss  3390.3784
value loss  2239.7664
policy loss  -1379.7822
value loss  1497.6858
policy loss  -36822.83
value loss  1965.3881
policy loss  30768.33
value loss  1665.269
policy loss  32894.504
value loss  1943.2958
policy loss  44932.42
value loss  2074.442
policy loss  26053.533
value loss  2871.0713
policy loss  31239.0
value loss  2060.7087
policy loss  -164.5542
value loss  2184.4312
policy loss  -20937.969
value loss  2126.6472
policy loss  24438.35
value loss  1758.4548
policy loss  38585.03
value loss  2985.2812
policy loss  5196.7944
value loss  2035.6416
policy loss  370

policy loss  -12471.467
value loss  2003.8936
policy loss  31428.309
value loss  2272.6655
policy loss  35409.684
value loss  3210.69
policy loss  13344.5625
value loss  1823.5511
policy loss  11744.945
value loss  1549.1967
policy loss  -6146.5283
value loss  2077.3801
policy loss  -10699.274
value loss  1499.0967
policy loss  21243.799
value loss  2931.6526
policy loss  10757.371
value loss  2064.2617
policy loss  -25385.0
value loss  1316.5962
policy loss  -14749.094
value loss  2117.1445
policy loss  -48371.39
value loss  5584.539
policy loss  3596.3262
value loss  2164.8347
policy loss  -13807.418
value loss  2016.255
policy loss  3721.2876
value loss  1959.6753
policy loss  -8117.9375
value loss  2443.764
policy loss  -16192.992
value loss  2679.0762
policy loss  -10919.714
value loss  2222.5728
policy loss  -47999.703
value loss  2596.1477
policy loss  -27065.049
value loss  2359.9646
policy loss  -15445.928
value loss  3127.6582
policy loss  -25363.896
value loss  2749.9622
pol

policy loss  15538.973
value loss  1238.1096
policy loss  40464.35
value loss  2299.1064
policy loss  5313.1685
value loss  1297.8636
policy loss  31417.412
value loss  1736.028
policy loss  22525.152
value loss  2209.635
policy loss  29411.504
value loss  1635.9905
policy loss  -1809.8135
value loss  1476.0499
policy loss  -13016.425
value loss  1613.2327
policy loss  -4939.502
value loss  911.77057
policy loss  -25195.193
value loss  1560.2009
policy loss  -10484.551
value loss  2261.2424
policy loss  -19825.49
value loss  2253.981
policy loss  29885.75
value loss  2074.8938
policy loss  1281.6294
value loss  2105.8618
policy loss  -2523.6396
value loss  1277.2812
policy loss  28322.494
value loss  2168.77
policy loss  6524.8174
value loss  2483.687
policy loss  49180.96
value loss  2745.9492
policy loss  32199.94
value loss  1929.9602
policy loss  50023.844
value loss  3130.6182
policy loss  22624.98
value loss  1244.044
policy loss  28534.15
value loss  2059.921
policy loss  20632.