In [1]:
import numpy as np
import gym
from gym import wrappers


def run_episode(env, policy, gamma = 1.0, render = False):
    """Roll out one episode under a fixed policy and return its discounted return.

    args:
    env: environment exposing reset(), step(action) and render().
    policy: sequence indexed by observation; each entry is cast to int
        and used as the action.
    gamma: discount factor; the reward at step t is weighted by gamma ** t.
    render: when True, env.render() is called before every step.

    returns:
    the discounted sum of rewards collected until the environment reports done
    """
    state = env.reset()
    episode_return = 0
    t = 0
    finished = False
    while not finished:
        if render:
            env.render()
        # Look up the action the policy prescribes for the current observation.
        action = int(policy[state])
        state, reward, finished, _ = env.step(action)
        episode_return += (gamma ** t) * reward
        t += 1
    return episode_return


In [2]:
def evaluate_policy(env, policy, gamma = 1.0,  n = 100):
    """Estimate a policy's quality by averaging the return of n episodes.

    args:
    env: environment passed through to run_episode.
    policy: policy passed through to run_episode.
    gamma: discount factor applied inside each episode.
    n: number of episodes to run.

    returns:
    mean of the n episode returns (rendering is always disabled)
    """
    episode_returns = []
    for _ in range(n):
        episode_return = run_episode(env, policy, gamma = gamma, render = False)
        episode_returns.append(episode_return)
    return np.mean(episode_returns)

In [3]:
def extract_policy(v, gamma = 1.0, env = None):
    """Extract the greedy policy for a given state-value function.

    For every state the one-step lookahead value of each action is computed
    from the transition model and the action with the largest value is kept.

    args:
    v: array indexed by state holding the state values.
    gamma: discount factor applied to the value of the successor state.
    env: environment exposing nS, action_space.n and P, where P[s][a] is an
        iterable of (probability, next state, reward, done) tuples.
        When omitted, the module-level variable named ``env`` is used, which
        keeps existing ``extract_policy(v, gamma)`` calls working.

    returns:
    float array of length env.nS whose entry s is the index of the greedy
    action in state s (ties resolve to the lowest action index)

    raises:
    NameError: if env is not given and no module-level ``env`` exists.
    """
    if env is None:
        # Backward-compatible fallback: older callers rely on a global `env`.
        env = globals().get('env')
        if env is None:
            raise NameError(
                "extract_policy: pass env=... or define a module-level 'env'")

    n_actions = env.action_space.n
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        q_sa = np.zeros(n_actions)
        for a in range(n_actions):
            for next_sr in env.P[s][a]:
                # next_sr is a tuple of (probability, next state, reward, done)
                p, s_, r, _ = next_sr
                # Accumulate the expected one-step return of action a.
                q_sa[a] += (p * (r + gamma * v[s_]))
        # Keep the action with the highest Q-value in this state.
        policy[s] = np.argmax(q_sa)
    return policy

In [4]:
def value_iteration(env, gamma = 1.0):
    """Value-iteration algorithm.

    Repeatedly applies the Bellman optimality backup
        v[s] = max_a sum_{s'} p * (r + gamma * v_prev[s'])
    to every state, using the values of the previous sweep, until the total
    absolute change of a sweep drops to eps or max_iterations is reached.

    args:
    env: environment exposing nS, nA and P, where P[s][a] is an iterable of
        (probability, next state, reward, done) tuples.
    gamma: discount factor applied to the value of the successor state.

    returns:
    array of length env.nS with the state values of the last sweep
    """
    v = np.zeros(env.nS)      # start from all-zero state values
    max_iterations = 100000   # hard cap on the number of sweeps
    eps = 1e-20               # convergence threshold on the summed |delta v|
    for i in range(max_iterations):
        # Snapshot so every state in this sweep is backed up from the same values.
        prev_v = np.copy(v)
        for s in range(env.nS):
            # Expected return of each action; gamma discounts the successor value.
            q_sa = [
                sum([p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][a]])
                for a in range(env.nA)
            ]
            v[s] = max(q_sa)
        # Stop once a full sweep no longer changes the value function.
        if (np.sum(np.fabs(prev_v - v)) <= eps):
            print ('Value-iteration converged at iteration# %d.' %(i+1))
            break
    return v


In [5]:
if __name__ == '__main__':
    # Driver: solve the environment with value iteration, derive the greedy
    # policy from the resulting values, then score that policy over 100 episodes.
    env_name  = 'FrozenLake8x8-v0'
    gamma = 1.0  # undiscounted returns
    # NOTE: `env` must remain a module-level name; extract_policy called as
    # extract_policy(v, gamma) reads the environment from this global.
    env = gym.make(env_name)
    optimal_v = value_iteration(env, gamma);
    policy = extract_policy(optimal_v, gamma)
    policy_score = evaluate_policy(env, policy, gamma, n=100)
    print('Policy average score = ', policy_score)

Value-iteration converged at iteration# 2357.
Policy average score =  0.82
