## Policy comparisons

In [3]:
!pip install gym

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/d2/88/a7186ffe1f33570ad3b8cd635996e5a3e3e155736e180ae6a2ad5e826a60/gym-0.15.3.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 20.4MB/s ta 0:00:01
Collecting pyglet<=1.3.2,>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 25.5MB/s ta 0:00:01
[?25hCollecting cloudpickle~=1.2.0 (from gym)
  Downloading https://files.pythonhosted.org/packages/c1/49/334e279caa3231255725c8e860fa93e72083567625573421db8875846c14/cloudpickle-1.2.2-py2.py3-none-any.whl
Collecting future (from pyglet<=1.3.2,>=1.2.0->gym)
[?25l  Downloading https://files.pythonhosted.org/packages/3f/bf/57733d44afd0cf67580658507bd11d3ec629612d5e0e432beb4b8f6fbb04/future-0.18.1.tar.gz (828kB)
[K    100% |████████████████████████████████| 829kB 30.6M

In [4]:
import numpy
import time
import gym 

### Random policy search: pick the best among randomly generated policies

In [6]:
# Run one episode with a fixed policy
def execute(env, policy, episodeLength=100, render=False):
    """
    Play a single episode and return the undiscounted sum of rewards.

    Args:
      env: OpenAI gym env (must provide reset(), step(action) and render()).
      policy: indexable by state; policy[state] is the action to take.
      episodeLength: maximum number of steps before the episode is cut off.
      render: boolean to turn rendering on/off.

    Returns:
      The sum of the rewards collected until `done` or the step limit.
    """
    state = env.reset()
    episode_return = 0
    for _ in range(episodeLength):
        if render:
            env.render()
        state, reward, done, _info = env.step(policy[state])
        episode_return += reward
        if done:
            # Episode finished early; nothing more to collect.
            return episode_return
    return episode_return

In [7]:
# Score a policy by its average episode return
def evaluatePolicy(env, policy, n_episodes=100):
    """
    Run `policy` for `n_episodes` episodes via execute() and return the
    mean of the per-episode total rewards.
    """
    episode_returns = (execute(env, policy) for _ in range(n_episodes))
    # Start the sum at 0.0 so the result is a float even for integer rewards.
    return sum(episode_returns, 0.0) / n_episodes

In [8]:
# Function for a random policy
def gen_random_policy(n_states=16, n_actions=4):
    """
    Draw a uniformly random deterministic policy.

    Args:
      n_states: number of states; the length of the returned array.
      n_actions: number of actions; each entry is drawn from range(n_actions).

    Returns:
      1-D integer array of length n_states where entry s is the action
      chosen in state s. The defaults (16 states, 4 actions) reproduce the
      sizes that were previously hard-coded.
    """
    return numpy.random.choice(n_actions, size=n_states)

In [9]:
# Generate GymAI environment
env = gym.make('FrozenLake-v0')

# Policy search: draw n_policies random policies, score each one with
# evaluatePolicy, and report the best score found.
n_policies   = 1000
# The timed region covers both generating the policies and evaluating them.
startTime    = time.time()
policy_set   = [gen_random_policy() for _ in range(n_policies)]
policy_score = [evaluatePolicy(env, p) for p in policy_set]
endTime = time.time()
# policy_score holds one score per policy, so numpy.max picks the best one.
print("Best score = %0.2f. Time taken = %4.4f seconds" %(numpy.max(policy_score) ,
endTime - startTime)) 

Best score = 0.52. Time taken = 12.0044 seconds


Best score = 0.52. Time taken = 12.0044 seconds.

This generates a random set of 1000 candidate policies, evaluates each of them, and keeps the best one. The best policy score we get is 0.52 in 12.0044 seconds, which means that the agent's chance of reaching the goal is 52%. It is not even close to our goal. Random search does not work well
for complex problems where the search space is huge.

The goal of the Agent is to pick the best policy that will maximize the total rewards received from
the environment. 

### Value-Iteration method: select the policy that takes the best action in each state

In [12]:
def execute(env, policy, gamma=1.0, render=False):
    """
    Play one episode to termination and return its discounted return.

    Args:
      env: OpenAI gym env (must provide reset(), step(action) and render()).
      policy: indexable by state; int(policy[state]) is the action to take.
      gamma: Gamma discount factor; the reward at step t is weighted by gamma ** t.
      render: boolean to turn rendering on/off.

    Returns:
      Sum over steps of gamma ** t * reward_t.

    Note: there is no step limit in this function; the loop only ends when
    env.step reports done.
    """
    state = env.reset()
    discounted_return = 0
    step = 0
    done = False
    while not done:
        if render:
            env.render()
        state, reward, done, _info = env.step(int(policy[state]))
        discounted_return += gamma ** step * reward
        step += 1
    return discounted_return

In [13]:
# Average discounted return of a policy over n independent episodes.
def evaluatePolicy(env, policy, gamma=1.0, n=100):
    """
    Run `policy` for `n` episodes (rendering off) via execute() and return
    the mean of the resulting scores.
    """
    episode_scores = []
    for _ in range(n):
        episode_scores.append(execute(env, policy, gamma=gamma, render=False))
    return numpy.mean(episode_scores)

In [14]:
# Derive the greedy policy from a state-value function
def calculatePolicy(v, gamma=1.0):
    """
    Build the policy that is greedy with respect to the value function `v`.

    Uses the module-level `env`: env.env.P[s][a] is iterated as
    (probability, next state, reward, done) tuples, env.env.nS gives the
    number of states and env.action_space.n the number of actions.

    Args:
      v: state values, indexable by state.
      gamma: Gamma discount factor.

    Returns:
      Float array of length nS; entry s is the index of the best action in s.
    """
    model = env.env
    n_actions = env.action_space.n
    policy = numpy.zeros(model.nS)

    for state in range(model.nS):
        action_values = numpy.zeros(n_actions)
        for action in range(n_actions):
            # Bellman equation: expected reward plus discounted next-state value.
            action_values[action] = sum(
                prob * (reward + gamma * v[next_state])
                for prob, next_state, reward, _done in model.P[state][action]
            )
        policy[state] = numpy.argmax(action_values)

    return policy

In [15]:
# Value Iteration Algorithm
def valueIteration(env, gamma=1.0, max_iterations=10000, eps=1e-20):
    """
    Compute the optimal state-value function by value iteration.

    Args:
      env: wrapped env whose inner `env.env` exposes
        P[s][a] (iterated as (prob, next_state, reward, done) tuples),
        nS (number of states) and nA (number of actions).
      gamma: Gamma discount factor applied to the next-state value.
      max_iterations: upper bound on the number of sweeps.
      eps: stop once the summed absolute change of a sweep is <= eps.

    Returns:
      Array of length nS with the estimated value of each state. If the
      iteration limit is hit first, the last estimate is returned.
    """
    model = env.env
    value = numpy.zeros(model.nS)  # initialize value-function

    for i in range(max_iterations):
        prev_v = numpy.copy(value)

        for s in range(model.nS):
            # Bellman optimality backup. The next-state value must be
            # discounted by gamma; with gamma == 1.0 this is unchanged.
            q_sa = [
                sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in model.P[s][a])
                for a in range(model.nA)
            ]
            value[s] = max(q_sa)

        if numpy.sum(numpy.fabs(prev_v - value)) <= eps:
            print('Value-iteration converged at # %d.' % (i + 1))
            break

    return value

In [16]:
gamma = 1.0
env = gym.make("FrozenLake-v0")
# Start the clock before planning so the reported time includes value
# iteration itself, matching how the random-search and policy-iteration
# experiments in this notebook are timed.
startTime = time.time()
optimalValue = valueIteration(env, gamma)
policy = calculatePolicy(optimalValue, gamma)
# evaluatePolicy already returns the mean score over the n episodes.
policy_score = evaluatePolicy(env, policy, gamma, n=1000)
endTime = time.time()
print("Best score = %0.2f. Time taken = %4.4f seconds" % (policy_score, endTime - startTime))

Value-iteration converged at # 1373.
Best score = 0.76. Time taken = 0.5340 seconds


Output:

Value-iteration converged at # 1373.

Best score = 0.76. Time taken = 0.4184 seconds.

### Policy-Iteration method

The value-iteration algorithm keeps improving the value function at each iteration until the value function converges. Since the agent only cares about finding the optimal policy, the optimal policy will sometimes converge before the value function does. Therefore, we have another algorithm called policy-iteration. Instead of repeatedly improving the value-function estimate, it re-defines the policy at each step and computes the value according to this new policy, until the policy converges. Policy iteration is also guaranteed to converge to the optimal policy, and it often takes fewer iterations to converge than the value-iteration algorithm.

In [17]:
# Roll out one episode under a fixed policy
def execute(env, policy, gamma = 1.0, render = False):
    """
    Follow `policy` from env.reset() until the episode is done and return
    the discounted sum of rewards.

    Args:
      env: OpenAI gym env (must provide reset(), step(action) and render()).
      policy: indexable by state; int(policy[state]) is the action to take.
      gamma: Gamma discount factor; the reward at step t is weighted by gamma ** t.
      render: boolean to turn rendering on/off.

    Returns:
      Sum over steps of gamma ** t * reward_t.
    """
    observation = env.reset()
    episode_return = 0
    t = 0
    while True:
        if render:
            env.render()
        chosen_action = int(policy[observation])
        observation, reward, finished, _info = env.step(chosen_action)
        episode_return += (gamma ** t * reward)
        t += 1
        if finished:
            return episode_return

In [18]:
# Simulate a single episode and score it
def execute(env, policy, gamma = 1.0, render = False):
    """
    Run one episode with `policy` and return its discounted return.

    Args:
      env: OpenAI gym env (must provide reset(), step(action) and render()).
      policy: indexable by state; int(policy[state]) is the action to take.
      gamma: Gamma discount factor.
      render: boolean to turn rendering on/off.

    Returns:
      Sum of rewards, each weighted by gamma raised to its step index.
    """
    current_state = env.reset()
    accumulated = 0
    step_number = 0
    episode_over = False

    while not episode_over:
        if render:
            env.render()

        current_state, reward, episode_over, _info = env.step(int(policy[current_state]))
        weight = gamma ** step_number
        accumulated += (weight * reward)
        step_number += 1

    return accumulated

In [19]:
def evaluatePolicy(env, policy, gamma = 1.0, n = 100):
    """
    Estimate how good `policy` is: run execute() `n` times with rendering
    off and return the mean of the episode scores.
    """
    episode_scores = []
    for _ in range(n):
        episode_scores.append(execute(env, policy, gamma, False))
    return numpy.mean(episode_scores)

In [20]:
# Greedy policy extraction from a value-function
def extractPolicy(v, gamma = 1.0):
    """
    Return the policy that is greedy with respect to the value function `v`.

    Uses the module-level `env`: env.env.P[s][a] is iterated as
    (prob, next_state, reward, done) tuples, and env.env.nS / env.env.nA
    give the number of states / actions.

    Args:
      v: state values, indexable by state.
      gamma: Gamma discount factor.

    Returns:
      Float array of length nS; entry s is the index of the best action in s.
    """
    model = env.env

    def action_value(state, action):
        # Expected one-step reward plus discounted value of the successor.
        total = 0
        for prob, next_state, reward, _done in model.P[state][action]:
            total += prob * (reward + gamma * v[next_state])
        return total

    policy = numpy.zeros(model.nS)
    for state in range(model.nS):
        q_values = numpy.zeros(model.nA)
        for action in range(model.nA):
            q_values[action] = action_value(state, action)
        policy[state] = numpy.argmax(q_values)

    return policy

In [22]:
# Iteratively calculates the value-function under policy.
def CalcPolicyValue(env, policy, gamma=1.0):
    """
    Evaluate a fixed policy by repeated Bellman expectation backups.

    Args:
      env: wrapped env whose inner `env.env` exposes
        P[s][a] (iterated as (prob, next_state, reward, done) tuples)
        and nS (number of states).
      policy: indexable by state; policy[s] is the action taken in s. It may
        hold floats (e.g. an array created with numpy.zeros); each entry is
        cast to int before it is used as an action index.
      gamma: Gamma discount factor.

    Returns:
      Array of length nS with the value of each state under `policy`.

    Note: the loop has no iteration cap; it ends only when the summed
    absolute change of a sweep drops to eps or below.
    """
    model = env.env
    value = numpy.zeros(model.nS)
    eps = 1e-10

    while True:
        previousValue = numpy.copy(value)

        for state in range(model.nS):
            # Cast so float-valued policy arrays index P[s] safely, the same
            # way execute() converts the action before stepping the env.
            action = int(policy[state])
            value[state] = sum(
                p * (r + gamma * previousValue[s_])
                for p, s_, r, _ in model.P[state][action]
            )

        if numpy.sum(numpy.fabs(previousValue - value)) <= eps:
            # value converged
            break

    return value

In [23]:
# PolicyIteration algorithm
def policyIteration(env, gamma = 1.0):
    """
    Find a policy by alternating policy evaluation and greedy improvement.

    Starts from a random policy, then repeatedly evaluates it with
    CalcPolicyValue and replaces it with the greedy policy from
    extractPolicy, until the policy stops changing or maxIterations
    rounds have run.

    Args:
      env: wrapped env whose inner `env.env` exposes nA and nS.
      gamma: Gamma discount factor, passed through to both the evaluation
        and the improvement step.

    Returns:
      The last policy computed (indexable by state).
    """
    policy = numpy.random.choice(env.env.nA, size=(env.env.nS)) # initialize a random policy
    maxIterations = 1000
    # The caller's gamma is used as given; it is not reset to 1.0 here.

    for i in range(maxIterations):
        oldPolicyValue = CalcPolicyValue(env, policy, gamma)
        newPolicy = extractPolicy(oldPolicyValue, gamma)

        if (numpy.all(policy == newPolicy)):
            # Improvement left the policy unchanged, so it is stable.
            print ('Policy Iteration converged at %d.' %(i+1))
            break

        policy = newPolicy

    return policy

In [24]:
# Build the environment, run policy iteration and report score and timing.
env_name = 'FrozenLake-v0'
env = gym.make(env_name)
# The timed region covers both the planning step and the evaluation.
start = time.time()
optimalPolicy = policyIteration(env, gamma = 1.0)
scores = evaluatePolicy(env, optimalPolicy, gamma = 1.0)
end = time.time()
# NOTE(review): evaluatePolicy as defined in this notebook returns a single
# mean score, so numpy.max over it leaves the value unchanged.
print("Best score = %0.2f. Time taken = %4.4f seconds" %(numpy.max(scores) , end - start))

Policy Iteration converged at 4.
Best score = 0.64. Time taken = 0.1731 seconds


Output:

Policy-Iteration converged at step 5.

Best score = 0.64. Time taken = 0.1731 seconds

### Value-Iteration vs Policy-Iteration

Both value-iteration and policy-iteration algorithms can be used for offline planning, where the agent is assumed to have prior knowledge about the effects of its actions on the environment (they assume the MDP model is known). Compared with value-iteration, policy-iteration is often more computationally efficient overall: it usually takes considerably fewer iterations to converge, although each iteration is more computationally expensive.