# **Block Discount World**

5 States = {a, b, c, d, e}<br>
3 Actions = {Left, Right, Exit}<br>
Exit available only in a & e.<br>
Exit from a yields reward of 10<br>
Exit from e yields reward of 1

### Parts 1 & 2:
*   Calculate the optimal policy for two cases, both with deterministic transitions: 𝛾 = 1 and 𝛾 = 0.1
*   Calculate the value of the sequence of rewards from each of the states under the optimum policy for both previous cases.



In [None]:
import numpy as np

In [1]:
# Defining our world
class BlockDiscountWorld:
  """Row of five states (a..e) with left/right moves and an 'exit' action.

  Exiting from 'a' is worth 10 and exiting from 'e' is worth 1; every other
  (state, action) pair has reward 0.
  """

  def __init__(self):
    self.states = ['a', 'b', 'c', 'd', 'e']
    self.actions = ['left', 'right', 'exit']
    # Reward collected when the 'exit' action is taken in the given state.
    self.rewards = {'a': 10, 'e': 1}

    # 'left'/'right' shift one position along the row and are clamped at the
    # two ends; 'exit' never has a successor state.
    last = len(self.states) - 1
    self.transitions = {}
    for idx, name in enumerate(self.states):
      self.transitions[name] = {
          'left': self.states[max(idx - 1, 0)],
          'right': self.states[min(idx + 1, last)],
          'exit': None,
      }

  def terminal_state(self, state, action):
    """Return True when ``action`` is 'exit'; ``state`` is not consulted."""
    is_exit = action == 'exit'
    return is_exit

  def get_next_state(self, state, action):
    """Return the successor of ``state`` under ``action`` (None for 'exit')."""
    row = self.transitions[state]
    return row[action]

  def get_reward(self, state, action):
    """Return the exit reward of ``state`` for 'exit', otherwise 0."""
    if action != 'exit':
      return 0
    return self.rewards.get(state, 0)

In [7]:
# Function to calculate the optimal policy
def optimal_policy(gamma, world, iterations = 1000):
  """Run value iteration on ``world`` and return ``(policy, value)``.

  Args:
    gamma: Discount factor applied to the value of the successor state.
    world: Object exposing ``states``, ``actions``, ``rewards`` (mapping of
      exit state -> exit reward), ``get_next_state(state, action)`` and
      ``get_reward(state, action)``.
    iterations: Maximum number of sweeps. The loop stops earlier once a
      sweep leaves every state value unchanged.

  Returns:
    A ``(policy, value)`` tuple of dicts keyed by state. States listed in
    ``world.rewards`` always map to ``'exit'`` and to their exit reward.
    A non-exit state with no usable action maps to ``None`` and value 0.
  """
  value = {state: 0 for state in world.states}
  policy = {state: None for state in world.states}

  for _ in range(iterations):
    new_value = value.copy()
    for state in world.states:
      if state in world.rewards:
        # Exit states: the only sensible action is to collect the reward.
        new_value[state] = world.rewards[state]
        policy[state] = 'exit'
        continue

      # Start from -inf (not 0) so that zero or negative action values are
      # still compared properly and a best action is always selected.
      best_val = float('-inf')
      best_action = None

      for action in world.actions:
        next_state = world.get_next_state(state, action)
        if next_state is None:
          # An action without a successor ends the episode. Only the exit
          # states handled above may do that, so it is not available here.
          continue

        candidate = world.get_reward(state, action) + gamma * value[next_state]
        if candidate > best_val:
          best_val = candidate
          best_action = action

      new_value[state] = best_val if best_action is not None else 0
      policy[state] = best_action

    if new_value == value:
      # Fixed point reached: further sweeps would reproduce the same values
      # and the same policy.
      break
    value = new_value

  # Also covers iterations == 0, where the sweep above never ran.
  for state in world.states:
    if state in world.rewards:
      policy[state] = 'exit'
      value[state] = world.rewards[state]

  return policy, value

In [8]:
# Build the world once and solve it for both discount factors.
world = BlockDiscountWorld()
g1, g2 = 1, 0.1

(g1_policy, g1_value), (g2_policy, g2_value) = [
    optimal_policy(g, world) for g in (g1, g2)
]

# Part 1 (optimal policies) followed by Part 2 (state values), printed in
# that order for gamma = 1 and then gamma = 0.1.
report = (
    ("Optimal Policy when gamma = 1: ", g1_policy),
    ("Optimal Policy when gamma = 0.1: ", g2_policy),
    ("Rewards value when gamma = 1: ", g1_value),
    ("Rewards value when gamma = 0.1: ", g2_value),
)
for heading, table in report:
  print(heading, table)

Optimal Policy when gamma = 1:  {'a': 'exit', 'b': 'left', 'c': 'left', 'd': 'left', 'e': 'exit'}
Optimal Policy when gamma = 0.1:  {'a': 'exit', 'b': 'left', 'c': 'left', 'd': 'right', 'e': 'exit'}
Rewards value when gamma = 1:  {'a': 10, 'b': 10, 'c': 10, 'd': 10, 'e': 1}
Rewards value when gamma = 0.1:  {'a': 10, 'b': 1.0, 'c': 0.1, 'd': 0.1, 'e': 1}


# Part 3
For which 𝛾 are West (Left) and East (Right) equally good when in state d?

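Let $x = \gamma$. Going West from d takes three moves to reach a before exiting (reward 10, discounted by $x^3$); going East takes one move to reach e before exiting (reward 1, discounted by $x$). Setting the two returns equal, with $x > 0$:<br>
$10x^3 = 1 \cdot x$<br>
$10x^3 = x$<br>
$10x^3 / x = 1$<br>
$10x^2 = 1$<br>
$x^2 = 1/10$<br>
$x = 1/\sqrt{10} \approx 0.316228$<br>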

In state d, West and East are equally good when 𝛾 = 1/√10 ≈ 0.316228

