# 가치 반복법

In [4]:
import numpy as np
from Grid import Grid

In [3]:
ACTIONS = ('U', 'D', 'L', 'R')
DELTA_THRESHOLD = 1e-3
GAMMA = 0.9

In [29]:
def print_values(V, grid):
    for i in range(grid.rows):
        print('----------------------------')
        for k in range(grid.cols):
            value = V.get((i,k), 0)
            if value >= 0:
                print(f' {value:.2f} |', sep='', end='')
            else:
                print(f'{value:.2f} |', sep='', end='')
        print()

In [33]:
def print_policy(P, grid):
    for i in range(grid.rows):
        print('----------------------------')
        for k in range(grid.cols):
            action = P.get((i,k), ' ')
            print(f'   {action}  |', sep='', end='')
        print()

### 선택 가능한 행동 정의

In [5]:
grid = Grid(3, 4, (2, 0))   # rows, cols, start
rewards = {(0,3): 1, (1,3): -1}
actions = {
    (0,0): ('D','R'),
    (0,1): ('L','R'),
    (0,2): ('L','D','R'),
    (1,0): ('U','D'),
    (1,2): ('U','D','R'),
    (2,0): ('U','R'),
    (2,1): ('L','R'),
    (2,2): ('L','U','R'),
    (2,3): ('L','U')
}
grid.set(rewards, actions)

In [31]:
# 보상을 출력
print_values(grid.rewards, grid)

----------------------------
 0.00 | 0.00 | 0.00 | 1.00 |
----------------------------
 0.00 | 0.00 | 0.00 |-1.00 |
----------------------------
 0.00 | 0.00 | 0.00 | 0.00 |


In [34]:
# 정책 초기화
# 초기 정책은 각 상태에서 선택 가능한 행동을 무작위로 선택
policy = {}
for s in grid.actions.keys():
    policy[s] = np.random.choice(ACTIONS)
print_policy(policy, grid)

----------------------------
   R  |   L  |   L  |      |
----------------------------
   L  |      |   D  |      |
----------------------------
   L  |   D  |   L  |   U  |


In [37]:
# 가치 함수 초기화
V = {}
states = grid.all_states()
for s in states:
    if s in grid.actions:
        V[s] = np.random.random()
    else:
        V[s] = 0    # 종단 상태

print_values(V, grid)

----------------------------
 0.10 | 0.11 | 0.34 | 0.00 |
----------------------------
 0.90 | 0.00 | 0.24 | 0.00 |
----------------------------
 0.67 | 0.75 | 0.84 | 0.36 |


In [38]:
# 수렴할 때까지 반복
i = 0
while True:
    maxChange = 0
    for s in states:
        oldValue = V[s]

        # 종단 상태가 아닌 상태에 대해서만 V(s)를 계산
        if s in policy:
            newValue = float('-inf')
            for a in ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                # 벨만 방정식 계산
                v = r + GAMMA * V[grid.current_state()]
                if v > newValue:
                    newValue = v
            V[s] = newValue
            maxChange = max(maxChange, np.abs(oldValue - V[s]))

    print("\n%i  번째 반복" % i, end = "\n")
    print_values(V, grid)
    i += 1 

    if maxChange < DELTA_THRESHOLD:
        break


0  번째 반복
----------------------------
 0.81 | 0.31 | 1.00 | 0.00 |
----------------------------
 0.81 | 0.00 | 0.76 | 0.00 |
----------------------------
 0.81 | 0.76 | 0.76 | 0.76 |

1  번째 반복
----------------------------
 0.81 | 0.90 | 1.00 | 0.00 |
----------------------------
 0.73 | 0.00 | 0.90 | 0.00 |
----------------------------
 0.73 | 0.73 | 0.81 | 0.68 |

2  번째 반복
----------------------------
 0.81 | 0.90 | 1.00 | 0.00 |
----------------------------
 0.73 | 0.00 | 0.90 | 0.00 |
----------------------------
 0.66 | 0.73 | 0.81 | 0.73 |

3  번째 반복
----------------------------
 0.81 | 0.90 | 1.00 | 0.00 |
----------------------------
 0.73 | 0.00 | 0.90 | 0.00 |
----------------------------
 0.66 | 0.73 | 0.81 | 0.73 |

4  번째 반복
----------------------------
 0.81 | 0.90 | 1.00 | 0.00 |
----------------------------
 0.73 | 0.00 | 0.90 | 0.00 |
----------------------------
 0.66 | 0.73 | 0.81 | 0.73 |


In [39]:
# 최적 가치 함수를 찾는 정책을 도출
for s in policy.keys():
    bestAction = None
    bestValue = float('-inf')
    # 가능한 모든 행동에 대해 반복
    for a in ACTIONS:
        grid.set_state(s)
        r = grid.move(a)
        v = r + GAMMA * V[grid.current_state()]
        if v > bestValue:
            bestValue = v
            bestAction = a
    policy[s] = bestAction

# 계산된 가치 함수와 정책을 출력
print("가치 함수: ")
print_values(V, grid)

print("\n정책: ")
print_policy(policy, grid)

가치 함수: 
----------------------------
 0.81 | 0.90 | 1.00 | 0.00 |
----------------------------
 0.73 | 0.00 | 0.90 | 0.00 |
----------------------------
 0.66 | 0.73 | 0.81 | 0.73 |

정책: 
----------------------------
   R  |   R  |   R  |      |
----------------------------
   U  |      |   U  |      |
----------------------------
   U  |   R  |   U  |   L  |
