In [None]:
from env import GridWorld
import numpy as np

def solve_linear_system(env):
    """
    선형 연립방정식을 풀어서 가치함수 구하기
    (I - γP)v = r
    """
    states = [s for s in env.get_states() if not env.is_terminal(s)]
    n_states = len(states)
    state_to_idx = {s: i for i, s in enumerate(states)}
    
    # P 행렬과 r 벡터 구성
    P = np.zeros((n_states, n_states))
    r = np.zeros(n_states)
    
    for i, state in enumerate(states):
        # Equiprobable policy: π(a|s) = 0.25
        for action in env.actions:
            next_state = env.get_next_state(state, action)
            reward = env.get_reward(state, action)
            
            prob = 0.25  # equiprobable
            
            # r 벡터: 즉시 보상의 기댓값
            r[i] += prob * reward
            
            # P 행렬: 전이 확률
            if not env.is_terminal(next_state):
                j = state_to_idx[next_state]
                P[i, j] += prob
    
    # 선형 시스템 풀기: (I - γP)v = r
    I = np.eye(n_states)
    A = I - env.gamma * P
    v_array = np.linalg.solve(A, r)
    
    # Dictionary로 변환
    V = {}
    for i, state in enumerate(states):
        V[state] = v_array[i]
    
    # Terminal states
    for terminal in env.terminals:
        V[terminal] = 0.0
    
    return V


# 실행
env = GridWorld()

V_linear = solve_linear_system(env)

print("\n=== Linear System Method (Grid View) ===\n")
for r in range(env.rows - 1, -1, -1):
    row_str = ""
    for c in range(env.cols):
        state = (r, c)
        if state == env.wall:
            row_str += " [WALL]  "
        elif state in env.terminals:
            row_str += "  0.000  "
        else:
            row_str += f"{V_linear[state]:7.3f} "
    print(row_str)


=== Linear System Method (Grid View) ===

 -0.761  -0.551  -0.142   0.000  
 -0.865  [WALL]   -0.715   0.000  
 -0.909  -0.913  -0.877  -0.950 


In [2]:
def policy_evaluation(env, theta=0.0001, max_iterations=1000):
    """
    Equiprobable policy (π(a|s) = 0.25)에 대한 Policy Evaluation
    """
    # Value function 초기화
    V = {}
    for state in env.get_states():
        V[state] = 0.0
    
    # Iterative Policy Evaluation
    for iteration in range(max_iterations):
        delta = 0
        
        # 모든 상태에 대해 업데이트
        for state in env.get_states():
            if env.is_terminal(state):
                continue  # 터미널 상태는 V=0 유지
            
            v = V[state]
            
            # Bellman equation for equiprobable policy
            new_v = 0
            for action in env.actions:
                next_state = env.get_next_state(state, action)
                reward = env.get_reward(state, action)
                
                # π(a|s) = 0.25 (equiprobable)
                prob = 0.25
                new_v += prob * (reward + env.gamma * V[next_state])
            
            V[state] = new_v
            delta = max(delta, abs(v - new_v))
        
        # 수렴 체크
        if delta < theta:
            print(f"수렴 완료! (iteration: {iteration + 1})")
            break
    
    return V

env = GridWorld()
V = policy_evaluation(env)

# Grid 형태로 시각화
print("\n=== Value Function (Grid View) ===\n")
for r in range(env.rows - 1, -1, -1):  # 2, 1, 0 순서 (상하반전)
    row_str = ""
    for c in range(env.cols):
        state = (r, c)
        if state == env.wall:
            row_str += " [WALL]  "
        elif state in env.terminals:
            row_str += "  0.000  "
        else:
            row_str += f"{V[state]:7.3f} "
    print(row_str)

수렴 완료! (iteration: 41)

=== Value Function (Grid View) ===

 -0.761  -0.551  -0.142   0.000  
 -0.865  [WALL]   -0.715   0.000  
 -0.909  -0.912  -0.877  -0.950 


In [3]:
def compute_q_values(env, V):
    """
    V값으로부터 Q값 계산
    Q(s,a) = r(s,a) + γ V(s')
    """
    Q = {}
    
    for state in env.get_states():
        if env.is_terminal(state):
            # Terminal state는 Q값이 0
            for action in env.actions:
                Q[(state, action)] = 0.0
            continue
        
        for action in env.actions:
            next_state = env.get_next_state(state, action)
            reward = env.get_reward(state, action)
            
            # Q(s,a) = r + γ V(s')
            Q[(state, action)] = reward + env.gamma * V[next_state]
    
    return Q


def print_q_values(env, Q, state):
    """
    특정 상태의 Q값들 출력
    """
    print(f"\n=== Q-values for state {state} ===")
    for action in env.actions:
        print(f"Q({state}, {action:>4s}) = {Q[(state, action)]:7.2f}")

# 실행
env = GridWorld()

# V값 계산 (iterative 또는 linear system 중 하나 선택)
V = policy_evaluation(env)

# Q값 계산
Q = compute_q_values(env, V)

# 특정 상태의 Q값 확인
print_q_values(env, Q, (0, 0))  # 시작 위치
print_q_values(env, Q, (0, 1))  # (0,1)
print_q_values(env, Q, (0, 2))  # (0,1)
print_q_values(env, Q, (0, 3))  # 오른쪽 위

print_q_values(env, Q, (1, 0))  # 시작 위치
print_q_values(env, Q, (1, 2))  # (0,1)
print_q_values(env, Q, (1, 3))  # 오른쪽 위

print_q_values(env, Q, (2, 0))  # 시작 위치
print_q_values(env, Q, (2, 1))  # (0,1)
print_q_values(env, Q, (2, 2))  # (0,1)
print_q_values(env, Q, (2, 3))  # 오른쪽 위



수렴 완료! (iteration: 41)

=== Q-values for state (0, 0) ===
Q((0, 0),    U) =   -0.88
Q((0, 0),    D) =   -0.92
Q((0, 0),    L) =   -0.92
Q((0, 0),    R) =   -0.92

=== Q-values for state (0, 1) ===
Q((0, 1),    U) =   -0.92
Q((0, 1),    D) =   -0.92
Q((0, 1),    L) =   -0.92
Q((0, 1),    R) =   -0.89

=== Q-values for state (0, 2) ===
Q((0, 2),    U) =   -0.74
Q((0, 2),    D) =   -0.89
Q((0, 2),    L) =   -0.92
Q((0, 2),    R) =   -0.95

=== Q-values for state (0, 3) ===
Q((0, 3),    U) =   -1.00
Q((0, 3),    D) =   -0.95
Q((0, 3),    L) =   -0.89
Q((0, 3),    R) =   -0.95

=== Q-values for state (1, 0) ===
Q((1, 0),    U) =   -0.78
Q((1, 0),    D) =   -0.92
Q((1, 0),    L) =   -0.88
Q((1, 0),    R) =   -0.88

=== Q-values for state (1, 2) ===
Q((1, 2),    U) =   -0.23
Q((1, 2),    D) =   -0.89
Q((1, 2),    L) =   -0.74
Q((1, 2),    R) =   -1.00

=== Q-values for state (1, 3) ===
Q((1, 3),    U) =    0.00
Q((1, 3),    D) =    0.00
Q((1, 3),    L) =    0.00
Q((1, 3),    R) =    0.00

===

In [4]:
0.25 * (
    (-0.1 + 0.9*(-0.761))
  + (-0.1 + 0.9*(-0.909))
  + (-0.1 + 0.9*(-0.865))
  + (-0.1 + 0.9*(-0.865))
)

-0.865

In [5]:
((-0.78)+(-0.78)+(-0.88)+(-0.60))/4

-0.76

In [6]:
-0.1 + 0.9*(-0.76)

-0.784

In [7]:
np.average([0.23,-0.60,-1.00,-0.74])


-0.5275000000000001