# Dynamic Programming
---
> 동적계획법 실습

렁니
1. Policy Iteration
2. Value Iteration

### Policy Iteration
---

* d

In [1]:
import numpy as np

In [694]:
class Policy_Iteration:
    
    def __init__(self):
        self.n_map = 4
        self.n_action = 4
        self.gamma = 0.9
        self.grid_world = np.array([0,0,0,0,
                     0,-1,0,-1,
                     0,0,0,-1,
                    -1,0,0,1]).reshape(n_map,-1)
        self.value = np.zeros_like(self.grid_world)
        self.policy = np.ones([self.n_action, self.n_map, self.n_map]) / self.n_action
    
    
    '''
    Evaluation
    '''
    def Evaluation(self):
        v_tmp = np.zeros_like(self.grid_world, dtype=np.float16)
        for row in range(n_map):
            for col in range(n_map):
                for a in range(n_action):
                    reward = self.get_reward(a, row, col)
                    next_state = self.get_state(a, row, col)

                    # P.I 확률계산식 - bellman expected equation.
                    v_tmp[row, col] += self.policy[a, row, col] * (reward + self.gamma * self.value[next_state[0], next_state[1]])
        
        v_tmp[3,3] = 0
        self.value = v_tmp
    
    
    '''
    Imporvement
    '''
    def Improvement(self):
        for row in range(n_map):
            for col in range(n_map):
                next_p = np.zeros([4], dtype=np.float16)
                cnt = 1
                max_tmp = -9999
                idx = []
                
                for a in range(n_action):
                    next_state = self.get_state(a, row, col)
                    tmp = self.get_reward(a, row, col) + self.gamma * self.value[next_state[0], next_state[1]]
                    
                    if(tmp > max_tmp):
                        max_tmp = tmp
                        idx.clear()
                        cnt = 1
                        idx.append(a)
                        
                    elif(tmp == max_tmp):
                        cnt += 1
                        idx.append(a)
                        
                prob = 1.0 / cnt
                
                for i in idx:
                    next_p[i] = prob
                    
                self.policy[ : , row, col] = next_p
    
    
    '''
    R(s,a)
    P(s'|s,a)
    
    0  1  2  3
    상 하 좌 우
    '''
    def get_reward(self, a, r, c):
        
        if(a == 0 and r > 0):
            r -= 1

        elif(a == 1 and r < 3):
            r += 1

        elif(a == 2 and c > 0):
            c -= 1

        elif(a == 3 and c < 3):
            c += 1

        # 맵 넘어가면 부적 보상
        else:
            return -100

        reward = self.grid_world[r, c]

        return reward

    
    '''
    next state
    '''
    def get_state(self, a, r, c):
        
        if(a == 0 and r > 0):
            r -= 1

        elif(a == 1 and r < 3):
            r += 1

        elif(a == 2 and c > 0):
            c -= 1

        elif(a == 3 and c < 3):
            c += 1
            
        return r, c
    
    def get_policy(self):
        policy = np.zeros_like(self.grid_world)
        
        for row in range(self.n_map):
            for col in range(self.n_map):
                policy[row, col] = np.argmax(self.policy[ : , row, col])
                
        return policy

In [695]:
p_i = Policy_Iteration()

In [709]:
for i in range(10):
    # GPI(Generalized Policy Improvement) - 1 Eval --> 1 Improve
    p_i.Evaluation()
    p_i.Improvement()

print(p_i.grid_world)
print(p_i.value)
print(p_i.policy)
print(p_i.get_policy())

[[ 0  0  0  0]
 [ 0 -1  0 -1]
 [ 0  0  0 -1]
 [-1  0  0  1]]
[[0.591  0.6562 0.729  0.6562]
 [0.6562 0.729  0.81   0.729 ]
 [0.729  0.81   0.9    1.    ]
 [0.81   0.9    1.     0.    ]]
[[[0.  0.  0.  0. ]
  [0.  0.  0.  0. ]
  [0.  0.  0.  0. ]
  [0.  0.  0.  0. ]]

 [[0.5 0.  1.  0. ]
  [1.  0.5 1.  0. ]
  [0.  0.5 1.  1. ]
  [0.  0.  0.  0. ]]

 [[0.  0.  0.  1. ]
  [0.  0.  0.  1. ]
  [0.  0.  0.  0. ]
  [0.  0.  0.  1. ]]

 [[0.5 1.  0.  0. ]
  [0.  0.5 0.  0. ]
  [1.  0.5 0.  0. ]
  [1.  1.  1.  0. ]]]
[[1 3 1 2]
 [1 1 1 2]
 [3 1 1 1]
 [3 3 3 2]]


### Value Iteration
---

In [713]:
class Value_Iteration:
    
    def __init__(self):
        self.n_map = 4
        self.n_action = 4
        self.gamma = 0.9
        self.grid_world = np.array([0,0,0,0,
                     0,-1,0,-1,
                     0,0,0,-1,
                    -1,0,0,1]).reshape(n_map,-1)
        self.value = np.zeros_like(self.grid_world)
    
    
    '''
    Evaluation
    '''
    def Evaluation(self):
        v_tmp = np.zeros_like(self.grid_world, dtype=np.float16)
        for row in range(self.n_map):
            for col in range(self.n_map):
                
                max_val = -9999
                for a in range(self.n_action):
                    reward = self.get_reward(a, row, col)
                    
                    # V.I 계산식 - bellman optimality equation.
                    next_state = self.get_state(a, row, col)
                    max_tmp = reward + self.gamma * self.value[next_state[0], next_state[1]]
                    if(max_tmp > max_val):
                        max_val = max_tmp
                        
                v_tmp[row, col] = max_val
        
        v_tmp[3,3] = 0
        self.value = v_tmp
        

    '''
    R(s,a)
    P(s'|s,a)
    
    0  1  2  3
    상 하 좌 우
    '''
    def get_reward(self, a, r, c):
        
        if(a == 0 and r > 0):
            r -= 1

        elif(a == 1 and r < 3):
            r += 1

        elif(a == 2 and c > 0):
            c -= 1

        elif(a == 3 and c < 3):
            c += 1

        # 맵 넘어가면 부적 보상
        else:
            return -100

        reward = self.grid_world[r, c]

        return reward

    
    '''
    next state
    '''
    def get_state(self, a, r, c):
        
        if(a == 0 and r > 0):
            r -= 1

        elif(a == 1 and r < 3):
            r += 1

        elif(a == 2 and c > 0):
            c -= 1

        elif(a == 3 and c < 3):
            c += 1
            
        return r, c

In [714]:
v_i = Value_Iteration()

In [722]:
for i in range(10):
    v_i.Evaluation()

print(v_i.grid_world)
print(v_i.value)

[[ 0  0  0  0]
 [ 0 -1  0 -1]
 [ 0  0  0 -1]
 [-1  0  0  1]]
[[0.591  0.6562 0.729  0.6562]
 [0.6562 0.729  0.81   0.729 ]
 [0.729  0.81   0.9    1.    ]
 [0.81   0.9    1.     0.    ]]
