### 개요

정책 기반 강화학습의 목표
$$
\text{maximize } J(\theta)
$$

정책신경망 업데이트
$$
\theta_{t+1}=\theta_t+\alpha \nabla_\theta J(\theta)
$$

가치함수로 나타내는 목표함수의 정의
$$
J(\theta) = v_{\pi_\theta}(s_0)
$$

목표함수의 미분
\begin{align}
\nabla_\theta J(\theta) & = \nabla_\theta v_{\pi_\theta} (s_0) \\
\nabla_\theta J(\theta) & = \sum_s d_{\pi_\theta} (s) \sum_a \nabla_\theta \pi_\theta (a | s) q_{\pi_\theta} (s, a) \\
\nabla_\theta J(\theta) & = \sum_s d_{\pi_\theta} (s) \sum_a \pi_\theta (a | s) \times \nabla_\theta \log \pi_\theta (a | s) q_{\pi_\theta} (s, a) \\
\nabla_\theta J(\theta) & = E_{\pi_\theta} [ \nabla_\theta \log \pi_\theta (a | s) q_{\pi_\theta} (s, a) ]
\end{align}

폴리시 그레디언트 업데이트식
\begin{align}
\theta_{t+1} & = \theta_t + \alpha \nabla_\theta J(\theta) \approx \theta_t + \alpha [ \nabla_\theta \log \pi_\theta (a | s) q_{\pi_\theta} (s, a) ] \\
\theta_{t+1} & \approx \theta_t + \alpha [ \nabla_\theta \log \pi_\theta (a|s) G_t ]
\end{align}

In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import mgym
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras import backend as K

In [None]:
# Number of episodes the training loop iterates over.
EPISODES = 2500

In [None]:
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent with a softmax policy.

    The agent buffers one episode of (state, one-hot action, reward) samples
    via `append_sample` and performs a single gradient step on the policy
    network when `train_model` is called.
    """

    def __init__(self, env, state_size=4, discount_factor=0.99,
                 learning_rate=0.001):
        """Build the policy network and its training function.

        Args:
            env: Environment object; only `env.action_space.n` is read here.
            state_size: Length of the flat state vector fed to the network.
            discount_factor: Discount applied per step when computing returns.
            learning_rate: Step size handed to the Adam optimizer.
        """
        self.action_size = env.action_space.n
        self.state_size = state_size
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate

        # The model must exist before the optimizer: the loss is built on
        # top of `self.model.output`.
        self.model = self._build_model()
        self.optimizer = self._build_optimizer()
        self.states, self.actions, self.rewards = [], [], []

    def _build_model(self):
        """Return the policy network: two ReLU layers and a softmax head."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.summary()
        return model

    def _build_optimizer(self):
        """Return a backend function that applies one policy-gradient step.

        The returned callable takes `[states, one_hot_actions, returns]` and
        minimizes `-sum(log pi(a|s) * return)` with Adam.
        """
        # Width follows the network's output size so the one-hot vectors
        # produced by `append_sample` always line up with `model.output`.
        action = K.placeholder(shape=[None, self.action_size])
        discounted_rewards = K.placeholder(shape=[None, ])

        # Multiplying by the one-hot mask selects the probability of the
        # action that was actually taken.
        action_prob = K.sum(action * self.model.output, axis=1)
        cross_entropy = K.log(action_prob) * discounted_rewards
        loss = -K.sum(cross_entropy)

        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function(
            [self.model.input, action, discounted_rewards], [], updates=updates)

        return train

    def get_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Batch of states passed to `model.predict`; only the first
                row of the prediction is used.

        Returns:
            Tuple `(action_index, policy)` where `policy` is the predicted
            probability vector the action was drawn from.
        """
        policy = self.model.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0], policy

    def discount_rewards(self, rewards):
        """Return the discounted return G_t for every step of an episode.

        Computes `G_t = r_t + discount_factor * G_{t+1}` by scanning the
        rewards backwards.

        Args:
            rewards: Sequence of per-step rewards in chronological order.

        Returns:
            Float64 array with the same length as `rewards`.
        """
        # Explicit float dtype: integer rewards must not truncate the
        # fractional discounted sums.
        discounted_rewards = np.zeros(len(rewards), dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def append_sample(self, state, action, reward):
        """Record one transition of the current episode.

        Args:
            state: Batched state; its first row is stored.
            action: Index of the action taken, stored as a one-hot vector.
            reward: Reward received for the action.
        """
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)

    def train_model(self):
        """Run one policy update on the buffered episode and clear the buffer.

        Returns are mean-centered and, when they are not all equal, scaled to
        unit standard deviation before being passed to the training function.
        """
        if not self.rewards:
            # Nothing was collected; a mean/std over an empty array is NaN.
            return

        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        # A one-step episode (or constant returns) has zero spread; dividing
        # would turn every return into NaN and poison the weights.
        std = np.std(discounted_rewards)
        if std > 0:
            discounted_rewards /= std

        states = np.array(self.states)
        actions = np.array(self.actions)
        self.optimizer([states, actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

In [None]:
# Build the environment and the agent that will be trained on it.
env = mgym.make('5x5moving')
agent = REINFORCEAgent(env)

# Running totals kept across all episodes.
scores, episodes = [], []
global_step = 0

for e in range(EPISODES):
    score = 0
    # The network expects a batch dimension, so states are kept as (1, n).
    state = np.reshape(env.reset(), [1, -1])

    # Play one full episode, buffering every transition in the agent.
    while True:
        global_step += 1

        action, policy = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, -1])
        agent.append_sample(state, action, reward)

        score += reward
        state = copy.deepcopy(next_state)

        if done:
            break

    # Episode finished: update the policy once, then log the result.
    agent.train_model()

    scores.append(score)
    episodes.append(e)
    score = round(score, 2)
    print(e, score, global_step)
