In [3]:
import gym
import numpy as np
import tensorflow as tf

# State: 카트의 위치, 카트의 속도, 막대기의 각도, 막대기의 회전율
env = gym.make('CartPole-v0')
env.reset()

array([-0.03401052,  0.01207753, -0.03297462,  0.03503135])

In [4]:
# 사용할 뉴럴넷 구성하기
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__(self)
        self.num_actions = env.action_space.n
        self.hidden_layer_0 = tf.keras.layers.Dense(units=64, activation='relu')
        self.hidden_layer_1 = tf.keras.layers.Dense(units=64, activation='relu')
        self.neural_ans = tf.keras.layers.Dense(self.num_actions, activation = 'tanh')
        
    def call(self, inputs):
        x = tf.convert_to_tensor(inputs)
        hidden_0 = self.hidden_layer_0(x)
        hidden_1 = self.hidden_layer_1(hidden_0)
        neural_ans = self.neural_ans(hidden_1)

        return neural_ans

In [5]:
# 저장 창고
class Storage():
        def __init__(self):
            self.total_rewards=[]
            self.total_gradients=[]

        def save(self, reward, gradient):
            gradient = [values.numpy() for values in gradient]
            self.total_rewards.append(reward)
            self.total_gradients.append(gradient)

        def clear(self):
            self.total_rewards=[]
            self.total_gradients=[]

        # 이것이 동작하는 방식이 엄청 중요함!
        # 𝐺_𝑡를 계산하기 위한 것
        def discounted_rewards(self, discount_factor):
            #zeros_like는 그냥 같은 모양의 arry를 만들어 주는 것
            discounted_r = np.zeros_like(np.array(self.total_rewards))
            running_add = 0
            
            # 뒤로 뒤집어서 하나씩 곱해준 후 더해주는 것
            for t in reversed(range(0, len(np.array(self.total_rewards)))):
                # v(t) = reward(현재 시점) + discount_factor * v(t+1)
                running_add = np.array(self.total_rewards)[t] + running_add * discount_factor
                discounted_r[t] = running_add

            return np.array(discounted_r)

## 안녕

In [10]:
# Brain 불러오기
class Brain:
    def __init__(self, model, storage):
        self.model = model
        self.storage = storage
        self.episode = 0
        self.num_actions = env.action_space.n
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        # 모델을 선 build 하는 과정
        self.model.build((1,env.observation_space.shape[0]))
        self.optimizer = tf.optimizers.Adam(self.learning_rate)


    def cost_fn(self, action_list, probability):
        # 앞에서 -를 붙이는 이유는? : 이것은 아직 미분을 하기 전이기 때문!
        cost = tf.reduce_mean(-tf.reduce_sum(action_list*tf.math.log((probability))))
        return cost

    def action(self, state):
        # state inference
        probability = self.model([state])
        temp_action = tf.random.categorical(probability, 1)                
        action = tf.one_hot(temp_action, self.num_actions)
        probability = tf.nn.softmax(probability)
        return action, probability


    def initialize_grad_memory(self, grad_arry):
        for idx, grad in enumerate(grad_arry):
            grad_arry[idx] = 0


    def train(self, env):
        # 환경 리셋
        state = env.reset()
        
        # model의 trainable weights와 같은 수의 행렬 생성
        grad_arry = self.model.trainable_weights

        # 시작하기 전에 grads를 저장할 공간 확보
        self.initialize_grad_memory(grad_arry)
        update_every = 1
        running_reward = 0
        sum_rewards = 0

        while True:
            if self.episode > 1000:
                env.render()

            # Gradient를 계산하는 공간
            with tf.GradientTape() as tape:

                # 현재 state에서 할 action
                action_list, probability = self.action(state)
                next_state, reward, done, info = env.step(np.argmax(action_list))
                cost = self.cost_fn(action_list, probability)

            # cost에 대해서, model.trainable_weights로 미분한 값
            grads = tape.gradient(cost, self.model.trainable_weights)
            self.storage.save(reward, grads)
            state = next_state

            if done:
                self.episode += 1
                sum_rewards = np.sum(np.array(self.storage.total_rewards))
                running_reward = running_reward*0.99+sum_rewards*0.01

                saved_gradient = np.array(self.storage.total_gradients)
                saved_discounted_rewards = self.storage.discounted_rewards(self.discount_factor)

                # Gradinets들을 apply하기 위해서 만드는 작업
                # Gt = saved_discounted_rewards
                # saved_gradient = 각 위치에서의 w값
                for grads, r in zip(saved_gradient, saved_discounted_rewards):
                    for idx, grad in enumerate(grads):
                        grad_arry[idx] += grad * r
                        

                # 업데이트 시, 5개씩 모아서 업데이트를 취함
                self.optimizer.apply_gradients(zip(grad_arry, model.trainable_variables))
                self.initialize_grad_memory(grad_arry)

                print(f'run {self.episode} done with score {sum_rewards} and running mean {running_reward}')
                self.storage.clear()
                state = env.reset()
                
    

In [11]:
model = Model()
storage = Storage()
agent = Brain(model, storage)
model.summary()
agent.train(env)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              multiple                  320       
_________________________________________________________________
dense_7 (Dense)              multiple                  4160      
_________________________________________________________________
dense_8 (Dense)              multiple                  130       
Total params: 4,610
Trainable params: 4,610
Non-trainable params: 0
_________________________________________________________________


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

run 1 done with score 13.0 and running mean 0.13
run 2 done with score 29.0 and runnin

KeyboardInterrupt: 

In [29]:
0.5288473+0.47115272

1.00000002

In [30]:
0.5144196+0.48558033

0.99999993