# 정책기반 강화학습 

In [1]:
# Conditional-update demo: the score is left untouched when it already
# equals 200 and is raised by 100 otherwise.
score = 200
if score != 200:
    score = score + 100

print(score)

200


In [2]:
import numpy as np
# np.vstack treats every scalar as its own row, so a flat list of four
# numbers is stacked into a 4x1 column.
a = list(range(1, 5))
b = np.vstack(a)
print(b)

[[1]
 [2]
 [3]
 [4]]


In [6]:
import gym
import matplotlib.pyplot as plt
%matplotlib inline
# Smoke-test the CartPole environment with a short random rollout.
env = gym.make('CartPole-v1')
# Rendering is left disabled here; with the classic gym API a frame could be
# displayed via plt.imshow(env.render(mode='rgb_array')).
env.reset()
for _ in range(10):
    # Index 2 of the step result is the episode-termination flag in both the
    # 4-tuple and the 5-tuple gym step APIs.
    episode_over = env.step(env.action_space.sample())[2]
    if episode_over:
        # Stepping an environment whose episode has ended is not supported by
        # gym, so start a fresh episode before sampling further actions.
        env.reset()

env.close()

# Re-create the environment to inspect its observation and action spaces.
env = gym.make('CartPole-v1')
print(env.observation_space.shape[0])
print(env.action_space)

4
Discrete(2)


In [7]:
import gym
import numpy as np
import plotly.express as px
import numpy as np
import tensorflow as tf

import random
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [8]:
# Custom policy network
class REINFORCE(tf.keras.models.Model):
    """Policy network: two 24-unit ReLU layers followed by a softmax head.

    The output layer has ``action_size`` units, so every output row is a
    probability distribution over the actions.
    """

    def __init__(self, action_size, **kwargs):
        super().__init__(**kwargs)
        self.fc1 = Dense(24, activation='relu')
        self.fc2 = Dense(24, activation='relu')
        # Softmax turns the final activations into action probabilities.
        self.fc_out = Dense(action_size, activation='softmax')

    def call(self, x):
        """Run the forward pass and return the softmax output for ``x``."""
        hidden = self.fc2(self.fc1(x))
        return self.fc_out(hidden)

In [9]:
class REINFORCEAgent:
    """REINFORCE (Monte-Carlo policy gradient) agent for discrete actions.

    Transitions of the current episode are buffered with ``append_sample``
    and consumed by ``train_model``, which performs a single gradient step
    and then clears the buffers.
    """

    def __init__(self, state_size, action_size):
        # Sizes of the state vector and of the discrete action set
        self.state_size = state_size
        self.action_size = action_size

        # REINFORCE hyperparameters
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        # Policy network built from the custom REINFORCE model class
        self.model = REINFORCE(self.action_size)
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        self.optimizer = Adam(learning_rate=self.learning_rate)
        # Per-episode buffers filled by append_sample()
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        """Sample an action index from the policy output for ``state``.

        ``state`` is passed to the model unchanged and only the first row of
        the output is used, so it must carry a leading batch dimension.
        """
        policy = np.array(self.model(state)[0])  # action probabilities
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def discount_rewards(self, rewards):
        """Return the discounted return of every step of one episode.

        Computed backwards as ``G_t = rewards[t] + discount_factor * G_{t+1}``.
        The result is always float64, so integer rewards are not truncated
        when the discounted values are stored.
        """
        discounted_rewards = np.zeros_like(rewards, dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def append_sample(self, state, action, reward):
        """Buffer one transition of the current episode.

        ``state[0]`` is stored (dropping the leading batch dimension) and the
        action is stored as a one-hot vector of length ``action_size``.
        """
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1  # one-hot encoding of the chosen action
        self.actions.append(act)

    def train_model(self):
        """Run one policy-gradient update on the buffered episode, then clear it.

        Does nothing when no transition has been buffered.
        """
        if not self.rewards:
            return

        # Standardise the returns (zero mean, unit variance).
        discounted_rewards = np.asarray(
            self.discount_rewards(self.rewards), dtype=np.float32
        )
        discounted_rewards -= np.mean(discounted_rewards)
        std = np.std(discounted_rewards)
        if std > 0:
            # A zero std (e.g. a one-step episode) would otherwise produce
            # NaN returns and NaN gradients.
            discounted_rewards /= std

        states = np.array(self.states)
        actions = np.array(self.actions)
        # Trainable variables are watched by the tape automatically.
        with tf.GradientTape() as tape:
            policies = self.model(states)
            # Probability the policy assigned to each action actually taken
            action_prob = tf.reduce_sum(actions * policies, axis=1)
            # The small constant keeps log() finite for zero probabilities.
            cross_entropy = -tf.math.log(action_prob + 1e-5)
            loss = tf.reduce_sum(cross_entropy * discounted_rewards)

        # Read the variables after the forward pass so they exist even when
        # the model has not been called before.
        model_params = self.model.trainable_variables
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        self.states, self.actions, self.rewards = [], [], []

# Train
- 매 episode가 끝날 때마다 모수(파라미터)를 한 번 업데이트

In [10]:
if __name__ == "__main__":
    # Build the environment and size the agent from its spaces.
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = REINFORCEAgent(state_size, action_size)

    EPISODES = 1000
    scores, episodes = [], []
    episode_score = []

    for e in range(EPISODES):
        score = 0
        done = False

        # States are kept with a leading batch dimension: (1, state_size).
        state = np.reshape(env.reset(), [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Buffer the transition taken from the current state.
            agent.append_sample(state, action, reward)
            score += reward

            state = next_state

        # The loop only exits once the episode is done: perform one policy
        # update per episode and log the episode's score.
        agent.train_model()

        print("episode:", e, "score:", score)
        episode_score.append([e, score])
        scores.append(score)
        episodes.append(e)

episode: 0 score: 15.0
episode: 1 score: 20.0
episode: 2 score: 14.0
episode: 3 score: 15.0
episode: 4 score: 17.0
episode: 5 score: 22.0
episode: 6 score: 12.0
episode: 7 score: 26.0
episode: 8 score: 34.0
episode: 9 score: 30.0
episode: 10 score: 13.0
episode: 11 score: 33.0
episode: 12 score: 11.0
episode: 13 score: 14.0
episode: 14 score: 34.0
episode: 15 score: 20.0
episode: 16 score: 10.0
episode: 17 score: 18.0
episode: 18 score: 22.0
episode: 19 score: 42.0
episode: 20 score: 14.0
episode: 21 score: 11.0
episode: 22 score: 21.0
episode: 23 score: 14.0
episode: 24 score: 15.0
episode: 25 score: 22.0
episode: 26 score: 18.0
episode: 27 score: 15.0
episode: 28 score: 23.0
episode: 29 score: 25.0
episode: 30 score: 40.0
episode: 31 score: 22.0
episode: 32 score: 17.0
episode: 33 score: 31.0
episode: 34 score: 32.0
episode: 35 score: 17.0
episode: 36 score: 27.0
episode: 37 score: 11.0
episode: 38 score: 23.0
episode: 39 score: 34.0
episode: 40 score: 30.0
episode: 41 score: 19.0
ep

In [11]:
import pandas as pd
import plotly.express as px
# Plot the return recorded for each training episode.
df = pd.DataFrame(data=episode_score, columns=['episode', 'return'])
fig = px.line(
    data_frame=df,
    x='episode',
    y='return',
    title='Return_Monitor',
)
fig.show()

return이 500인 에피소드가 많아 정책 신경망이 잘 학습된 것처럼 보이지만, 에피소드 간 return의 변동성이 여전히 크다.