### 패키지 설치

In [None]:
!sudo apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb \
    xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
!pip install pyvirtualdisplay
!pip install piglet

## gym
!pip install gym[classic_control]

##ffmpeg
!sudo apt-get install ffmpeg -y

### Video

In [1]:
# Create and start a pyvirtualdisplay Display (visible=0, 1400x900) before the
# remaining imports; presumably this provides a headless screen so the
# environment can render frames for video recording.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()
# Helpers used by show_video(): base64 encoding, file globbing and inline HTML output.
from base64 import b64encode
from glob import glob
from IPython.display import HTML
from IPython import display as ipy_display
# NOTE(review): gym_logger is not used anywhere in the visible cells.
from gym import logger as gym_logger
from gym.wrappers.record_video import RecordVideo

In [2]:
#### show video func
def show_video(mode='train', filename=None):
    """Embed an MP4 recording from the ``mode`` directory in the notebook output.

    Args:
        mode: Directory that is searched for ``*.mp4`` files.
        filename: Optional file name (glob pattern) inside ``mode``. When
            omitted, the recording that sorts last in natural order is shown,
            so that e.g. ``episode-10`` ranks after ``episode-9``.

    Returns:
        -1 when no matching video exists, otherwise None.
    """
    import re

    mp4_list = glob(mode+'/*.mp4')
    if not mp4_list:
        print('No video found')
        return -1

    if filename:
        file_lists = glob(mode+'/'+filename)
        if not file_lists:
            print('No {} found'.format(filename))
            return -1
        mp4 = file_lists[0]
    else:
        # Plain string sorting would rank 'episode-99' above 'episode-100';
        # compare the digit runs numerically instead.
        def natural_key(path):
            return [int(tok) if tok.isdigit() else tok
                    for tok in re.split(r'(\d+)', path)]

        mp4 = max(mp4_list, key=natural_key)

    print(mp4)
    # Read-only access is sufficient; the context manager closes the handle.
    with open(mp4, 'rb') as video_file:
        encoded = b64encode(video_file.read())
    ipy_display.display(HTML(data='''
            <video alt="gameplay" autoplay controls style="height: 400px;">
                <source src="data:video/mp4;base64,%s" type="video/mp4" />
            </video>
        ''' % (encoded.decode('ascii'))))

In [3]:
## plot the collected per-episode scores
def plot_result(save_epi_score):
    """Draw ``save_epi_score`` as a line chart and show the figure.

    Args:
        save_epi_score: Sequence of scores, one entry per episode.
    """
    plt.plot(save_epi_score)
    plt.show()

### DDPG Agent

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import os
import gym
import random

import tensorflow as tf
from tensorflow import keras
from keras.utils.vis_utils import plot_model

import warnings
warnings.filterwarnings(action='ignore')

from collections import deque

In [5]:
class DDPGagent():
    """DDPG agent with an actor, a critic, their target copies and a replay buffer.

    Args:
        state_size: Length of the observation vector.
        action_size: Length of the action vector.
        max_action: Symmetric action limit; actions are clipped to
            ``[-max_action, max_action]``.
    """

    def __init__(self, state_size, action_size, max_action):
        # State/action dimensions and the action limit
        self.state_size = state_size
        self.action_size = action_size
        self.action_bound = max_action

        ## hyperparameters
        self.gamma = 0.95
        self.batch_size = 32

        # Replay buffer capacity and the fill level at which training starts
        self.buffer_size = 20000
        self.buffer_size_train_start = 2000

        self.buffer = deque(maxlen=self.buffer_size)

        ## NN Network
        self.actor = self.actor_network()
        self.target_actor = self.actor_network()

        self.critic = self.critic_network()
        self.target_critic = self.critic_network()

        self.actor_learning_rate = 0.0001
        self.critic_learning_rate = 0.001
        self.TAU = 0.001

        self.actor_opt = keras.optimizers.Adam(learning_rate=self.actor_learning_rate, clipnorm=5.0)
        self.critic_opt = keras.optimizers.Adam(learning_rate=self.critic_learning_rate, clipnorm=5.0)

        # The target networks are built with their own random weights; start
        # them as exact copies of the online networks (TAU=1.0 is a hard copy).
        self.update_target_network(1.0)

        # save the results
        self.save_epi_score = []

    ## actor
    def actor_network(self):
        """Build the policy network mapping a state to a scaled action."""
        # shape must be a tuple; ``(n)`` without the comma is just an int.
        input_ = keras.layers.Input(shape=(self.state_size,))

        x = keras.layers.Dense(24, activation='relu')(input_)
        x = keras.layers.Dense(64, activation='relu', kernel_initializer=keras.initializers.RandomUniform(-1e-3, 1e-3))(x)
        x = keras.layers.Dense(16, activation='tanh', kernel_initializer=keras.initializers.RandomUniform(-1e-3, 1e-3))(x)
        action = keras.layers.Dense(self.action_size, kernel_initializer=keras.initializers.RandomUniform(-1e-3, 1e-3))(x)
        # Scale the raw output by the action limit; get_action() clips afterwards.
        action = keras.layers.Lambda(lambda x : x * self.action_bound)(action)
        ## model
        model = keras.models.Model(inputs=[input_], outputs=[action])

        return model

    ## critic
    def critic_network(self):
        """Build the Q-network mapping a (state, action) pair to a scalar value."""
        input_state = keras.layers.Input(shape=(self.state_size,))
        input_action = keras.layers.Input(shape=(self.action_size,))

        state = keras.layers.Dense(32, activation='relu')(input_state)
        action = keras.layers.Dense(32, activation='relu')(input_action)

        h = keras.layers.concatenate([state, action], axis=-1)
        x = keras.layers.Dense(32, activation='relu')(h)
        x = keras.layers.Dense(16, activation='relu')(x)
        # Linear output: a ReLU here cannot represent negative Q-values and
        # stops passing gradients once its pre-activation turns negative.
        q_func = keras.layers.Dense(1)(x)
        ## model
        model = keras.models.Model(inputs=[input_state, input_action], outputs=[q_func])

        return model

    # Append one (state, action, reward, next_state, done) transition to the replay buffer
    def remember(self, state, action, reward, next_state, done):
        """Store a transition; the deque drops the oldest entry once it is full."""
        item = (state, action, reward, next_state, done)
        self.buffer.append(item)

    ## get action
    def get_action(self, state, pre_noise):
        """Return ``(action, noise)`` for ``state``.

        The actor output is perturbed with the next Ornstein-Uhlenbeck sample
        derived from ``pre_noise`` and clipped to the action limit. The
        returned ``noise`` is meant to be passed back as ``pre_noise`` on the
        next call.
        """
        action = self.actor(tf.convert_to_tensor([state], dtype=tf.float32))
        action = action.numpy()[0]
        ## noise
        noise = self.ou_noise(pre_noise, dim=self.action_size)
        # clip continuous action to be within action_bound
        action = np.clip(action + noise, -self.action_bound, self.action_bound)

        return action, noise

    @staticmethod
    def _soft_update(online, target, tau):
        """Blend ``online`` weights into ``target``: tau*online + (1-tau)*target."""
        blended = [tau * w + (1 - tau) * w_t
                   for w, w_t in zip(online.get_weights(), target.get_weights())]
        target.set_weights(blended)

    ## Soft update Target network
    def update_target_network(self, TAU):
        """Move both target networks towards the online networks by factor ``TAU``."""
        self._soft_update(self.actor, self.target_actor, TAU)
        self._soft_update(self.critic, self.target_critic, TAU)

    ## single gradient update on a single batch data
    def critic_learn(self, states, actions, td_targets):
        """Take one optimizer step on the mean squared TD error of the critic."""
        with tf.GradientTape() as tape:
            q = self.critic([states, actions], training=True)
            loss = tf.reduce_mean(tf.square(q - td_targets))

        grads = tape.gradient(loss, self.critic.trainable_variables)
        self.critic_opt.apply_gradients(zip(grads, self.critic.trainable_variables))

    ## train the actor network
    def actor_learn(self, states):
        """Take one optimizer step that increases the critic's value of the actor's actions."""
        with tf.GradientTape() as tape:
            actions = self.actor(states, training=True)
            critic_q = self.critic([states, actions])
            loss = -tf.reduce_mean(critic_q)

        grads = tape.gradient(loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(zip(grads, self.actor.trainable_variables))

    ## Ornstein Uhlenbeck Noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        """Return the next noise sample given the previous sample ``x``."""
        return x + rho*(mu - x)*dt + sigma*np.sqrt(dt)*np.random.normal(size=dim)

    ## computing TD target: y_k = r_k + gamma*Q(x_k+1, u_k+1)
    def td_target(self, rewards, q_values, dones):
        """Compute TD targets for a batch without modifying the inputs.

        Args:
            rewards: One reward per batch entry.
            q_values: Target-network Q-values of the next states; the first
                axis is the batch axis.
            dones: One flag per batch entry; truthy entries do not bootstrap.

        Returns:
            A new array shaped like ``q_values`` holding ``r`` for finished
            transitions and ``r + gamma * q`` otherwise.
        """
        q_next = np.asarray(q_values)
        # Align the per-sample vectors with the batch axis of q_values.
        batch_shape = (q_next.shape[0],) + (1,) * (q_next.ndim - 1)
        r = np.reshape(np.asarray(rewards, dtype=q_next.dtype), batch_shape)
        finished = np.reshape(np.asarray(dones), batch_shape).astype(bool)
        y_k = np.where(finished, r, r + self.gamma * q_next)
        return y_k.astype(q_next.dtype, copy=False)

    ## load actor and critic weights
    def load_weights(self, path):
        """Load ``actor/mountain_car.h5`` and ``critic/mountain_car.h5`` from ``path``."""
        self.actor.load_weights(os.path.join(path, 'actor', 'mountain_car.h5'))
        self.critic.load_weights(os.path.join(path, 'critic', 'mountain_car.h5'))

    ## train
    def train_model(self):
        """Run one DDPG update (critic, actor, soft target update) on a random mini-batch.

        Does nothing while the buffer holds fewer transitions than one batch.
        """
        if len(self.buffer) < self.batch_size:
            return

        # Draw a random mini-batch from the replay memory
        mini_batch = random.sample(self.buffer, self.batch_size)
        # Split the mini-batch into its five components
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        # Stack each component into one float32 array first, then convert it
        # in a single step; done flags become 1.0 / 0.0.
        states = tf.convert_to_tensor(np.asarray(states, dtype=np.float32))
        actions = tf.convert_to_tensor(np.asarray(actions, dtype=np.float32))
        rewards = tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32))
        next_states = tf.convert_to_tensor(np.asarray(next_states, dtype=np.float32))
        dones = tf.convert_to_tensor(np.asarray(dones, dtype=np.float32))

        # predict target Q-values
        target_qs = self.target_critic([
            next_states,                     ## next_state (s_prime)
            self.target_actor(next_states),  ## next_action (a_prime)
        ])

        # compute TD targets
        y_i = self.td_target(rewards, target_qs.numpy(), dones)

        # train critic using sampled batch
        self.critic_learn(states,    ### state (s)
                          actions,   ### action (a)
                          y_i)       ## TD target: y_k = r_k + gamma*Q(x_k+1, u_k+1)

        # train actor
        self.actor_learn(states)

        # update both target network
        self.update_target_network(self.TAU)

### Env

In [None]:
ENV_NAME = 'MountainCarContinuous-v0'
env = gym.make(ENV_NAME)

# Record a video of every episode into ./train
env = RecordVideo(env, './train', episode_trigger =lambda episode_number: True )

# State and action sizes of the MountainCar environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
max_action = env.action_space.high[0]

# Build the agent from the DDPG class defined above
agent = DDPGagent(state_size, action_size, max_action)

## Number of successful steps (car at or beyond 0.45) seen so far
success = 0
# initial transfer model weights to target model network
agent.update_target_network(1.0)

num_episode = 300

# Create the weight directories once, up front. exist_ok avoids the case where
# an already existing critic folder prevented the actor folder from being made.
save_path = './save_weights'
actor_dir = f'{save_path}/actor'
critic_dir = f'{save_path}/critic'
if not (os.path.isdir(actor_dir) and os.path.isdir(critic_dir)):
    os.makedirs(critic_dir, exist_ok=True)
    os.makedirs(actor_dir, exist_ok=True)
    print("make folder")


for ep in range(num_episode):
    # reset episode
    step, episode_score, done = 0, 0, False
    max_position = -0.4

    # Initial exploration noise
    pre_noise = np.zeros(agent.action_size)

    # Reset the environment to get the initial state
    state = env.reset()

    while not done:
        # The noise returned here is fed back as pre_noise on the next step
        action, noise = agent.get_action(state, pre_noise)

        # observe reward, new_state
        next_state, reward, done, info = env.step(action)

        # The reported score uses the environment's own reward
        episode_score += reward

        ## Reward shaping (only the shaped reward is stored for training)
        car_pos = next_state[0]
        car_vel = next_state[1]

        ## Quadratic in the position, plus a bonus for positive velocity,
        ## minus a penalty that grows with the step count
        if car_vel > 0:
            reward = float(((car_pos+0.5)*20)**2/10+15*car_vel - step/300)
        else:
            reward = float(((car_pos+0.5)*20)**2/10 - step/300)

        ### Track the right-most position reached in this episode
        if car_pos > max_position:
            max_position = car_pos

        ## Bonus and success count once the car reaches 0.45 or beyond
        if car_pos >= 0.45:
            reward += 100
            success += 1

        step += 1

        # An episode cut off by the time limit is not a true terminal state;
        # storing it as done would stop the TD target from bootstrapping.
        terminal = done and not info.get('TimeLimit.truncated', False)

        # Store the transition in the replay buffer
        agent.remember(state, action, reward, next_state, terminal)

        # Train once the buffer holds enough transitions
        if len(agent.buffer) >= agent.buffer_size_train_start:
            agent.train_model()

        # update current state
        pre_noise = noise
        state = next_state

    ## display rewards every episode
    print(f'Episode: {ep+1}, Success: {success}, max_position: {max_position :.2f}, Time: {step}, Reward: {episode_score :.2f}')

    agent.save_epi_score.append(episode_score)

    ## save weights every episode
    agent.actor.save_weights(f"{actor_dir}/mountain_car.h5")
    agent.critic.save_weights(f"{critic_dir}/mountain_car.h5")


# Close the environment so the video recorder releases its resources
env.close()

np.savetxt(f'{save_path}/mountain_car_epi_reward.txt', agent.save_epi_score)
print(agent.save_epi_score)


plot_result(agent.save_epi_score)

Episode: 1, Success: 0, max_position: 0.00, Time: 999, Reward: -21.75
Episode: 2, Success: 0, max_position: 0.14, Time: 999, Reward: -11.92
Episode: 3, Success: 1, max_position: 0.45, Time: 893, Reward: 93.30


In [None]:
### Show the recording of the best-scoring training episode.
### NaN compares unpredictably with max()/argmax, so NaN scores are skipped
### by using the nan-aware variants (they raise ValueError if every score is NaN).
episode = int(np.nanargmax(agent.save_epi_score))
filename = 'rl-video-episode-{}.mp4'.format(episode)
print("최대 avg : {} ,에피소드 번호 : {}".format(np.nanmax(agent.save_epi_score) , episode))
show_video(filename=filename)

### Test

In [None]:
ENV_NAME = 'MountainCarContinuous-v0'
env = gym.make(ENV_NAME)
# Record a video of the test episode into ./test
env = RecordVideo(env, './test', episode_trigger =lambda episode_number: True )

# DDPGagent takes (state_size, action_size, max_action), not the environment.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
max_action = env.action_space.high[0]
agent = DDPGagent(state_size, action_size, max_action)
agent.load_weights('./save_weights/')

time = 0
state = env.reset()

while True:
    # Act with the actor output directly; no exploration noise at test time
    action = agent.actor(tf.convert_to_tensor([state], dtype=tf.float32)).numpy()[0]
    state, reward, done, _ = env.step(action)
    time += 1

    if done:
        # 'reward' is the reward of the final step only
        print('Time: ', time, 'Reward: ', reward)
        break

# Close the environment so the video recorder releases its resources
env.close()