In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

In [2]:
class DDPG_Mu(nn.Module):
    """Deterministic actor network: maps a 3-dim state to a 1-dim action.

    The output is ``2 * tanh(.)``, so every action lies in ``[-2, 2]``.
    The module owns its own Adam optimizer (lr=1e-4).
    """

    def __init__(self):
        super(DDPG_Mu, self).__init__()
        self.fc1 = nn.Linear(3, 512)
        self.fc_mu = nn.Linear(512, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=0.0001)

    def forward(self, x):
        """Return the action for state(s) ``x`` (last dimension must be 3)."""
        x = F.relu(self.fc1(x))
        mu = torch.tanh(self.fc_mu(x))*2
        return mu

    def train(self, loss=True):
        """Run one optimizer step on ``loss``, or switch train/eval mode.

        This method shadows ``nn.Module.train(mode)``, which ``eval()`` invokes
        as ``self.train(False)``. A boolean argument is therefore forwarded to
        the base class (and the module is returned, as ``nn.Module`` does);
        anything else is treated as a scalar loss tensor to minimize.

        Args:
            loss: a scalar loss tensor, or a bool train-mode flag.

        Returns:
            ``self`` when called with a bool, otherwise ``None``.
        """
        if isinstance(loss, bool):
            return super(DDPG_Mu, self).train(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
class DDPG_Q(nn.Module):
    """Critic network: scores a (3-dim state, 1-dim action) pair with one Q-value.

    State and action are embedded separately (64 units each), concatenated,
    and passed through a 128-unit hidden layer. The module owns its own Adam
    optimizer (lr=1e-3).
    """

    def __init__(self):
        super(DDPG_Q, self).__init__()
        self.fc_a = nn.Linear(1, 64)
        self.fc_s = nn.Linear(3, 64)
        self.fc_1 = nn.Linear(128, 128)
        self.fc_q = nn.Linear(128, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=0.001)

    def forward(self, x, a):
        """Return Q(x, a).

        Args:
            x: state tensor whose last dimension is 3.
            a: action tensor whose last dimension is 1, with the same leading
                dimensions as ``x``.
        """
        x1 = F.relu(self.fc_a(a))
        x2 = F.relu(self.fc_s(x))
        # Concatenate on the feature axis; dim=-1 equals dim=1 for batched
        # input and additionally accepts a single unbatched (state, action).
        x = torch.cat([x1, x2], dim=-1)
        x = F.relu(self.fc_1(x))
        q = self.fc_q(x)
        return q

    def train(self, loss=True):
        """Run one optimizer step on ``loss``, or switch train/eval mode.

        This method shadows ``nn.Module.train(mode)``, which ``eval()`` invokes
        as ``self.train(False)``. A boolean argument is therefore forwarded to
        the base class (and the module is returned, as ``nn.Module`` does);
        anything else is treated as a scalar loss tensor to minimize.

        Args:
            loss: a scalar loss tensor, or a bool train-mode flag.

        Returns:
            ``self`` when called with a bool, otherwise ``None``.
        """
        if isinstance(loss, bool):
            return super(DDPG_Q, self).train(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [3]:
# Environment plus the online networks (Q, Mu) and their target copies (Q_p, Mu_p).
env = gym.make('Pendulum-v0')
Q = DDPG_Q()
Q_p = DDPG_Q()
Mu = DDPG_Mu()
Mu_p = DDPG_Mu()

# Hyperparameters.
GAMMA = 0.99  # discount factor
BATCH_SIZE = 32
BUFFER_SIZE = 30000  # replay buffer capacity
replay_buffer = []  # may be swapped for another container (list, queue, dict, ...)
TAU = 0.01  # soft target-update coefficient
PARAMETER_NOISE_COEF = 0.0005
ITER = 10  # number of learning iterations performed per call to training()

In [4]:
def _as_float_batch(values):
    """Stack per-transition values into a float tensor of shape (len(values), -1)."""
    rows = [torch.as_tensor(v, dtype=torch.float).detach() for v in values]
    return torch.stack(rows).reshape(len(rows), -1)


def training():
    """Run ITER DDPG updates on minibatches drawn from the replay buffer.

    Each iteration samples a minibatch, regresses the critic ``Q`` onto the
    one-step TD target computed with the target networks, updates the actor
    ``Mu`` to maximize ``Q(s, Mu(s))``, and soft-updates both target networks.
    """
    for _ in range(ITER):
        # Transpose [(s, a, r, s', done), ...] into one tuple per field.
        states, actions, rewards, next_states, _dones = zip(*make_minibatch())

        # The batch dimension follows the sampled minibatch, not a fixed 32.
        state_tensor = _as_float_batch(states)
        action_tensor = _as_float_batch(actions)
        reward_tensor = _as_float_batch(rewards)
        next_state_tensor = _as_float_batch(next_states)

        # The TD target is a constant for the critic regression: computing it
        # under no_grad avoids back-propagating into (and accumulating
        # gradients on) the target networks.
        # NOTE(review): the stored `done` flag is not used to mask the
        # bootstrap term, as in the original code. If `done` can mark a true
        # terminal state rather than a time limit, multiply the bootstrap term
        # by (1 - done) -- confirm against the environment.
        with torch.no_grad():
            y = reward_tensor + GAMMA * Q_p(next_state_tensor, Mu_p(next_state_tensor))

        critic_l = F.mse_loss(Q(state_tensor, action_tensor), y)
        Q.train(critic_l)

        # Actor objective: maximize the critic's value of the actor's action.
        actor_l = -Q(state_tensor, Mu(state_tensor)).mean()
        Mu.train(actor_l)

        soft_target_update(Mu, Mu_p)
        soft_target_update(Q, Q_p)
    
def soft_target_update(model, model_p, tau=None):
    """Move the target network a fraction ``tau`` toward the online network.

    For every parameter pair: ``target <- (1 - tau) * target + tau * param``.

    Args:
        model: online network (read only).
        model_p: target network, updated in place.
        tau: interpolation coefficient; defaults to the module-level ``TAU``.
    """
    if tau is None:
        tau = TAU
    # An in-place copy under no_grad keeps each target Parameter's own storage
    # instead of rebinding ``.data`` to a freshly allocated tensor.
    with torch.no_grad():
        for param, target in zip(model.parameters(), model_p.parameters()):
            target.copy_(target * (1 - tau) + param * tau)
        
def init_target_param(model, model_p):
    """Initialize the target network with a copy of the online network's weights.

    Values are copied into the target's own storage. Assigning
    ``target.data = param.data`` would make both networks share one tensor, so
    any in-place change to the online network (optimizer step, parameter
    noise) would silently change the target as well.

    Args:
        model: online network (read only).
        model_p: target network, overwritten in place.
    """
    with torch.no_grad():
        for param, target in zip(model.parameters(), model_p.parameters()):
            target.copy_(param)
        
def parameter_noise(model):
    """Perturb every parameter of ``model`` in place with scaled Gaussian noise.

    Each parameter receives ``randn(shape) * PARAMETER_NOISE_COEF``; the
    change is made outside autograd tracking.
    """
    with torch.no_grad():
        for weight in model.parameters():
            noise = torch.randn(weight.size())
            weight.add_(noise * PARAMETER_NOISE_COEF)
            
def store_transition(s, a, r, s_prime, done):
    """Append one transition to ``replay_buffer``, evicting the oldest when full.

    The transition is stored as the list ``[s, a, r, s_prime, done]``.
    """
    if len(replay_buffer) >= BUFFER_SIZE:
        # Buffer is at capacity: drop the oldest entry (FIFO).
        del replay_buffer[0]
    replay_buffer.append([s, a, r, s_prime, done])
    
def make_minibatch():
    """Return BATCH_SIZE transitions sampled without replacement from ``replay_buffer``."""
    return random.sample(replay_buffer, k=BATCH_SIZE)

In [5]:
reward_sum = 0.0
reward_list = []

# Start both target networks from the online networks' weights.
init_target_param(Mu, Mu_p)
init_target_param(Q, Q_p)

for ep in range(20000):
    observation = env.reset()
    done = False
    # Roll out one episode, storing every transition in the replay buffer.
    while not done:
        state = torch.tensor(observation, dtype=torch.float)
        parameter_noise(Mu)  # perturb the actor's weights before acting
        action = Mu(state).detach()
        observation, reward, done, _ = env.step([action.item()])
        reward_sum += reward
        next_state = torch.tensor(observation, dtype=torch.float)
        store_transition(state, action, reward, next_state, done)

    # Learn once per episode, after at least 500 transitions are stored.
    if len(replay_buffer) >= 500:
        training()

    # Every 20 episodes: report the mean return and stop once it exceeds -200.
    if (ep + 1) % 20 == 0:
        print('Episode %d'%ep,', Reward mean : %f'%(reward_sum/20.0))
        if reward_sum/20.0 > -200.0:
            break
        reward_sum = 0.0

Episode 19 , Reward mean : -1350.407593
Episode 39 , Reward mean : -1243.094730
Episode 59 , Reward mean : -1236.100445
Episode 79 , Reward mean : -1491.276838
Episode 99 , Reward mean : -1766.659902
Episode 119 , Reward mean : -1586.728410
Episode 139 , Reward mean : -1508.894156
Episode 159 , Reward mean : -1440.508896
Episode 179 , Reward mean : -1274.340279
Episode 199 , Reward mean : -1288.026364
Episode 219 , Reward mean : -1160.041389
Episode 239 , Reward mean : -1177.602819
Episode 259 , Reward mean : -1098.795469
Episode 279 , Reward mean : -1220.525328
Episode 299 , Reward mean : -1226.366781
Episode 319 , Reward mean : -1030.278114
Episode 339 , Reward mean : -869.598597
Episode 359 , Reward mean : -837.045471
Episode 379 , Reward mean : -828.768274
Episode 399 , Reward mean : -815.158301
Episode 419 , Reward mean : -865.327387
Episode 439 , Reward mean : -731.700358
Episode 459 , Reward mean : -786.530504
Episode 479 , Reward mean : -912.133507
Episode 499 , Reward mean : -