In [1]:
import subprocess
import sys
GITPATH = subprocess.run('git rev-parse --show-toplevel'.split(' '), \
        stdout=subprocess.PIPE).stdout.decode('utf-8').replace('\n','')
sys.path.append(GITPATH)
import dobroEnv



In [2]:
import os
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
from IPython.display import clear_output

import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp

from utils import soft_update, OrnsteinUhlenbeckProcess

In [3]:
gym.logger.set_level(40)

In [4]:
ENV_NAME = 'DobroHalfCheetah-v0'
state_dim = 20
action_dim = 6

In [5]:
num_hidden1 = 64
num_hidden2 = 64

critic_lr = 1e-3
actor_lr = 1e-3

NUM_EPISODES = 10000
batch_size = 64
max_buff_size = 10000
warm_up = 100

decay_rate = 0.99


In [6]:
class ActorNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorNet, self).__init__()
        
        self.actor_layer = nn.Sequential(
            nn.Linear(state_dim, num_hidden1),
            nn.ReLU(),
            nn.Linear(num_hidden1, num_hidden2),
            nn.ReLU(),
            nn.Linear(num_hidden2, action_dim)
        )
    
    def forward(self, x):
        return self.actor_layer(x)
    
    
class CriticNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(CriticNet, self).__init__()
        
        self.critic_layer = nn.Sequential(
            nn.Linear(state_dim+action_dim, num_hidden1),
            nn.ReLU(),
            nn.Linear(num_hidden1, num_hidden2),
            nn.ReLU(),
            nn.Linear(num_hidden2, 1)
        )
        
    def forward(self, state, actoin):
        x = torch.cat([state, action], 0)
        return self.critic_layer(x)

In [15]:
def sample_minibatch(R, batch_size):
    minibatch = np.array(random.sample(R, batch_size))

    def array_to_tensor(minibatch, idx):
        return torch.tensor(np.stack(minibatch[:, idx]).astype(np.float32))

    states = array_to_tensor(minibatch, 0)
    actions = array_to_tensor(minibatch, 1)
    rewards = array_to_tensor(minibatch, 2)
    next_states = array_to_tensor(minibatch, 3)
    return states, actions, rewards, next_states

In [16]:
cnet = CriticNet(state_dim, action_dim)
anet = ActorNet(state_dim, action_dim)
target_cnet = CriticNet(state_dim, action_dim)
target_anet = ActorNet(state_dim, action_dim)
target_cnet.load_state_dict(cnet.state_dict())
target_anet.load_state_dict(anet.state_dict())

critic_optim = optim.Adam(cnet.parameters(), lr=critic_lr, weight_decay=1e-5)
actor_optim = optim.Adam(anet.parameters(), lr=actor_lr, weight_decay=1e-5)
critic_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=critic_optim, gamma=decay_rate)
actor_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=actor_optim, gamma=decay_rate)

random_process = OrnsteinUhlenbeckProcess(theta=0.15, mu=0., sigma=0.2, size=action_dim)
criterion = nn.MSELoss()

R = []
ridx = 0
env = gym.make(ENV_NAME)
env.unwrapped.initialize()

for ne in range(NUM_EPISODES):
    # init random process N
    random_process.reset_states()
    obs = env.reset()
    state = torch.tensor(obs.astype(np.float32))
    done = False

    while not done:
        state = torch.tensor(obs.astype(np.float32))
        a = anet(state).detach().numpy()
        noise = random_process.sample()
        action = a + noise

        pre_obs = obs
        obs, reward, done, _ = env.step(action)
        
        # [s_t, a_t, r_t, s_(t+1)]
        replay_data = [pre_obs, action, reward, obs]
        if len(R)<max_buff_size:
            R.append(replay_data)
            if len(R)<warm_up:
                continue
        else:
            R[ridx%max_buff_size] = replay_data
            ridx += 1
        
        # sample minibatch
#         minibatch = np.array(random.sample(R, batch_size))
#         states = torch.tensor(minibatch[:, 0])
#         actions = torch.tensor(minibatch[:, 1])
#         rewards = torch.tensor(minibatch[:, 2])
#         next_states = torch.tensor(minibatch[:, 3])
        states, actions, rewards, next_states = sample_minibatch(R, batch_size)

        next_actions = target_anet(next_states) # mu'(s_(i+1))
        next_q_values = target_cnet(next_states, next_actions) # q'(s_(i+1), mu'(s_(i+1)))
        y_targets = rewards + gamma * next_q_values
        q_values = cnet(states, actions) # q(s_i, a_i)

        critic_loss = criterion(q_values, y_targets)
        cnet.zero_grad()
        critic_loss.backward()
        critic_optim.step()

        actor_loss = - cnt(states, anet(states)).mean()
        anet.zero_grad()
        actor_loss.backward()
        actor_optim.step()

        # update target_anet, target_cnet
        soft_update(target_cnet, cnet)
        soft_update(target_anet, anet)
    
    critir_scheduler.step()            
    actor_scheduler.step()

TypeError: forward() takes 2 positional arguments but 3 were given

In [9]:
import random

In [28]:
action.tolist()

[0.06521841421992897,
 0.16697441722686354,
 0.1211497992960219,
 -0.12053809875482761,
 -0.0016359760287719127,
 0.18084830211000819]

In [22]:
a

array([ 0.06171432,  0.14566782,  0.07760204, -0.14458719, -0.01718974,
        0.16206875], dtype=float32)

In [23]:
noise

array([0.0035041 , 0.0213066 , 0.04354776, 0.02404909, 0.01555376,
       0.01877955])

In [24]:
noise.astype(np.float32)

array([0.0035041 , 0.0213066 , 0.04354776, 0.02404909, 0.01555376,
       0.01877955], dtype=float32)

In [11]:
R[0][0]

array([0.        , 0.59999999, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [12]:
R[0][1]

array([ 0.06390915,  0.12897551,  0.10098654, -0.13140325, -0.00055187,
        0.16599107])

In [13]:
R[0][2]

0.0

In [15]:
import random

In [19]:
minibatch = np.array(random.sample(R, 2)).astype(np.float32)

ValueError: setting an array element with a sequence.

In [26]:
minibatch[:, 1]

array([array([ 0.04567381,  0.13761569,  0.08789963, -0.13004533, -0.02985892,
        0.13857435]),
       array([ 0.05174735,  0.14683557,  0.08604294, -0.14483311, -0.0087649 ,
        0.17018943])], dtype=object)

In [8]:
def train(lock, globalNet, optimizer, scheduler, tmax, pid):
    t = 0
    done = False
    ep_return = 0
    log_episode_return = []
    cur_ep = 0
    step_count = 0
    
    localNet = A3C_v3(input_dim, action_dim)
    localNet.load_state_dict(globalNet.state_dict())
    env = gym.make(ENV_NAME)
    env.unwrapped.initialize()
    obs = env.reset()
    
    while globalNet.ep_counter.value < MAX_EP:
        t_start = t
        buff_value = []
        buff_reward = []
        buff_logp = []
        buff_entropy = []

        while t_start-t < t_max:
            mu, sigma, V = localNet(torch.tensor(obs.astype(np.float32)))
            Softplus=nn.Softplus()     
            sigma = Softplus(sigma + 1e-5) # constrain to sensible values
            normal_dist = torch.normal(mu, sigma)
            
            sigma = Softplus(sigma + 1e-5) # constrain to sensible values
            action_dist = torch.normal(mu, sigma)
            action = action_dist.detach().numpy()
            action = action.clip(env.action_space.low, env.action_space.high)
            
            entropy = -0.5 * (torch.log(2. * np.pi * sigma) + 1.)
            
            # log prob: gaussian negative log-likelihood
            log_prob = torch.log(1/torch.sqrt(2*np.pi*sigma**2)) - (action_dist-mu)**2/(2*sigma**2)

            obs, reward, done, _ = env.step(action)
            step_count += 1
            ep_return += reward

            buff_value.append(V)
            buff_reward.append(reward)
            buff_logp.append(log_prob.sum())
            buff_entropy.append(entropy)
            t += 1
            
            if done:
                cur_ep = globalNet.log_episode(ep_return)
                obs = env.reset()
                if step_count==env._max_episode_steps:
                    done = False
                step_count = 0
                ep_return = 0
                break

        R = V if not done else 0
        policy_loss = 0
        value_loss = 0
        entropy_loss = 0
        for i in range(-1, -(t-t_start)-1, -1):
            R = buff_reward[i] + gamma*R
            TD = R - buff_value[i]
            policy_loss += buff_logp[i] * TD.detach()
            value_loss += torch.pow(TD, 2)
            entropy_loss += buff_entropy[i].sum()
        loss = - policy_loss + value_loss - beta*entropy_loss
        
        lock.acquire()
        try:
            optimizer.zero_grad()
            loss.backward()
            for local_param, global_param in zip(localNet.parameters(), globalNet.parameters()):
                global_param.grad = local_param.grad
            optimizer.step()
        finally:
            lock.release()
        localNet.load_state_dict(globalNet.state_dict())
        
        if cur_ep%print_freq==0:
            print('[%d] Process'%pid)
            print('%d/%d episodes. (%.2f%%)'%(cur_ep, MAX_EP, cur_ep/MAX_EP*100))
            #print('Current learning rate:', optimizer.param_groups[0]['lr'])
            print('Total loss:\t', loss.data.numpy()[0])
            print('Entropy\t\tPolicy\t\tValue')
            print('%.2f\t\t%.2f\t\t%.2f'%(entropy_loss.data.numpy(), policy_loss.data.numpy()[0], \
                  value_loss.data.numpy()[0]))
            print('Epside Return: [%.1f]'%globalNet.average_returns[globalNet.ep_counter.value-1])
            print()
            
            global log_df, fig_num
            plt.figure(figsize=(10, 5))
            average_returns = np.array(globalNet.average_returns[:])
            ep_returns = np.array(globalNet.ep_returns[:])
            nonzero_indices = average_returns!=0.0
            plt.plot(ep_returns[nonzero_indices], color='lightgreen')
            plt.plot(average_returns[nonzero_indices], color='green')
            plt.savefig('A3C_v3_HalfCheetah_%d.png'%fignum)
            
            raw_data = [cur_ep/MAX_EP*100, cur_ep, loss.data.numpy()[0], globalNet.average_returns[globalNet.ep_counter.value-1], optimizer.param_groups[0]['lr']]
            log_df = log_df.append(pd.Series(raw_data, index = log_df.columns), ignore_index=True)
        
        scheduler.step()

In [None]:
globalNet = A3C_v3(input_dim, action_dim, MAX_EP, is_global=True)
globalNet.share_memory()

optimizer = optim.Adam(globalNet.parameters(), lr=learning_rate, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decay_rate)
lock = mp.Lock()

Softplus=nn.Softplus()
log_df = pd.DataFrame(columns=['running', 'EP', 'Loss', 'Return', 'LR'])
fignum = len([f for f in os.listdir() if 'v3_HalfCheetah' in f and 'png' in f])

processes = []
for p_idx in range(NUM_THREADS):
    p = mp.Process(target=train, args=(lock, globalNet, optimizer, scheduler, t_max, p_idx))
    p.start()
    processes.append(p)
for p in processes:
    p.join()

[3] Process
500/200000 episodes. (0.25%)
Current learning rate: 0.000938913877703549
Total loss:	 4622.2124
Entropy		Policy		Value
-1318.95		-1945.83		2675.06
Epside Return: [43.6]

[6] Process
1000/200000 episodes. (0.50%)
Current learning rate: 0.0008842092457380008
Total loss:	 2586.9006
Entropy		Policy		Value
-284.44		2039.51		4626.13
Epside Return: [39.8]

[0] Process
1500/200000 episodes. (0.75%)
Current learning rate: 0.000831027358976173
Total loss:	 10228.367
Entropy		Policy		Value
-832.30		-4088.22		6139.31
Epside Return: [46.4]

[3] Process
2000/200000 episodes. (1.00%)
Current learning rate: 0.0007779246707428731
Total loss:	 1610.7308
Entropy		Policy		Value
-821.11		649.62		2259.53
Epside Return: [58.8]

[1] Process
2500/200000 episodes. (1.25%)
Current learning rate: 0.0007311354045730212
Total loss:	 460.51385
Entropy		Policy		Value
-811.46		1996.93		2456.64
Epside Return: [64.5]

[3] Process
3000/200000 episodes. (1.50%)
Current learning rate: 0.0006864731778340079
Tota

In [None]:
plt.figure(figsize=(10, 5))
average_returns = np.array(globalNet.average_returns[:])
ep_returns = np.array(globalNet.ep_returns[:])
nonzero_indices = average_returns!=0.0
plt.plot(ep_returns[nonzero_indices], color='lightgreen')
plt.plot(average_returns[nonzero_indices], color='green')
plt.show()
#fignum = len([f for f in os.listdir() if 'v3_HalfCheetah' in f and 'png' in f])
#plt.savefig('A3C_v3_HalfCheetah_%d.png'%fignum)

In [None]:
log_df.head()