In [4]:
!pip install pyglet

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [7]:
import numpy as np
from arm_env import ArmEnv
import gym
import torch
import gc
import torch.nn as nn
import torch.nn.functional as F
import math
from collections import deque
import random
# from torch.autograd import Variable
import random

AttributeError: module 'pyglet.clock' has no attribute 'set_fps_limit'

In [2]:
def soft_update(target, source, tau):
    """Blend `source` parameters into `target` in place.

    Each target parameter becomes ``target * (1 - tau) + source * tau``.
    Parameters are paired positionally via ``parameters()``, so both modules
    are expected to expose their parameters in the same order.

    Args:
        target: module whose parameters are overwritten.
        source: module providing the new values (left untouched).
        tau: interpolation weight given to `source`.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        dst.data.copy_(blended)

def hard_update(target, source):
    """Copy every parameter of `source` into `target` in place.

    Parameters are paired positionally via ``parameters()``; `source` is not
    modified and the two modules keep separate storage.
    """
    pairs = zip(target.parameters(), source.parameters())
    for dst, src in pairs:
        dst.data.copy_(src.data)


In [3]:
class ActionNoise:
    """Stateful exploration noise following a discrete Ornstein-Uhlenbeck update.

    Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

    The internal state ``X`` is pulled towards ``mu`` with strength ``theta``
    and perturbed by Gaussian noise scaled by ``sigma`` on every `sample()`.

    Args:
        action_dim: length of the noise vector.
        mu: value the state reverts to (also the initial state).
        theta: mean-reversion rate.
        sigma: scale of the Gaussian perturbation.
    """

    def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.X = np.ones(self.action_dim) * self.mu

    def reset(self):
        """Set the internal state back to ``mu`` in every dimension."""
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        """Advance the process by one step and return the new state.

        Returns:
            np.ndarray with the same length as the internal state ``X``.
        """
        dx = self.theta * (self.mu - self.X)
        dx = dx + self.sigma * np.random.randn(len(self.X))
        # A fresh array is bound each step, so previously returned samples
        # are not modified by later calls.
        self.X = self.X + dx
        # The per-step debug print was removed: it wrote to stdout on every
        # environment step.
        return self.X

In [4]:
# noise = ActionNoise(2)
# print(noise.sample())

In [5]:
EPS = 0.003
def fanin_init(size, fanin=None):
    """Return a tensor of shape `size` drawn from U(-1/sqrt(fanin), 1/sqrt(fanin)).

    Args:
        size: target shape; a ``torch.Size`` or any sequence of ints.
        fanin: fan-in used for the bound. When omitted (or 0) it is taken from
            `size`: the second dimension for 2-D+ shapes, because
            ``nn.Linear`` stores its weight as ``(out_features, in_features)``,
            otherwise the only dimension.

    Returns:
        A new float tensor of shape `size`.
    """
    size = tuple(size)
    default_fanin = size[1] if len(size) > 1 else size[0]
    fanin = fanin or default_fanin
    v = float(1. / np.sqrt(fanin))
    # torch.empty always interprets its argument as a shape; the legacy
    # torch.Tensor(seq) constructor treats a plain tuple/list as *data*.
    return torch.empty(size).uniform_(-v, v)

class Critic(nn.Module):
    """Q-value network: maps a (state, action) batch to one scalar per row.

    The state and the action each pass through their own 150-unit layer; the
    two results are concatenated (300 features), fed through a 300-unit
    hidden layer and finally projected to a single output.

    Args:
        state_dim: number of features in a state vector.
        action_dim: number of features in an action vector.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim

        # Layers are built in a fixed order so parameter ordering and the
        # sequence of random draws stay the same.
        self.fc1 = self._fanin_linear(state_dim, 150)    # state branch
        self.fa1 = self._fanin_linear(action_dim, 150)   # action branch
        self.fca1 = self._fanin_linear(300, 300)         # joint hidden layer

        # Output layer: weights start in a narrow band around zero.
        self.fca2 = nn.Linear(300, 1)
        self.fca2.weight.data.uniform_(-EPS, EPS)

    @staticmethod
    def _fanin_linear(in_features, out_features):
        """Create a Linear layer whose weight is re-drawn with `fanin_init`."""
        layer = nn.Linear(in_features, out_features)
        layer.weight.data = fanin_init(layer.weight.data.size())
        return layer

    def forward(self, state, action):
        """Return the critic output for `state` and `action`.

        Both inputs are concatenated along dim 1 after their first layer, so
        they must be 2-D with matching batch size.
        """
        state_feat = torch.relu(self.fc1(state))
        action_feat = torch.relu(self.fa1(action))
        joint = torch.cat((state_feat, action_feat), dim=1)
        hidden = torch.relu(self.fca1(joint))
        return self.fca2(hidden)

In [6]:
class Actor(nn.Module):
    """Deterministic policy network: maps a state batch to bounded actions.

    Two 300-unit ReLU layers are followed by a tanh output that is scaled by
    `action_lim`, so every action component lies in [-action_lim, action_lim].

    Args:
        state_dim: number of features in a state vector.
        action_dim: number of action components produced.
        action_lim: multiplier applied to the tanh output.
    """

    def __init__(self, state_dim, action_dim, action_lim):
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lim = action_lim

        # Layers are built in a fixed order so parameter ordering and the
        # sequence of random draws stay the same.
        self.fa1 = self._fanin_linear(state_dim, 300)
        self.fa2 = self._fanin_linear(300, 300)

        # Output layer: weights start in a narrow band around zero.
        self.fa3 = nn.Linear(300, action_dim)
        self.fa3.weight.data.uniform_(-EPS, EPS)

    @staticmethod
    def _fanin_linear(in_features, out_features):
        """Create a Linear layer whose weight is re-drawn with `fanin_init`."""
        layer = nn.Linear(in_features, out_features)
        layer.weight.data = fanin_init(layer.weight.data.size())
        return layer

    def forward(self, state):
        """Return the action for `state`, scaled by `action_lim`."""
        hidden = torch.relu(self.fa1(state))
        hidden = torch.relu(self.fa2(hidden))
        squashed = torch.tanh(self.fa3(hidden))
        return squashed * self.action_lim

In [7]:
# Smoke test: build an Actor (state_dim=7, action_dim=2, action_lim=1) and
# print its output for a single hand-written state row.
t1 = Actor(7,2,1)
print(t1.forward(torch.tensor([[2.,4.,5.,5.,5.,5.,5.]],dtype=torch.float32)))

tensor([[ 0.0175, -0.0217]], grad_fn=<MulBackward>)


In [8]:
class MemoryBuffer:
    """Fixed-capacity FIFO replay buffer of ``(s, a, r, new_s)`` transitions.

    Once `size` transitions are stored, each `add` evicts the oldest one.

    The current fill level is available both as ``len(buffer)`` and as the
    ``buffer.len`` attribute. The former ``len()`` method was removed: it was
    shadowed by the ``len`` attribute, so calling it on an instance always
    raised ``TypeError``.

    Args:
        size: maximum number of transitions kept.
    """

    def __init__(self, size):
        self.buffer = deque(maxlen=size)
        self.maxSize = size
        # Plain attribute kept for callers that read `buffer.len` directly.
        self.len = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, count):
        """Draw up to `count` distinct transitions uniformly at random.

        Args:
            count: requested batch size; capped at the current fill level.

        Returns:
            Tuple ``(s, a, r, new_s)`` of float32 arrays whose first axis is
            the batch.
        """
        count = min(count, len(self.buffer))
        batch = random.sample(self.buffer, count)

        # Column-wise stack: field i of every transition becomes one array.
        s_array, a_array, r_array, new_s_array = (
            np.asarray([transition[i] for transition in batch], dtype=np.float32)
            for i in range(4)
        )
        return s_array, a_array, r_array, new_s_array

    def add(self, s, a, r, new_s):
        """Append one transition, evicting the oldest when the buffer is full."""
        self.buffer.append((s, a, r, new_s))
        # deque(maxlen=...) enforces the cap, so its length is the fill level.
        self.len = len(self.buffer)

In [9]:
BATCH_SIZE = 128        # transitions requested from the replay buffer per update
LEARNING_RATE = 0.001   # Adam step size for both actor and critic
GAMMA = 0.99            # discount applied to the target critic's next-state value
TAU = 0.001             # soft-update weight for the target networks

class Trainer:
    """Actor-critic trainer with target networks and a replay buffer.

    Holds an actor/critic pair plus target copies of each, one Adam optimizer
    per online network, and a reference to the replay buffer it samples from.

    Args:
        state_dim: number of features in a state vector.
        action_dim: number of action components.
        action_lim: bound passed to the actors for scaling their output.
        ram: replay buffer exposing ``sample(count)`` that returns
            ``(s, a, r, new_s)`` float32 arrays.
    """

    def __init__(self, state_dim, action_dim, action_lim, ram):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lim = action_lim
        self.ram = ram
        # Built but currently unused: the noise term in
        # get_exploration_action is commented out.
        self.noise = ActionNoise(self.action_dim)

        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), LEARNING_RATE)

        # Start the target networks as exact copies of the online networks.
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def get_exploitation_action(self, state):
        """Return the target actor's action for `state` as a numpy array.

        Args:
            state: numpy array accepted by ``torch.from_numpy``.
        """
        state = torch.from_numpy(state)
        action = self.target_actor.forward(state).detach()
        return action.data.numpy()

    def get_exploration_action(self, state):
        """Return the online actor's action for `state` as a numpy array.

        No noise is added here (the ActionNoise term is commented out).

        Args:
            state: numpy array accepted by ``torch.from_numpy``.
        """
        state = torch.from_numpy(state)
        action = self.actor.forward(state).detach()
        new_action = action.data.numpy() #+(self.noise.sample() * self.action_lim)
        return new_action

    def optimizer(self):
        """Run one update of critic, actor and both target networks.

        Samples up to BATCH_SIZE transitions from ``self.ram``, fits the
        critic to ``r + GAMMA * Q'(s', P'(s'))`` with a smooth-L1 loss, updates
        the actor to increase the critic's value of its own actions, then
        soft-updates both target networks with weight TAU.
        """
        # Use the buffer passed to the constructor rather than a global `ram`.
        s_sample, a_sample, r_sample, new_s_sample = self.ram.sample(BATCH_SIZE)

        s_sample = torch.from_numpy(s_sample)
        a_sample = torch.from_numpy(a_sample)
        r_sample = torch.from_numpy(r_sample)
        new_s_sample = torch.from_numpy(new_s_sample)

        #-------------- optimize critic

        a_target = self.target_actor.forward(new_s_sample).detach()
        # squeeze(1) removes only the critic's output dimension, so a batch
        # of one stays 1-D instead of collapsing to a 0-dim tensor.
        next_value = self.target_critic.forward(new_s_sample, a_target).detach().squeeze(1)
        # y_exp = r + gamma*Q'(s', P'(s'))
        y_expected = r_sample + GAMMA*next_value
        # y_pred = Q(s,a)
        y_predicted = self.critic.forward(s_sample, a_sample).squeeze(1)
        loss_critic = F.smooth_l1_loss(y_predicted, y_expected)

        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        #------------ optimize actor
        pred_a_sample = self.actor.forward(s_sample)
        # Negated so that minimising the loss maximises the critic's value.
        loss_actor = -1*torch.sum(self.critic.forward(s_sample, pred_a_sample))

        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        soft_update(self.target_actor, self.actor, TAU)
        soft_update(self.target_critic, self.critic, TAU)

    def save_models(self, episode_count):
        """Save the *target* actor and critic weights under ``./models/``.

        Files are named ``<episode_count>_actor.pt`` and
        ``<episode_count>_critic.pt``.
        """
        torch.save(self.target_actor.state_dict(), './models/'+str(episode_count)+ '_actor.pt')
        torch.save(self.target_critic.state_dict(), './models/'+str(episode_count)+ '_critic.pt')
        print('****Models saved***')

    def load_models(self, episode):
        """Load ``./models/<episode>_{actor,critic}.pt`` into the online
        networks and copy them into the target networks."""
        self.actor.load_state_dict(torch.load('./models/'+str(episode)+ '_actor.pt'))
        self.critic.load_state_dict(torch.load('./models/'+str(episode)+ '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('***Models load***')

In [10]:
# Training script: builds the ArmEnv environment, the replay buffer and the
# Trainer, then runs episodes of at most MAX_STEPS steps. Actions come from
# the online actor with Gaussian noise of standard deviation `var`, clipped to
# [-A_MAX, A_MAX]. Updates start only once the replay buffer is full.
MAX_EPISODES = 10001
MAX_STEPS = 500
MAX_BUFFER = 7000
RENDER = True
rewards_all_episodes = []

# NOTE(review): `exploration_rate` and its decay schedule are computed in this
# cell, but the only code that would consume them is commented out below.
exploration_rate = 1
is_training = True
if is_training:
    exploration_rate = 1
else:
    exploration_rate = 0.1
print('exploration_rate', exploration_rate)
max_exploration_rate = 1
min_exploration_rate = 0.1
exploration_decay_rate = 0.05

# Standard deviation of the Gaussian action noise; decayed during training.
var = 1

MODE = ['easy', 'hard']
n_model = 1
env = ArmEnv(mode=MODE[n_model])

S_DIM = env.state_dim
A_DIM = env.action_dim
A_MAX = 1

# env = gym.make('Pendulum-v0')
# env = gym.make('BipedalWalker-v2')
# S_DIM = env.observation_space.shape[0]
# A_DIM = env.action_space.shape[0]
# A_MAX = 1

print('State Dimensions:', S_DIM)
print('Action Dimensions:', A_DIM)
print('Action Max:', A_MAX)
ram = MemoryBuffer(MAX_BUFFER)
trainer = Trainer(S_DIM, A_DIM, A_MAX, ram)
#trainer.load_models(1700)


for ep in range(MAX_EPISODES):
    state = env.reset()
    print('Episode:', ep)

    rewards_current_episode = 0

    for step in range(MAX_STEPS):
        if RENDER:
            env.render()

        state = np.float32(state)

        #exploration_rate_threshold = random.uniform(0,1)
        #if exploration_rate_threshold > exploration_rate:
        #    action = trainer.get_exploration_action(state)
        #else:
        #    action = np.array([np.random.uniform(-1,1), np.random.uniform(-1,1)])
        action = trainer.get_exploration_action(state)
        #action = trainer.get_exploitation_action(state)
        # Perturb the actor's action with N(action, var) noise, then clip to the action bounds.
        action = np.clip(np.random.normal(action,var),-A_MAX,A_MAX)
        #if is_training:
        #    if ep%2 == 0:
        #        action = trainer.get_exploitation_action(state)
        #    else:
        #        action = trainer.get_exploration_action(state)
        #    action = np.clip(action,-A_MAX,A_MAX)
        #action = np.clip(action,-A_MAX,A_MAX)

        # Evaluation mode: replace the noisy action with the target actor's action.
        if not is_training:
            action = trainer.get_exploitation_action(state)
        #new_state, reward, done, _ = env.step(action)
        # ArmEnv.step returns a 3-tuple here (no info dict).
        new_state, reward, done = env.step(action)

        rewards_current_episode += reward
        # NOTE(review): the transition that ends an episode is not stored, so the
        # replay buffer never contains terminal steps.
        if done:
            new_state = None
        else:
            new_state = np.float32(new_state)
            ram.add(state,action, reward, new_state)
        state = new_state

        # Train only once the buffer is full; each update also decays the noise
        # scale by 0.9999 down to a floor of 0.05.
        # NOTE(review): this branch is not gated on `is_training`.
        if ram.len == MAX_BUFFER:
            var = max([var*.9999, 0.05])
            trainer.optimizer()
        #if is_training:
        #    trainer.optimizer()

        if step == MAX_STEPS-1 or done:
            print('reward per episode:', rewards_current_episode)
            print('explore:',var)
            rewards_all_episodes.append(rewards_current_episode)
            break

    exploration_rate = (min_exploration_rate +
                (max_exploration_rate - min_exploration_rate)* np.exp(-exploration_decay_rate*ep))
    #print(exploration_rate)
    # check memory consumption and clear memory
    gc.collect()
    # Checkpoint every 100 episodes (including episode 0).
    if ep%100 == 0:
        trainer.save_models(ep)

print('Completed Episodes')

exploration_rate 1
State Dimensions: 7
Action Dimensions: 2
Action Max: 1
Episode: 0
reward per episode: -358.99206903837245
explore: 1
****Models saved***
Episode: 1
reward per episode: -396.9478573479679
explore: 1
Episode: 2
reward per episode: -306.05695491534806
explore: 1
Episode: 3
reward per episode: -202.03462347352755
explore: 1
Episode: 4
reward per episode: -556.9370078844258
explore: 1
Episode: 5
reward per episode: -410.1258494867277
explore: 1
Episode: 6
reward per episode: -384.70537241509146
explore: 1
Episode: 7
reward per episode: -401.7875729377462
explore: 1
Episode: 8
reward per episode: -660.0395860313192
explore: 1
Episode: 9
reward per episode: -429.7567153570176
explore: 1
Episode: 10
reward per episode: -382.4714798772276
explore: 1
Episode: 11
reward per episode: -447.25152159961215
explore: 1
Episode: 12
reward per episode: -623.4401487248325
explore: 1
Episode: 13
reward per episode: -723.5112820596958
explore: 0.9999
Episode: 14
reward per episode: -429.4

reward per episode: 54.37833223040852
explore: 0.0876221255538606
Episode: 106
reward per episode: 54.06392469047317
explore: 0.08701088196090408
Episode: 107
reward per episode: 50.865384130456775
explore: 0.08641254359932882
Episode: 108
reward per episode: 52.07559879698855
explore: 0.08585265567818351
Episode: 109
reward per episode: 156.0207452624351
explore: 0.08400122908711925
Episode: 110
reward per episode: 359.62101679375525
explore: 0.0799042410277229
Episode: 111
reward per episode: 53.189757651875354
explore: 0.07937064547391047
Episode: 112
reward per episode: 5.418845512428197
explore: 0.07727938721403174
Episode: 113
reward per episode: 59.576805407829795
explore: 0.07684781060884718
Episode: 114
reward per episode: 55.83982928660448
explore: 0.0763651671926214
Episode: 115
reward per episode: 15.271210855856374
explore: 0.07515298720842614
Episode: 116
reward per episode: 55.63851250797748
explore: 0.07465858609083527
Episode: 117
reward per episode: 57.128589040925675

reward per episode: -43.67676813703995
explore: 0.05
Episode: 218
reward per episode: -92.3075622622499
explore: 0.05
Episode: 219
reward per episode: 276.16869793757655
explore: 0.05
Episode: 220
reward per episode: 59.52882454493164
explore: 0.05
Episode: 221
reward per episode: 52.381638223071775
explore: 0.05
Episode: 222
reward per episode: 53.391267154033784
explore: 0.05
Episode: 223
reward per episode: 56.05973262110652
explore: 0.05
Episode: 224
reward per episode: 43.65210502096192
explore: 0.05
Episode: 225
reward per episode: 95.47221580182207
explore: 0.05
Episode: 226
reward per episode: 86.92566990659168
explore: 0.05
Episode: 227
reward per episode: 57.05822165391078
explore: 0.05
Episode: 228
reward per episode: 51.740118644552545
explore: 0.05
Episode: 229
reward per episode: 51.42699379068891
explore: 0.05
Episode: 230
reward per episode: 56.85511167199937
explore: 0.05
Episode: 231
reward per episode: 59.522112318887444
explore: 0.05
Episode: 232
reward per episode:

KeyboardInterrupt: 

In [19]:
env.close()

In [10]:
!ls models/

0_actor.pt	2000_actor.pt	300_actor.pt	4100_actor.pt	5200_actor.pt
0_critic.pt	2000_critic.pt	300_critic.pt	4100_critic.pt	5200_critic.pt
1000_actor.pt	200_actor.pt	3100_actor.pt	4200_actor.pt	5300_actor.pt
1000_critic.pt	200_critic.pt	3100_critic.pt	4200_critic.pt	5300_critic.pt
100_actor.pt	2100_actor.pt	3200_actor.pt	4300_actor.pt	5400_actor.pt
100_critic.pt	2100_critic.pt	3200_critic.pt	4300_critic.pt	5400_critic.pt
1100_actor.pt	2200_actor.pt	3300_actor.pt	4400_actor.pt	5500_actor.pt
1100_critic.pt	2200_critic.pt	3300_critic.pt	4400_critic.pt	5500_critic.pt
1200_actor.pt	2300_actor.pt	3400_actor.pt	4500_actor.pt	5600_actor.pt
1200_critic.pt	2300_critic.pt	3400_critic.pt	4500_critic.pt	5600_critic.pt
1300_actor.pt	2400_actor.pt	3500_actor.pt	4600_actor.pt	5700_actor.pt
1300_critic.pt	2400_critic.pt	3500_critic.pt	4600_critic.pt	5700_critic.pt
1400_actor.pt	2500_actor.pt	3600_actor.pt	4700_actor.pt	600_actor.pt
1400_critic.pt	2500_critic.pt	3600_critic.pt	4700_critic.pt	

In [24]:
env.close()

AttributeError: 'ArmEnv' object has no attribute 'close'

# The code ends here

In [270]:
!ls models/

0_actor.pt  0_critic.pt


In [250]:
a

tensor([[-0.0176, -0.0171],
        [-0.0176, -0.0171],
        [-0.0176, -0.0171]])

In [252]:
a.size()

torch.Size([3, 2])

In [255]:
a.reshape(1,2,-1)

tensor([[[-0.0176, -0.0171, -0.0176],
         [-0.0171, -0.0176, -0.0171]]])

In [131]:
N_S

NameError: name 'N_S' is not defined

In [55]:
N_A

2

In [56]:
A_BOUND

[-1, 1]

In [47]:
observation = env.reset()

In [40]:
observation

array([ 0.  , -0.84, -1.07, -0.84, -1.07,  0.16, -0.07])

In [100]:
# Probe the environment: apply a zero action ten times and print what
# env.step returns (state, reward, done) after each step.
for _ in range(10):
    env.render()
    action = np.array([0,0])
    state, reward, done = env.step(action)
    print('state:', state)
    print('reward', reward)
    print('done:', done)

state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326090371467467
done: True
state: [ 1.          0.32766824  0.3522399  -0.00546977 -0.02061271 -0.15
  0.5       ]
reward -0.021326

In [32]:
env.

In [None]:
del env

In [173]:
from torch.autograd import Variable

In [178]:
torch.tensor([0])

tensor([0])