# Exploration

In [None]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [None]:
# Headless Lift task with a Panda arm: no on-screen or off-screen rendering,
# low-dimensional object observations, shaped rewards, 200-step episodes.
env = suite.make(
    "Lift",
    robots="Panda",
    horizon=200,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)

# Start a first episode and initialise the termination flag.
obs = env.reset()
done = False


# REINFORCE Model Debug

In [None]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# pi as a one-element float32 tensor living on `device`; consumed by `normal`.
pi = torch.tensor([math.pi], dtype=torch.float32, device=device)


def normal(x, mu, sigma_sq):
    '''
    Evaluate the Gaussian density N(x; mu, sigma_sq).

    x        -- tensor at which the density is evaluated (wrapped in Variable
                before use, as in the rest of this file)
    mu       -- mean tensor
    sigma_sq -- variance tensor (must be positive)

    Returns exp(-(x - mu)^2 / (2 * sigma_sq)) / sqrt(2 * pi * sigma_sq); the
    result broadcasts against the one-element `pi` tensor.
    '''
    squared_error = (Variable(x) - mu).pow(2)
    exponent = -1 * squared_error / (2 * sigma_sq)
    normaliser = 1 / (2 * sigma_sq * pi).sqrt()
    return exponent.exp() * normaliser
    

class REINFORCEPolicy(nn.Module):
    '''
    Policy parameterization used by REINFORCE.

    A two-hidden-layer MLP (with dropout) that ends in two linear heads, one
    producing a mean and one producing a raw variance value for every action
    dimension.
    '''
    def __init__(self, state_dim,action_dim,hidden_size):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_size = hidden_size

        half_hidden = hidden_size // 2

        # Shared trunk.
        self.l1 = nn.Linear(state_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, half_hidden)
        # Output heads: l3 -> mean, l3_ -> (unconstrained) variance.
        self.l3 = nn.Linear(half_hidden, action_dim)
        self.l3_ = nn.Linear(half_hidden, action_dim)
        # Dropout applied to each hidden pre-activation.
        self.d1 = nn.Dropout(0.5)
        self.d2 = nn.Dropout(0.5)

    def forward(self,x):
        '''
        Map a state tensor to (mu, sigma_sq).

        sigma_sq is the raw head output; no positivity constraint is applied
        here.
        '''
        hidden = self.l1(x)
        hidden = F.relu(self.d1(hidden))
        hidden = self.l2(hidden)
        hidden = F.relu(self.d2(hidden))
        return self.l3(hidden), self.l3_(hidden)
    
    
class REINFORCE:
    '''
    This class encapsulates functionality required to run the REINFORCE algorithm.

    It owns a REINFORCEPolicy network and an Adam optimiser, samples actions
    from the per-dimension Gaussian the network parameterises, and performs
    Monte-Carlo policy-gradient updates from stored rewards and log-probs.
    '''
    def __init__(self, state_dim,action_dim, gamma, lr, episodes, horizon, hidden_size):
        '''
        state_dim   -- size of the state vector fed to the policy network
        action_dim  -- number of action dimensions sampled per step
        gamma       -- discount factor used when accumulating returns
        lr          -- Adam learning rate
        episodes    -- number of episodes consumed by epoch_update_parameters
        horizon     -- number of steps per episode read by the update methods
        hidden_size -- width of the first hidden layer of the policy network
        '''
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_size = hidden_size
        self.lr = lr
        self.model = REINFORCEPolicy(state_dim, action_dim,hidden_size)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = self.lr)
        self.model.train()

        self.gamma = gamma
        self.episodes = episodes
        self.horizon = horizon


    def select_action(self, state):
        '''
        Sample one action per action dimension for `state`.

        state -- 1-D state tensor; moved to `device` before the forward pass.

        Returns (actions, log_probs): two lists of length action_dim. Actions
        are detached from the graph; log_probs stay attached so that they can
        be back-propagated through by the update methods.
        '''
        actions = []
        log_probs = []
        mu, sigma_sq = self.model(state.detach().to(device))
        for i in range(self.action_dim):
            mu_ = mu[i]
            # softplus ensures that the variance estimate is always positive
            sigma_sq_ = F.softplus(sigma_sq[i])

            # Sample on the CPU generator, then move the noise to `device`.
            eps = torch.randn(mu_.size())
            action = (mu_ + sigma_sq_.sqrt() * eps.to(device)).detach()
            log_prob = normal(action, mu_, sigma_sq_).log()
            actions.append(action)
            log_probs.append(log_prob)

        return actions, log_probs


    def _discounted_episode_loss(self, rewards, log_probs):
        '''
        Return the per-action-dimension REINFORCE loss of a single episode.

        rewards   -- indexable of per-step rewards (at least `horizon` entries)
        log_probs -- log_probs[step][dim], as produced by select_action

        The result is a tensor of shape (action_dim,) on `device` holding
        -sum_t G_t * log_prob_t for every action dimension.
        '''
        loss = torch.zeros(self.action_dim, device=device)
        discounted_return = torch.zeros(())
        for step in reversed(range(self.horizon)):
            # Out-of-place update: each step owns a fresh return tensor.
            # Autograd keeps a reference to the weight for backward(), so the
            # weight must never alias a buffer that is later overwritten in
            # place; otherwise every step is silently weighted by the final
            # return instead of its own return-to-go.
            discounted_return = self.gamma * discounted_return + rewards[step]
            weight = discounted_return.detach().squeeze().to(device)
            step_terms = []
            for dim in range(self.action_dim):
                log_prob = log_probs[step][dim]
                step_terms.append((log_prob * weight.expand_as(log_prob)).sum())
            loss = loss - torch.stack(step_terms)
        return loss


    def _apply_gradient_step(self, loss):
        '''Back-propagate `loss` and take one optimiser step.'''
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


    def episode_update_parameters(self, rewards, log_probs):
        '''
        Update the policy from the first episode in `rewards` / `log_probs`.

        Only index 0 of both arguments is read; the summed loss is divided by
        len(rewards).
        '''
        loss = self._discounted_episode_loss(rewards[0], log_probs[0]).sum()
        loss = loss / len(rewards)
        self._apply_gradient_step(loss)


    def epoch_update_parameters(self, rewards, log_probs):
        '''
        Update the policy from `self.episodes` stored episodes.

        The per-dimension losses are averaged over episodes and then summed
        over action dimensions before a single optimiser step is taken.
        '''
        episode_losses = [
            self._discounted_episode_loss(rewards[episode], log_probs[episode])
            for episode in range(self.episodes)
        ]
        loss = torch.stack(episode_losses).sum(dim=0) / self.episodes
        loss = loss.sum()
        self._apply_gradient_step(loss)

In [None]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Lift task with on-screen rendering so the loaded policy can be watched.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon = 150,
    reward_shaping=True
)
obs = env.reset()
# The policy input is the robot proprioception concatenated with the object state.
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim,0.9,0.001,100,200,256)
# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one.
agent.model.load_state_dict(
    torch.load('/Users/peterfagan/Downloads/REINFORCE_3.pkl', map_location=device)
)
# NOTE(review): REINFORCE.__init__ leaves the model in train mode, so dropout
# stays active during this rollout - confirm whether eval mode is wanted here.

obs = env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done = False
while not done:
    action, log_prob = agent.select_action(state)
    # select_action returns a list of scalar tensors; hand the simulator a
    # plain numpy vector instead.
    obs, reward, done, info = env.step(np.array([float(a) for a in action]))
    # Refresh the state from the new observation; without this the policy
    # keeps acting on the very first observation for the whole episode.
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    env.render()




## Observing learnt behavior

In [None]:
# Rendered Lift environment used to watch the current `agent` act for one episode.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon = 200,
    reward_shaping=True
)
obs = env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done = False
while not done:
    action, log_prob = agent.select_action(state)
    # select_action returns a list of scalar tensors; hand the simulator a
    # plain numpy vector instead.
    obs, reward, done, info = env.step(np.array([float(a) for a in action]))
    # Refresh the state from the new observation; without this the policy
    # keeps acting on the very first observation for the whole episode.
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    env.render()

# DDPG Model Debug

In [1]:
import sys
sys.path.insert(0,'..')

# Implementation of DDPG algorithm with inspiration from "https://github.com/ghliu/pytorch-ddpg/blob/master/ddpg.py"


import robosuite as suite
import numpy as np
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from models.utils import * # Improve by adding path var


class DDPGActor(nn.Module):
    '''Actor model: maps a state to an action with every component in [-1, 1].'''

    def __init__(self, state_dim, action_dim, hidden_size, init_w=3e-3):
        super().__init__()

        # Two hidden layers, each followed by layer normalisation, then the
        # action head.
        self.l1 = nn.Linear(state_dim, hidden_size)
        self.bn1 = nn.LayerNorm(hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.LayerNorm(hidden_size)
        self.l3 = nn.Linear(hidden_size, action_dim)
        self.init_weights(init_w)

        # The module remembers its own device and moves itself there.
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.to(self.device)

    def init_weights(self, init_w):
        '''Re-initialise the weights: fanin_init for hidden layers, U(-init_w, init_w) for the head.'''
        for hidden_layer in (self.l1, self.l2):
            hidden_layer.weight.data = fanin_init(hidden_layer.weight.data.size())
        self.l3.weight.data.uniform_(-init_w, init_w)

    def forward(self, x):
        '''Return tanh-squashed actions for the state tensor `x`.'''
        hidden = self.bn1(self.l1(x))
        hidden = F.relu(hidden)
        hidden = self.bn2(self.l2(hidden))
        hidden = F.relu(hidden)
        return torch.tanh(self.l3(hidden))


class DDPGCritic(nn.Module):
    '''Critic model: scores a (state, action) pair with a single Q-value.'''

    def __init__(self, state_dim, action_dim, hidden_size, init_w=3e-3):
        super().__init__()

        # The state is embedded first; the action joins at the second layer.
        self.l1 = nn.Linear(state_dim, hidden_size)
        self.bn1 = nn.LayerNorm(hidden_size)
        self.l2 = nn.Linear(hidden_size+action_dim, hidden_size)
        self.bn2 = nn.LayerNorm(hidden_size)
        self.l3 = nn.Linear(hidden_size, 1)
        self.init_weights(init_w)

        # The module remembers its own device and moves itself there.
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.to(self.device)

    def init_weights(self, init_w):
        '''Re-initialise the weights: fanin_init for hidden layers, U(-init_w, init_w) for the head.'''
        for hidden_layer in (self.l1, self.l2):
            hidden_layer.weight.data = fanin_init(hidden_layer.weight.data.size())
        self.l3.weight.data.uniform_(-init_w, init_w)

    def forward(self, xs):
        '''
        Return Q-values for `xs`, a (state_batch, action_batch) pair.

        The action batch is concatenated to the state features along dim 1,
        so both inputs need a leading batch dimension.
        '''
        state, action = xs
        state_features = F.relu(self.bn1(self.l1(state)))
        joint = torch.cat([state_features, action], 1)
        joint = F.relu(self.bn2(self.l2(joint)))
        return self.l3(joint)


class DDPG:
    '''This class represents our implementation of DDPG

    It bundles an actor/critic pair with their target copies, a replay
    buffer, an exploration-noise process and the update step that trains
    both networks from sampled transitions.
    '''

    def __init__(self, state_dim, action_dim, args):
        '''Build networks, optimisers, replay memory and noise from `args`.

        `args` must provide: hidden_size, lr_actor, lr_critic, max_mem_size,
        tau, batch_size, lr, gamma and epsilon.
        '''

        self.state_dim = state_dim
        self.action_dim = action_dim

        # Online actor, its target copy, and the optimiser for the online one.
        self.actor = DDPGActor(state_dim, action_dim, args.hidden_size)
        self.actor_target = DDPGActor(state_dim, action_dim, args.hidden_size)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=args.lr_actor)

        # Online critic, its target copy, and the optimiser for the online one.
        self.critic = DDPGCritic(state_dim, action_dim, args.hidden_size)
        self.critic_target = DDPGCritic(state_dim, action_dim, args.hidden_size)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=args.lr_critic)
        self.criterion = nn.MSELoss()

        # presumably copies the online weights into the targets so both start
        # identical - confirm against models.utils.hard_update
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.max_mem_size = args.max_mem_size
        self.memory = ReplayBuffer(args.max_mem_size)

        # Exploration noise; invoked as a callable in select_action.
        self.random_process = OUActionNoise(mu=np.zeros(action_dim))

        self.tau = args.tau
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.lr_actor = args.lr_actor
        self.lr_critic = args.lr_critic
        self.gamma = args.gamma
        # Noise scale starts at 1 and is reduced by depsilon on every
        # select_action call made with decay_epsilon=True.
        self.epsilon = 1.0
        self.depsilon = 1.0 / args.epsilon

        # Most recent state / action; combined with the next observation in
        # observe() to form a transition.
        self.s_t = None
        self.a_t = None
        self.is_training = True


    def observe(self, r_t, s_t1, done):
        '''Store (s_t, a_t, r_t, s_t1, done) and advance s_t to s_t1.

        Does nothing when is_training is False. The caller is responsible for
        setting self.s_t at the start of an episode.
        '''
        if self.is_training:
            self.memory.store_transition(self.s_t, self.a_t, r_t, s_t1, done)
            self.s_t = s_t1


    def select_action(self, state, decay_epsilon=True):
        '''Return the actor's action for `state` plus scaled exploration noise.

        The result is a numpy array clipped to [-1, 1]; it is also remembered
        in self.a_t for the next observe() call.
        '''
        self.actor.eval()
        action = self.actor(to_tensor(state).to(self.actor.device)).to('cpu').detach().numpy()
        # Noise is only added while training and is scaled by the
        # (non-negative) current epsilon.
        action += self.is_training*max(self.epsilon, 0)*self.random_process()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        # The actor is always left in train mode on return.
        self.actor.train()
        self.a_t = action
        return action

    def random_action(self):
        '''Return a uniform random action in [-1, 1) and remember it in a_t.'''
        action = np.random.uniform(-1.,1.,self.action_dim)
        self.a_t = action
        return action

    def update_parameters(self):
        '''Run one critic update, one actor update and a soft target update.'''
        # Sample batch from replay buffer
        state_batch, action_batch, reward_batch, \
        next_state_batch, done_batch = self.memory.sample(self.batch_size)
        
        # Uses the module-level `device`, not self.actor.device.
        state_batch = to_tensor(state_batch).to(device)
        action_batch = to_tensor(action_batch).to(device)
        reward_batch = to_tensor(reward_batch).to(device)
        next_state_batch = to_tensor(next_state_batch).to(device)
        done_batch = to_tensor(done_batch).to(device)

        # Calculate next q-values
        with torch.no_grad():
            q_next = self.critic_target([next_state_batch, \
                         self.actor_target(next_state_batch)])

            # NOTE(review): done_batch is loaded above but never used, so
            # terminal transitions still bootstrap from q_next.
            # NOTE(review): the critic returns one column per sample; if
            # reward_batch is 1-D this sum would broadcast to a square
            # matrix - confirm the shapes returned by ReplayBuffer.sample.
            target_q_batch = reward_batch + \
                self.gamma*q_next   # Need to add details for terminal case

        # Critic update
        # zero_grad also clears critic gradients left behind by the previous
        # actor update, which back-propagates through the critic.
        self.critic.zero_grad()
        self.critic.train()

        q_batch = self.critic([state_batch, action_batch])
        critic_loss = self.criterion(q_batch, target_q_batch)
        critic_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.critic.eval()
        self.actor.zero_grad()
        self.actor.train()

        # The actor is trained to maximise the critic's value of its own
        # actions, hence the negated mean below.
        actor_loss = self.critic([
            state_batch,
            self.actor(state_batch)
        ])

        actor_loss = -actor_loss.mean()
        actor_loss.backward()
        self.actor_optim.step()

        # Target update
        # presumably Polyak averaging with rate tau - confirm against
        # models.utils.soft_update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        
        
  


In [2]:
class Args:
    '''Hyper-parameter container read by DDPG and by the training cells.'''

    # Environment
    env_name = 'Lift'
    robot = 'Panda'
    horizon = 10
    seed = 1

    # Networks and optimisers
    hidden_size = 256
    lr = 1e-3
    lr_actor = 1e-4
    lr_critic = 1e-3

    # DDPG
    gamma = 0.99
    tau = 1e-3
    batch_size = 64
    max_mem_size = 2000

    # Exploration: DDPG lowers its noise scale by 1 / epsilon per decayed
    # action; actions are drawn at random while iteration <= warmup.
    epsilon = 10000
    warmup = 100
    theta = 0

    # Training schedule
    num_epochs = 100
    num_episodes = 50


args = Args()

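Observation: the action values simply blow up during training - in the printed output below they progressively saturate at the ±1 clipping bounds.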

In [None]:
# Headless environment configured from `args`.
env = suite.make(
    args.env_name,
    robots=args.robot,
    horizon=args.horizon,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0] + obs['object-state'].shape[0]
agent = DDPG(state_dim, env.action_dim, args)

# Global step counter shared by all epochs and episodes.
iteration = 0
for epoch in range(args.num_epochs):
    rewards = []
    for episode in range(args.num_episodes):
        obs = env.reset()
        state = np.append(obs['robot0_robot-state'], obs['object-state'])
        agent.s_t = state
        done = False
        while not done:
            # Uniform random actions during warm-up, the noisy policy afterwards.
            action = (
                agent.random_action()
                if iteration <= args.warmup
                else agent.select_action(state)
            )
            iteration += 1
            print(action)

            obs, reward, done, info = env.step(action)
            rewards.append(reward)
            state = np.append(obs['robot0_robot-state'], obs['object-state'])
            agent.observe(reward, state, done)

            # Learning starts once the warm-up steps have been collected.
            if iteration > args.warmup:
                agent.update_parameters()
           
            

[ 0.79274451 -0.1339642  -0.82032381 -0.86223673  0.64637173 -0.52930065
 -0.1762301   0.7818103 ]
[ 0.54584737 -0.17998126 -0.042627   -0.37850194 -0.87981382 -0.66988433
 -0.58296427 -0.06795617]
[-0.56569546  0.9902607   0.98496302 -0.36479008 -0.98674839  0.36488254
 -0.95778707  0.02147059]
[ 0.37555715 -0.10169242 -0.8936964   0.5202028  -0.73331425  0.01449888
  0.3537177   0.78391013]
[ 0.67980255 -0.40978382 -0.13124963 -0.49884715  0.60389104  0.34800909
  0.90458172  0.1832112 ]
[-0.82776978 -0.83950447 -0.66775177 -0.63873003 -0.3951228  -0.43455582
  0.72723383 -0.91387133]
[ 0.72317572  0.34026877  0.75079596 -0.99365345  0.3274975  -0.47350753
  0.37184066 -0.67042039]
[ 0.8748403   0.34146899 -0.13251559  0.70649417  0.48176692 -0.27412024
  0.31845854  0.81698238]
[ 0.2543309  -0.78713307  0.29400512  0.53076882 -0.60055211 -0.15711748
  0.26249359 -0.32038347]
[ 0.63885953  0.04447614  0.66891329 -0.63550046  0.49246177 -0.93719099
 -0.03800094 -0.04975371]
[ 0.544648

[-0.24630027  0.34819765 -0.38554293 -0.2184269  -0.72148714 -0.11359478
 -0.42104355 -0.73229977]
[ 0.07527548  0.79057929  0.20874473  0.56087252  0.07530117  0.5842371
  0.64275693 -0.66881458]
[-0.08486256 -0.35658386  0.73050476  0.31664986 -0.38585039 -0.4822677
 -0.87356759 -0.03474356]
[ 0.28115284 -0.17707246 -0.76162395 -0.04865095 -0.27473624  0.48220052
  0.53051175  0.52623803]
[ 0.25192376 -0.6152528   0.52147246 -0.06867694 -0.57809351  0.1355392
 -0.73489266  0.24421054]
[ 0.38801659 -0.53042039  0.53938825 -0.49658172  0.80118953 -0.45115678
  0.10104521  0.4977358 ]
[-0.39447584 -0.04478277 -0.63702906 -0.11563402  0.3436287   0.65993158
 -0.5833075   0.12311078]
[ 0.81958159  0.45299441 -0.25068751 -0.51924113  0.2563882  -0.20717656
  0.88195496 -0.96691589]
[-0.37142068 -0.20599156 -0.85261601  0.46856294  0.53577549  0.46274719
  0.85320987  0.78045504]
[-0.64072409 -0.74127972  0.0616367  -0.5029821  -0.99078806  0.20701105
  0.74530812  0.67021392]
[-0.71951335 

[-0.41611055 -0.62512046 -0.20398518 -0.17091243 -0.74818975 -0.6054693
 -0.27020884 -0.3902948 ]
[-0.42820767 -0.64096653 -0.21698682 -0.15681861 -0.7736606  -0.6090447
 -0.29979727 -0.3565481 ]
[-0.43364844 -0.647301   -0.2539696  -0.17363371 -0.79658204 -0.61928475
 -0.31432888 -0.37296677]
[-0.4282382  -0.63661546 -0.27300304 -0.19941899 -0.841033   -0.6374639
 -0.32350487 -0.38525808]
[-0.43060723 -0.6636351  -0.28644183 -0.2269944  -0.8696858  -0.64711684
 -0.32916567 -0.39370546]
[-0.44253483 -0.6674362  -0.28981346 -0.25197488 -0.8659237  -0.6510661
 -0.32835013 -0.42789274]
[-0.41983828 -0.66933864 -0.34235466 -0.27587974 -0.8893888  -0.667859
 -0.30375394 -0.43425983]
[-0.43292943 -0.6688597  -0.32653776 -0.28469163 -0.882513   -0.6763131
 -0.29339683 -0.41103792]
[-0.4133281  -0.6620661  -0.35326064 -0.28118345 -0.9061655  -0.65705633
 -0.29493597 -0.41919482]
[-0.40950838 -0.65228444 -0.3884704  -0.2494379  -0.9229177  -0.68830365
 -0.27918658 -0.42509884]
[-0.42571673 -0.6

[-0.15438451 -1.         -0.6612471  -0.7887741  -1.         -0.9415387
 -0.57988745 -0.59939796]
[-0.15496454 -1.         -0.65077394 -0.7960677  -1.         -0.9570827
 -0.5901362  -0.6013109 ]
[-0.14317733 -1.         -0.651662   -0.8105479  -1.         -0.9604266
 -0.58794546 -0.6116403 ]
[-0.1477525  -1.         -0.6454251  -0.8162868  -1.         -0.941431
 -0.5884484  -0.60905725]
[-0.11510857 -1.         -0.65690446 -0.7946775  -1.         -0.9569739
 -0.577433   -0.62926435]
[-0.12317671 -1.         -0.669499   -0.7856162  -1.         -0.94758433
 -0.6034972  -0.6090178 ]
[-0.14204632 -1.         -0.68182194 -0.8040498  -1.         -0.9395581
 -0.59615904 -0.60572857]
[-0.1458461  -1.         -0.6900256  -0.80988    -1.         -0.9340836
 -0.5870883  -0.60877115]
[-0.13359727 -1.         -0.70193714 -0.7948737  -1.         -0.94654423
 -0.5444154  -0.61485213]
[-0.10012567 -1.         -0.6901031  -0.8112283  -1.         -0.9352877
 -0.5470612  -0.651984  ]
[-0.07124655 -1.   

[ 0.47598028 -1.         -0.49433672 -0.9608269  -1.         -1.
 -0.62585866 -0.52090913]
[ 0.47252297 -1.         -0.47074488 -0.95841664 -1.         -1.
 -0.6022192  -0.4831455 ]
[ 0.44557965 -1.         -0.45184514 -0.9508034  -1.         -1.
 -0.5764861  -0.42952988]
[ 0.4264453  -1.         -0.40218633 -0.9396445  -1.         -1.
 -0.5618907  -0.37273917]
[ 0.61655235 -1.         -0.47000834 -0.9437007  -1.         -1.
 -0.67472273 -0.55890906]
[ 0.62487876 -1.         -0.46981457 -0.93069583 -1.         -1.
 -0.6860064  -0.5125043 ]
[ 0.6215809  -1.         -0.44834217 -0.92793995 -1.         -1.
 -0.6913322  -0.43670395]
[ 0.6367231  -1.         -0.47146302 -0.9261856  -1.         -1.
 -0.6980988  -0.36646008]
[ 0.62163115 -1.         -0.45415795 -0.9258217  -1.         -1.
 -0.6866725  -0.3045037 ]
[ 0.61598337 -1.         -0.457404   -0.89542514 -1.         -1.
 -0.69092095 -0.237996  ]
[ 0.5653298  -1.         -0.45028207 -0.8986057  -1.         -1.
 -0.68463755 -0.19387054]

[-0.7295717  -0.62522066 -0.04451133 -1.         -1.          0.9431476
  0.03725279  0.7395071 ]
[-0.7819274  -0.6258632  -0.02876863 -1.         -1.          0.95848095
  0.09427888  0.7442752 ]
[-0.7772972  -0.6159996  -0.02471343 -1.         -1.          0.95558006
  0.1195512   0.72983307]
[-0.779375   -0.63742477 -0.01212161 -1.         -1.          0.96138644
  0.1710062   0.7177264 ]
[-0.09220295 -0.76179695 -0.36676374 -0.9837777  -1.          0.15912487
 -0.44466826 -0.18204522]
[-0.2730956  -0.70817935 -0.2779895  -0.98984283 -1.          0.43462732
 -0.38068774  0.0988759 ]
[-0.50420547 -0.6798538  -0.18170927 -0.97725165 -1.          0.6915217
 -0.2643046   0.37940285]
[-0.5813085  -0.63086635 -0.12785363 -0.95682585 -1.          0.8174647
 -0.1775224   0.50208247]
[-0.6026151  -0.6247624  -0.15510814 -0.93560034 -1.          0.82841754
 -0.16119704  0.5618293 ]
[-0.6333298  -0.56857324 -0.1332431  -0.92615026 -1.          0.85510933
 -0.12598972  0.5914153 ]
[-0.6849075  

[-1.         -1.          0.14416112 -0.92116076 -1.          0.731316
  0.49152777 -1.        ]
[-1.         -0.99706656  0.15871713 -0.90010506 -1.          0.7134494
  0.48961997 -1.        ]
[-1.         -0.9843984   0.21050903 -0.90034413 -1.          0.80127555
  0.641534   -1.        ]
[-1.         -0.96799475  0.25111502 -0.8896864  -1.          0.8301893
  0.69350165 -1.        ]
[-1.        -0.9291609  0.3200764 -0.9095476 -1.         0.923593
  0.7655783 -1.       ]
[-1.         -0.90903264  0.32134977 -0.9022797  -1.          0.9172871
  0.69124573 -1.        ]
[-1.         -0.86749905  0.3881171  -0.92855185 -1.          0.9402064
  0.73547167 -1.        ]
[-1.         -0.31178406  0.16314827 -0.96566844 -1.          0.9704526
  0.3106887  -0.22131935]
[-1.         -0.5854125   0.28850186 -0.9708959  -1.          0.9580272
  0.6438838  -0.46461564]
[-1.         -0.43830135  0.1950218  -0.9544263  -1.          0.92628103
  0.23579842 -0.45135948]
[-1.         -1.          0

[-1.         -0.9683111  -0.01352789 -1.         -1.          0.88167787
  0.83439666 -0.896921  ]
[-1.         -0.96478915  0.13995136 -1.         -1.          0.87509656
  0.79820985 -0.92840654]
[-1.         -0.9183705   0.0918705  -1.         -1.          0.86189
  0.84655434 -0.9173674 ]
[-1.         -0.9212688   0.13772999 -1.         -1.          0.83556134
  0.8290119  -0.90694755]
[-1.         -0.7586141  -0.13440958 -1.         -1.          0.8389971
  0.78070474 -0.8675897 ]
[-1.         -1.         -0.2171727   0.09830219 -1.          0.27367225
  0.27335325 -0.90814275]
[-1.         -1.         -0.36152142 -0.1553922  -1.          0.23107283
  0.3121945  -0.9181286 ]
[-1.         -0.9786672  -0.6429517  -0.98550576 -1.          0.79009706
  0.84521765 -0.8500655 ]
[-1.         -0.9540315  -0.59335667 -1.         -1.          0.8517812
  1.         -0.7902523 ]
[-1.         -0.94232404 -0.55356014 -1.         -1.          0.88035077
  1.         -0.8291291 ]
[-1.         -0

[-1.         -1.          0.08318478 -0.5000021  -1.         -0.73369396
 -0.364948   -0.84911305]
[-1.         -1.          0.05980805 -0.8314371  -1.         -0.7819469
 -0.28183666 -0.8682612 ]
[-1.         -1.         -0.10367172 -1.         -1.         -0.24096532
  0.12021274 -0.82714677]
[-1.         -1.         -0.19821593 -1.         -1.          0.75631964
  1.         -0.82688385]
[-1.         -1.         -0.01578175 -1.         -1.          0.93703145
  1.         -0.86205196]
[-1.         -1.          0.14148752 -1.         -1.          0.9623795
  1.         -0.83267677]
[-1.         -1.          0.2752689  -1.         -1.          0.9552745
  1.         -0.83284605]
[-1.         -1.          0.34262884 -1.         -1.          0.94642454
  1.         -0.84499645]
[-1.         -1.          0.40421015 -1.         -1.          0.9601358
  1.         -0.86102796]
[-1.         -1.          0.4436888  -1.         -1.          0.9615483
  1.         -0.85272247]
[-1.         -1

  0.80072    -0.82304096]
[-1.         -0.32567602  0.7749975  -1.         -0.20152362  0.7160072
  0.718715   -0.7883306 ]
[-0.03672542  0.46249107 -0.41094273 -1.         -1.          0.7076371
 -0.7267133   0.06945676]
[-0.6735338   0.37781367 -0.07394268 -1.         -1.          0.71050036
 -0.03501965 -0.45439345]
[-0.77432775  0.19489583  0.3466938  -1.         -0.77801156  0.7067233
  0.12102726 -0.65781873]
[-0.66981095 -1.         -0.53373593 -1.         -1.          0.5859985
 -0.842883   -0.54803157]
[-0.83626235 -1.         -0.33921075 -1.         -1.          0.5850692
 -0.81315905 -0.6955911 ]
[-1.         -0.9602145   0.03051498 -1.         -1.          0.69794905
 -0.303775   -0.7581548 ]
[-1.         -0.72791195  0.34349844 -1.         -0.97974694  0.7314678
  0.76046896 -0.7953218 ]
[-1.         -0.55760586  0.5801607  -1.         -0.43051833  0.7200973
  0.9278503  -0.79935944]
[-1.         -0.19307087  0.38873404 -1.         -0.7414046   0.7364876
  0.700992   -0.82

[-1.         -1.         -0.3270572  -1.         -0.9937409   0.7339445
  1.         -0.86469805]
[-1.         -1.          0.0225572  -1.         -0.5401426   0.74576265
  1.         -0.8653826 ]
[-1.         -1.          0.21701357 -1.         -0.11433193  0.7402106
  1.         -0.8366422 ]
[-1.         -1.          0.47046638 -1.          0.14980744  0.7370283
  1.         -0.83977854]
[-1.         -0.7232541   0.63147366 -1.          0.39752582  0.7392806
  1.         -0.84605753]
[-1.         -0.51184595  0.72911024 -1.          0.526541    0.74749213
  1.         -0.8376597 ]
[ 0.63853925  0.56084436 -0.69028014 -1.         -1.          0.7361463
 -0.8244796   1.        ]
[ 0.6501056   0.51203084 -0.89609134 -1.         -1.          0.7372557
 -0.8094463   1.        ]
[ 0.49089018  0.48178366 -0.97170657 -1.         -1.          0.7458169
 -0.7792812   1.        ]
[-0.06288063  0.38133293 -1.         -1.         -1.          0.7374412
 -0.62703407  1.        ]
[-0.89089334 -1.  

In [None]:
# Probe the actor with a random state and show the effect of one draw of
# exploration noise. The probe is sized from the agent (not a hard-coded 42)
# and placed on the actor's device so the cell also runs on a GPU.
probe_state = torch.randn(agent.state_dim).to(agent.actor.device)
out = agent.actor(probe_state).detach().cpu().numpy()
print(out)
# Draw noise the same way DDPG.select_action does (the process is called directly).
out += agent.random_process()
print(out)

In [None]:
# Roll a saved actor out for one 400-step episode and record the transitions
# in the agent's replay memory.
env = suite.make(
        env_name=args.env_name,
        robots=args.robot,
        has_renderer=False,
        has_offscreen_renderer=False,
        use_camera_obs=False,
        use_object_obs=True,
        horizon = 400,
        reward_shaping=True
    )
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]
agent = DDPG(state_dim, env.action_dim, args)
# NOTE: torch.load unpickles a whole module here - only load trusted files.
agent.actor = torch.load('/Users/peterfagan/Desktop/DDPG_1.pt')
# NOTE(review): DDPG.select_action switches the actor back to train mode on
# every call, so this eval() only holds until the first action is selected.
agent.actor.eval()
done = False

# Seed the agent with the initial state so the first stored transition does
# not start from s_t = None.
state = np.append(obs['robot0_robot-state'],obs['object-state'])
agent.s_t = state

while not done:
    #print(state)
    action = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    # observe() expects the state reached AFTER the step; passing the
    # pre-step state would shift every stored transition by one step.
    state = np.append(obs['robot0_robot-state'],obs['object-state'])
    agent.observe(reward, state, done)
    #env.render()

In [None]:
# Display the observation dict returned by the most recent env.step / env.reset.
obs

In [None]:
# Draw a single stored transition from the replay memory so the critic update
# can be stepped through by hand.
state, action, reward, state_, done = agent.memory.sample(1)

In [None]:
# Bootstrap value for the sampled transition: the target critic evaluated at
# the target actor's action for the next state. Nothing is recorded under
# no_grad, so the trailing .detach() is redundant.
with torch.no_grad():
    q_next = agent.critic_target([to_tensor(state_), \
                         agent.actor_target(to_tensor(state_)).detach()])

In [None]:
# Convert the sampled done flag with to_tensor so it can be used in the target below.
done = to_tensor(done) 

In [None]:
# One-step TD target for the sampled transition.
# NOTE(review): q_next is multiplied by `done` itself. If the buffer stores
# done=1 for terminal transitions this zeroes the bootstrap on every
# non-terminal step; a (1 - done) mask may be intended - confirm what
# ReplayBuffer.sample returns.
target_q = to_tensor(reward) + agent.gamma*q_next*done
target_q

In [None]:
# Clear stale critic gradients, then evaluate the online critic on the sampled
# (state, action) pair.
agent.critic.zero_grad()
q = agent.critic([to_tensor(state),to_tensor(action)])

In [None]:
# Critic loss: agent.criterion (nn.MSELoss) between the prediction and the TD target.
loss = agent.criterion(q, target_q)

In [None]:
# Back-propagate the critic loss.
loss.backward()

In [None]:
# Apply one optimiser step to the online critic.
agent.critic_optim.step()

In [None]:
# Display the online critic. `self` does not exist at notebook top level (it
# would raise NameError), so the network is reached through the agent.
agent.critic

In [None]:
# Headless environment configured from `args`.
env = suite.make(
    args.env_name,
    robots=args.robot,
    horizon=args.horizon,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0] + obs['object-state'].shape[0]

agent = DDPG(state_dim, env.action_dim, args)
# Global step counter shared by all episodes.
iteration = 0
for episode in range(args.num_episodes):
    obs = env.reset()
    state = np.append(obs['robot0_robot-state'], obs['object-state'])
    agent.s_t = state
    done = False
    while not done:
        # Uniform random actions during warm-up, the noisy policy afterwards.
        action = (
            agent.random_action()
            if iteration <= args.warmup
            else agent.select_action(state)
        )
        iteration += 1

        obs, reward, done, info = env.step(action)
        state = np.append(obs['robot0_robot-state'], obs['object-state'])
        agent.observe(reward, state, done)

        # Learning starts once the warm-up steps have been collected.
        if iteration > args.warmup:
            agent.update_parameters()