# Exploration

In [None]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [None]:
# Episode-termination flag for the exploration cell; no step has been taken yet.
done = False

# "Lift" task with a Panda robot. Both renderers and camera observations are
# switched off, object observations and reward shaping are switched on, and
# the horizon is set to 200.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    horizon=200,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)

# Initial observation returned by the environment reset.
obs = env.reset()


# REINFORCE Model Debug

In [None]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One-element float tensor holding pi, stored on `device`.  Because it has
# shape (1,), it broadcasts against the operands inside `normal`.
pi = Variable(torch.FloatTensor([math.pi])).to(device)


def normal(x, mu, sigma_sq):
    '''
    Evaluate the Gaussian density with mean `mu` and variance `sigma_sq` at `x`.

    `x` is passed through the legacy `Variable` wrapper before use.  The
    result is exp(-(x - mu)^2 / (2 * sigma_sq)) / sqrt(2 * pi * sigma_sq),
    computed element-wise with the module-level `pi` tensor.
    '''
    deviation = Variable(x) - mu
    exponent = -1 * deviation.pow(2) / (2 * sigma_sq)
    inverse_normaliser = 1 / (2 * sigma_sq * pi).sqrt()
    return exponent.exp() * inverse_normaliser
    

class REINFORCEPolicy(nn.Module):
    '''
    Policy parameterization used by REINFORCE.

    Two hidden layers (hidden_size, then hidden_size // 2), each followed by
    dropout (p=0.5) and a ReLU, feed two separate linear heads of width
    action_dim.  `forward` returns both head outputs as `(mu, sigma_sq)`;
    the second head is a raw, unconstrained value.
    '''
    def __init__(self, state_dim,action_dim,hidden_size):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_size = hidden_size

        narrow = self.hidden_size // 2

        # Layers are created in the same order as their attribute names so
        # that parameter initialisation and state_dict keys stay unchanged.
        self.l1 = nn.Linear(self.state_dim, self.hidden_size)
        self.l2 = nn.Linear(self.hidden_size, narrow)
        self.l3 = nn.Linear(narrow, self.action_dim)    # mean head
        self.l3_ = nn.Linear(narrow, self.action_dim)   # variance head
        self.d1 = nn.Dropout(0.5)
        self.d2 = nn.Dropout(0.5)

    def forward(self,x):
        hidden = x
        # Each stage is linear -> dropout -> ReLU.
        for linear, dropout in ((self.l1, self.d1), (self.l2, self.d2)):
            hidden = F.relu(dropout(linear(hidden)))
        return self.l3(hidden), self.l3_(hidden)
    
    
class REINFORCE:
    '''
    This class encapsulates functionality required to run the REINFORCE algorithm.

    It owns the policy network (`self.model`), its Adam optimizer and the
    hyper-parameters used by the two update rules.

    Args:
        state_dim: size of the state vector fed to the policy.
        action_dim: number of action components sampled per step.
        gamma: discount factor applied to future rewards.
        lr: learning rate handed to Adam.
        episodes: maximum number of episodes averaged by
            `epoch_update_parameters`.
        horizon: maximum number of time steps consumed per episode.
        hidden_size: width of the policy's first hidden layer.
    '''
    def __init__(self, state_dim,action_dim, gamma, lr, episodes, horizon, hidden_size):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_size = hidden_size
        self.lr = lr
        self.model = REINFORCEPolicy(state_dim, action_dim,hidden_size)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = self.lr)
        self.model.train()

        self.gamma = gamma
        self.episodes = episodes
        self.horizon = horizon


    def select_action(self, state):
        '''
        Sample one action component at a time from the policy's Gaussians.

        Args:
            state: tensor holding the current state.

        Returns:
            (actions, log_probs): two lists of length `action_dim`.  Each
            action is detached from the graph; each log-probability stays
            attached so it can be differentiated in the update step.
        '''
        actions = []
        log_probs = []
        mu, sigma_sq = self.model(state.detach().to(device))
        for i in range(self.action_dim):
            mu_ = mu[i]
            # softplus ensures that the variance estimate is always positive
            sigma_sq_ = F.softplus(sigma_sq[i])

            # Re-parameterised draw: mean + std * unit Gaussian noise.
            eps = torch.randn(mu_.size()).to(device)
            action = (mu_ + sigma_sq_.sqrt() * eps).detach()
            log_prob = normal(action, mu_, sigma_sq_).log()
            actions.append(action)
            log_probs.append(log_prob)

        return actions, log_probs


    def _episode_loss(self, rewards, log_probs):
        '''
        Return the REINFORCE loss of one episode, or None if it has no steps.

        The loss is -sum_t sum_j log_prob[t][j] * G_t, where G_t is the
        discounted reward-to-go.  Only the steps actually present are used
        (capped at `self.horizon`), so an episode shorter than the configured
        horizon no longer raises an IndexError.
        '''
        steps = min(self.horizon, len(rewards), len(log_probs))
        reward_to_go = 0.0
        loss = None
        # Walk backwards so the return can be accumulated incrementally.
        for t in reversed(range(steps)):
            reward_to_go = self.gamma * reward_to_go + float(rewards[t])
            for j in range(self.action_dim):
                term = log_probs[t][j].sum() * reward_to_go
                loss = -term if loss is None else loss - term
        return loss


    def _apply_gradient_step(self, loss):
        '''Run one optimizer step on `loss`.'''
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


    def episode_update_parameters(self, rewards, log_probs):
        '''
        Update the policy from a single episode.

        Args:
            rewards: sequence whose first element holds the per-step rewards.
            log_probs: sequence whose first element holds, per step, the list
                of per-component log-probabilities from `select_action`.

        Nothing is updated when the episode contains no steps.
        '''
        loss = self._episode_loss(rewards[0], log_probs[0])
        if loss is None:
            return
        self._apply_gradient_step(loss / len(rewards))


    def epoch_update_parameters(self, rewards, log_probs):
        '''
        Update the policy from the mean loss over a batch of episodes.

        Args:
            rewards: per-episode sequences of per-step rewards.
            log_probs: per-episode sequences of per-step log-probability lists.

        At most `self.episodes` episodes are consumed; if fewer were
        collected, the mean is taken over those that are present.  Nothing is
        updated when no episode contributes a step.
        '''
        n_episodes = min(self.episodes, len(rewards), len(log_probs))
        losses = []
        for episode in range(n_episodes):
            episode_loss = self._episode_loss(rewards[episode], log_probs[episode])
            if episode_loss is not None:
                losses.append(episode_loss)
        if not losses:
            return
        self._apply_gradient_step(sum(losses) / n_episodes)

In [None]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon = 150,
    reward_shaping=True
)
obs = env.reset()
# The policy input is the robot state concatenated with the object state.
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim,0.9,0.001,100,200,256)
# map_location lets a checkpoint that was saved on a GPU machine be loaded
# when only a CPU is available.
agent.model.load_state_dict(
    torch.load('/Users/peterfagan/Downloads/REINFORCE_3.pkl', map_location=device)
)

obs=env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done=False
while not done:
    action, log_prob = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    # Rebuild the state from the new observation; otherwise the policy keeps
    # acting on the initial state for the whole episode.
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    env.render()




## Observing learnt behavior

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon = 200,
    reward_shaping=True
)
obs=env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done=False
# Per-step records of the rollout, kept for later inspection.
log_probs = []
rewards = []
while not done:
    action, log_prob = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    # Rebuild the state from the new observation; otherwise every action is
    # sampled from the policy's response to the initial state.
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    log_probs.append(log_prob)
    rewards.append(reward)
    env.render()

# DDPG Model Debug

In [1]:
import sys
sys.path.insert(0,'..')

# Implementation of DDPG algorithm with inspiration from
# "https://github.com/ghliu/pytorch-ddpg/blob/master/ddpg.py"

import robosuite as suite
import numpy as np
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from models.utils import * # Improve by adding path var


class DDPGActor(nn.Module):
    '''Actor network: maps a state to an action through two ReLU hidden layers.

    The output layer is linear; no squashing is applied inside the network.
    '''

    def __init__(self, state_dim, action_dim, hidden_size):
        super().__init__()
        # Attribute names are kept so state_dict keys stay l1/l2/l3.
        self.l1 = nn.Linear(state_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, action_dim)

    def forward(self, x):
        hidden = x
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.l3(hidden)


class DDPGCritic(nn.Module):
    '''Critic network: scores a (state, action) pair with a single value.

    The state and action are concatenated along dimension 1, so both inputs
    must be 2-D with matching first dimension.
    '''

    def __init__(self, state_dim, action_dim, hidden_size):
        super().__init__()
        joint_dim = state_dim + action_dim
        self.l1 = nn.Linear(joint_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, 1)

    def forward(self, xs):
        # `xs` is a two-element sequence: (state batch, action batch).
        state, action = xs
        hidden = torch.cat([state, action], 1)
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.l3(hidden)


class DDPG:
    '''This class represents our implementation of DDPG.

    It holds the actor/critic networks, their target copies, the optimizers,
    a replay buffer and the exploration-noise process.  The helpers
    `hard_update`, `soft_update`, `to_tensor`, `ReplayBuffer` and
    `OrnsteinUhlenbeckProcess` are not defined in this file; presumably they
    come from the `models.utils` star import.

    Args:
        state_dim: size of the state vector.
        action_dim: number of action components.
        args: object exposing `hidden_size`, `lr`, `max_mem_size`, `theta`,
            `tau`, `batch_size`, `gamma` and `epsilon`.
    '''

    def __init__(self, state_dim, action_dim, args):
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.actor = DDPGActor(state_dim, action_dim, args.hidden_size)
        self.actor = self.actor.to(device)
        self.actor_target = DDPGActor(state_dim, action_dim, args.hidden_size)
        self.actor_target = self.actor_target.to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=args.lr)

        self.critic = DDPGCritic(state_dim, action_dim, args.hidden_size)
        self.critic = self.critic.to(device)
        self.critic_target = DDPGCritic(
            state_dim, action_dim, args.hidden_size)
        self.critic_target = self.critic_target.to(device)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=args.lr)
        self.criterion = nn.MSELoss()

        # Start the target networks as exact copies of the online networks.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.max_mem_size = args.max_mem_size
        self.memory = ReplayBuffer(args.max_mem_size, state_dim, action_dim)

        self.random_process = OrnsteinUhlenbeckProcess(args.theta)

        self.tau = args.tau
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.gamma = args.gamma
        # Exploration-noise scale: starts at 1.0 and shrinks by `depsilon`
        # on every decaying call to `select_action`.
        self.epsilon = 1.0
        self.depsilon = 1.0 / args.epsilon

        # Most recent state and action, recorded for `observe`.
        self.s_t = None
        self.a_t = None
        self.is_training = True


    def observe(self, r_t, s_t1, done):
        '''Store the transition (s_t, a_t, r_t, s_t1, done) while training
        and advance the current state to `s_t1`.'''
        if self.is_training:
            self.memory.store_transition(self.s_t, self.a_t, r_t, s_t1, done)
            self.s_t = s_t1


    def select_action(self, state, decay_epsilon=True):
        '''Return the actor's action for `state`, plus exploration noise
        while training, clipped to [-1, 1].

        Args:
            state: current state, in a form accepted by `to_tensor`.
            decay_epsilon: when True, reduce the noise scale by one step.

        Returns:
            numpy array holding the clipped action; also stored in `a_t`.
        '''
        # .cpu() is required before .numpy() when the actor lives on a GPU
        # and is a no-op on CPU.
        action = self.actor(to_tensor(state)).detach().cpu().numpy()
        # Scale the noise by the decayed epsilon; previously epsilon was
        # decremented but never applied, so the noise never shrank.
        noise_scale = max(self.epsilon, 0.0)
        action += self.is_training*noise_scale*self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def random_action(self):
        '''Return a uniformly random action in [-1, 1) and store it in `a_t`.'''
        action = np.random.uniform(-1.,1.,self.action_dim)
        self.a_t = action
        return action

    def update_parameters(self):
        '''Run one DDPG update: critic step, actor step, then soft target update.'''
        # Sample batch from replay buffer
        state_batch, action_batch, reward_batch, \
        next_state_batch, done_batch = self.memory.sample(self.batch_size)

        states = to_tensor(state_batch)
        next_states = to_tensor(next_state_batch)

        # The TD target is treated as a constant: building it under no_grad
        # stops value_loss.backward() from propagating into (and accumulating
        # gradients on) the target networks.
        with torch.no_grad():
            q_next = self.critic_target([next_states,
                                         self.actor_target(next_states)])

            # NOTE(review): the bootstrap term is multiplied by done_batch
            # as-is.  That is only correct if the replay buffer returns a
            # "not done" mask (0 for terminal transitions) — confirm against
            # ReplayBuffer; otherwise this should be (1 - done).
            target_q_batch = to_tensor(reward_batch) + \
                self.gamma*to_tensor(done_batch)*q_next

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([states, to_tensor(action_batch)])
        value_loss = self.criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: maximise the critic's value of the actor's actions.
        self.actor.zero_grad()

        policy_loss = -self.critic([states, self.actor(states)])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        
  


In [2]:
class Args:
    '''Hyper-parameters for the DDPG debug run, grouped by purpose.'''

    # Environment
    env_name = 'Lift'
    robot = 'Panda'
    horizon = 10        # horizon handed to suite.make
    num_episodes = 50   # episodes run by the training loop

    # Networks and optimisation
    hidden_size = 256
    lr = 0.001
    gamma = 0.99        # discount factor
    tau = 0.001         # rate passed to soft_update

    # Replay buffer
    max_mem_size = 2000
    batch_size = 5

    # Exploration
    warmup = 10         # iterations of random actions before using the actor
    epsilon = 10000     # DDPG sets depsilon = 1.0 / epsilon
    theta = 0.15        # passed to OrnsteinUhlenbeckProcess


args = Args()

In [3]:
# Build the environment described by `args`, with rendering and camera
# observations switched off.
env = suite.make(
    env_name=args.env_name,
    robots=args.robot,
    horizon=args.horizon,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)
obs = env.reset()
# The agent's state is the robot state followed by the object state.
state_dim = obs['robot0_robot-state'].shape[0] + obs['object-state'].shape[0]
agent = DDPG(state_dim, env.action_dim, args)

# Fill the replay buffer with ten episodes of uniformly random actions.
for episode in range(10):
    obs = env.reset()
    state = np.append(obs['robot0_robot-state'], obs['object-state'])
    agent.s_t = state
    done = False
    while not done:
        action = agent.random_action()
        obs, reward, done, info = env.step(action)
        state = np.append(obs['robot0_robot-state'], obs['object-state'])
        agent.observe(reward, state, done)

In [4]:
# Draw 10 stored transitions from the agent's replay buffer for manual inspection.
state, action, reward, state_, done = agent.memory.sample(10)

In [5]:
# Evaluate the target critic on the sampled next states, using the target
# actor's actions.  no_grad keeps this forward pass out of the autograd graph.
with torch.no_grad():
    q_next = agent.critic_target([to_tensor(state_), \
                         agent.actor_target(to_tensor(state_)).detach()])

In [6]:
q_next

tensor([[-0.0736],
        [-0.0790],
        [-0.0710],
        [-0.0585],
        [-0.0669],
        [-0.0787],
        [-0.0138],
        [-0.0441],
        [-0.0813],
        [-0.0524]])

In [7]:
# One-step TD target: reward plus the discounted target-critic value.
# NOTE(review): unlike DDPG.update_parameters, no done mask is applied here —
# confirm this is intended for the debug run.
target_q = to_tensor(reward) + agent.gamma*q_next 
target_q

tensor([[-0.0664],
        [-0.0699],
        [-0.0668],
        [-0.0532],
        [-0.0643],
        [-0.0726],
        [-0.0128],
        [-0.0425],
        [-0.0730],
        [-0.0439]])

In [8]:
# Clear stale critic gradients, then score the sampled (state, action) pairs
# with the online critic.
agent.critic.zero_grad()
q = agent.critic([to_tensor(state),to_tensor(action)])

In [10]:
# Critic loss: the agent's criterion (nn.MSELoss) between predicted and target Q-values.
loss = agent.criterion(q, target_q)

In [11]:
# Back-propagate the critic loss.
loss.backward()

In [12]:
# Apply one optimizer step to the online critic.
agent.critic_optim.step()

In [None]:
# Display the online critic.  At cell level there is no `self`, so the
# network has to be reached through the `agent` instance.
agent.critic

In [None]:
# Fresh environment and agent for the full DDPG debug run.
env = suite.make(
    env_name=args.env_name,
    robots=args.robot,
    horizon=args.horizon,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0] + obs['object-state'].shape[0]

agent = DDPG(state_dim, env.action_dim, args)

# Global step counter shared across episodes; it decides when warm-up ends.
iteration = 0
for episode in range(args.num_episodes):
    obs = env.reset()
    state = np.append(obs['robot0_robot-state'], obs['object-state'])
    agent.s_t = state
    done = False
    while not done:
        # Random actions during warm-up, the actor's actions afterwards.
        action = (
            agent.random_action()
            if iteration <= args.warmup
            else agent.select_action(state)
        )
        iteration += 1
        obs, reward, done, info = env.step(action)
        state = np.append(obs['robot0_robot-state'], obs['object-state'])
        agent.observe(reward, state, done)
        # Parameter updates start once the warm-up iterations are over.
        if iteration > args.warmup:
            agent.update_parameters()