# Exploration

In [14]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [18]:
# Keyword arguments for the "Lift" task with a single Panda robot.
# Both renderers and camera observations are switched off; object
# observations and reward shaping are switched on.
lift_kwargs = dict(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon=200,
    reward_shaping=True,
)
env = suite.make(**lift_kwargs)

# Reset the environment; as the last expression of the cell, the returned
# observation dictionary is displayed.
env.reset()

OrderedDict([('robot0_joint_pos',
              array([-0.0219546 ,  0.20314689,  0.02159294, -2.60401192,  0.03396985,
                      2.9417042 ,  0.76500556])),
             ('robot0_joint_vel',
              array([-3.84357389e-10, -1.99050302e-01, -1.15345653e-06, -6.67097603e-01,
                     -2.34813793e-03,  1.71713143e+00,  4.25741437e-04])),
             ('robot0_eef_pos',
              array([-9.69841837e-02, -9.16511673e-04,  1.01336326e+00])),
             ('robot0_eef_quat',
              array([ 0.99769611, -0.00651168,  0.06726765, -0.0059277 ])),
             ('robot0_gripper_qpos', array([ 0.020833, -0.020833])),
             ('robot0_gripper_qvel', array([-0.08876024, -0.10134752])),
             ('robot0_robot-state',
              array([-2.19528369e-02,  2.01752500e-01,  2.15912571e-02, -5.12059464e-01,
                      3.39633156e-02,  1.98560011e-01,  6.92541010e-01,  9.99759007e-01,
                      9.79436536e-01,  9.99766882e-01, -8.58

In [19]:
# Step the environment 200 times with random actions and store, per step,
# the robot state followed by the object state as one flat array.
hist = {}
for i in range(200):
    # Sample a random action with one entry per robot degree of freedom.
    action = np.random.randn(env.robots[0].dof)
    # Take the action in the environment.
    obs, reward, done, info = env.step(action)
    robot_part = np.ravel(obs['robot0_robot-state'])
    object_part = np.ravel(obs['object-state'])
    hist['step_{}'.format(i + 1)] = np.concatenate((robot_part, object_part))

In [3]:
# Draw one standard-normal action (one value per robot degree of freedom)
# and advance the environment by a single step with it.
action = np.random.standard_normal(env.robots[0].dof)
obs, reward, done, info = env.step(action)

In [4]:
# Display the current contents of `obs` (assigned from `env.step(...)` in the cell above).
obs

OrderedDict([('robot0_joint_pos',
              array([-0.0302742 ,  0.14044944, -0.00535421, -2.68449143,  0.00339097,
                      3.10627978,  0.82223303])),
             ('robot0_joint_vel',
              array([ 0.10565759, -0.27393351,  0.05012956, -0.6054687 ,  0.03996287,
                      1.47560382,  0.25854712])),
             ('robot0_eef_pos',
              array([-0.09538131, -0.01710642,  1.0367943 ])),
             ('robot0_eef_quat',
              array([ 9.89587659e-01, -3.70745152e-02,  1.39074572e-01,  9.09663361e-05])),
             ('robot0_gripper_qpos', array([ 0.00972155, -0.00984648])),
             ('robot0_gripper_qvel', array([-0.18717638,  0.16809092])),
             ('robot0_robot-state',
              array([-3.02695780e-02,  1.39988145e-01, -5.35418202e-03, -4.41348789e-01,
                      3.39096588e-03,  3.53055377e-02,  7.32667427e-01,  9.99541771e-01,
                      9.90153180e-01,  9.99985666e-01, -8.97335638e-01,  9.99994

42

# REINFORCE Model Debug

In [28]:
class ReinforceModel(nn.Module):
    '''
    Policy parameterization for REINFORCE.

    A two-layer network without biases: state -> 128 hidden units -> one
    output per action dimension, with a softmax over the last axis.

    Note: construction reads the notebook-level globals ``env`` (to size
    the input and output layers) and ``gamma`` (the discount), so both must
    be defined before ``ReinforceModel()`` is called.
    '''
    # Dropout probability applied to the hidden layer while training.
    DROPOUT_P = 0.6

    def __init__(self):
        super().__init__()
        obs_spec = env.observation_spec()
        self.robot_dim = len(obs_spec['robot0_robot-state'])
        self.object_dim = len(obs_spec['object-state'])
        # The network input is the robot state followed by the object state.
        self.state_dim = self.robot_dim + self.object_dim
        self.action_dim = env.action_dim

        self.l1 = nn.Linear(self.state_dim, 128, bias=False)
        self.l2 = nn.Linear(128, self.action_dim, bias=False)

        self.gamma = gamma  # discount

        # Episode policy and reward history.
        # A plain empty tensor: the `Variable` wrapper is deprecated and
        # simply returns the tensor it is given.
        self.policy_history = torch.Tensor()
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):
        '''
        Map a state tensor to the softmax output of the network.

        Args:
            x: tensor whose last dimension equals ``state_dim``.

        Returns:
            Tensor with ``action_dim`` entries along the last axis that sum
            to 1.
        '''
        hidden = self.l1(x)
        # Tie dropout to the module's train/eval flag. Building a new
        # nn.Dropout inside forward() left it permanently in training mode,
        # so policy.eval() could never disable it.
        hidden = F.dropout(hidden, p=self.DROPOUT_P, training=self.training)
        hidden = F.relu(hidden)
        return F.softmax(self.l2(hidden), dim=-1)
    
# Hyperparameters (specify these in a config file).
gamma = 0.9           # discount, read by ReinforceModel.__init__
learning_rate = 0.01  # learning rate handed to the Adam optimizer


# Build the policy network and an Adam optimizer over its parameters.
policy = ReinforceModel()
optimizer = optim.Adam(params=policy.parameters(), lr=learning_rate)

In [None]:
def select_action(state):
    """Placeholder: intended to output the required action torques.

    TODO: not implemented yet. The body is empty, so `state` is ignored
    and the function returns None.
    """
    return None
    

In [None]:
def update_policy():
    """Placeholder: intended to update the current policy parameterization.

    TODO: not implemented yet. The body is empty, so calling it does
    nothing and returns None.
    """
    return None
    

In [None]:
def train():
    """Placeholder: intended to train the network.

    TODO: not implemented yet. The body is empty, so calling it does
    nothing and returns None.
    """
    return None

# PPO Model Debug

# DDPG Model Debug