# Exploration

In [1]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [2]:
# Lift task with a single Panda arm: no on-screen or off-screen rendering and
# no camera images; object observations and the shaped reward are enabled.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    horizon=200,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)

# Start the first episode and clear the termination flag.
obs, done = env.reset(), False


# REINFORCE Model Debug

In [102]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The constant pi as a one-element float32 tensor living on `device`.
pi = torch.tensor([math.pi], dtype=torch.float32).to(device)

def normal(x, mu, sigma_sq):
    '''
    Density of the univariate normal distribution N(mu, sigma_sq) evaluated at x.

    Args:
        x: tensor holding the sample(s). It is detached first, so no gradient
            flows back through the sample itself.
        mu: mean of the distribution (tensor or number broadcastable with x).
        sigma_sq: variance of the distribution (tensor or number, must be > 0).

    Returns:
        Tensor of densities. A one-element pi constant takes part in the
        computation, so scalar (0-dim) inputs produce a result of shape (1,).
    '''
    x = x.detach()
    # Build pi next to the sample instead of reading a module-level tensor, so
    # the function works for inputs on any device.
    pi = torch.tensor([math.pi], dtype=torch.float32, device=x.device)
    exponent = -(x - mu).pow(2) / (2 * sigma_sq)
    norm_const = 1 / (2 * sigma_sq * pi).sqrt()
    return exponent.exp() * norm_const
    

class REINFORCEPolicy(nn.Module):
    '''
    This class represents our policy parameterization.

    A two-layer perceptron (no bias terms) that maps a state vector to
    ``2 * action_dim`` unbounded real outputs. The caller interprets them as
    the parameters (means and pre-activation variances) of one Gaussian per
    action dimension, so no softmax is applied: squashing the outputs onto
    the probability simplex would make negative means impossible.
    '''
    def __init__(self, state_dim,action_dim):
        '''
        Args:
            state_dim: length of the input state vector.
            action_dim: number of action dimensions; the output has
                ``2 * action_dim`` entries.
        '''
        super(REINFORCEPolicy, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.l1 = nn.Linear(self.state_dim, 128, bias = False)
        self.l2 = nn.Linear(128, self.action_dim*2, bias = False)
        # Registered as a sub-module so that train()/eval() on the policy
        # switch dropout on and off. Dropout has no parameters, so the
        # state_dict keys stay "l1.weight" and "l2.weight".
        self.dropout = nn.Dropout(p=0.6)

    def forward(self,x):
        '''
        Args:
            x: state tensor whose last dimension is ``state_dim``.

        Returns:
            Tensor whose last dimension is ``2 * action_dim``.
        '''
        hidden = F.relu(self.dropout(self.l1(x)))
        return self.l2(hidden)
    
    
class REINFORCE:
    '''
    This class encapsulates functionality required to run the REINFORCE algorithm
    with an independent Gaussian per action dimension.

    The policy network emits ``2 * action_dim`` values for a state: entries
    ``[0, action_dim)`` are the means and entries ``[action_dim, 2 * action_dim)``
    are pre-activation variances, made positive with softplus.
    '''
    def __init__(self, state_dim,action_dim, gamma, episodes):
        '''
        Args:
            state_dim: length of the state vector fed to the policy.
            action_dim: number of action dimensions.
            gamma: discount factor used for the return-to-go.
            episodes: number of episodes consumed by one epoch update.
        '''
        self.state_dim = state_dim
        self.action_dim = action_dim
        # Remember the device the policy lives on so the methods do not have
        # to reach for a module-level global.
        self.device = device
        self.model = REINFORCEPolicy(state_dim, action_dim)
        self.model = self.model.to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = 1e-3)
        self.model.train()

        self.gamma = gamma
        self.episodes = episodes


    def select_action(self, state):
        '''
        Sample one action per action dimension for a single 1-D state tensor.

        Returns:
            (actions, log_probs): two lists of length ``action_dim``. Each
            action is a detached 0-dim tensor; each log-probability is a
            0-dim tensor that stays attached to the policy graph.
        '''
        outputs = self.model(state.detach().to(self.device))
        actions = []
        log_probs = []
        for i in range(self.action_dim):
            mu = outputs[i]
            # The variance for dimension i sits in the second half of the
            # head; softplus keeps it strictly positive.
            sigma = F.softplus(outputs[self.action_dim + i]).sqrt()

            # Detach the sample: the gradient flows through log_prob only.
            action = (mu + sigma * torch.randn_like(mu)).detach()
            # Evaluate the log-density directly; log(pdf) underflows to -inf
            # for samples far from the mean.
            log_prob = dist.Normal(mu, sigma).log_prob(action)
            actions.append(action)
            log_probs.append(log_prob)

        return actions, log_probs

    def _episode_loss(self, rewards, log_probs):
        '''Sum over steps and action dims of -(discounted return-to-go) * log-prob.'''
        if len(rewards) != len(log_probs):
            raise ValueError("rewards and log_probs must cover the same number of steps")
        loss = 0.0
        R = 0.0
        for i in reversed(range(len(rewards))):
            # The return is a plain float, i.e. a constant w.r.t. the policy.
            R = self.gamma * R + float(rewards[i])
            for j in range(self.action_dim):
                loss = loss - (log_probs[i][j] * R).sum()
        return loss

    def _step(self, loss):
        '''Run one optimizer step on ``loss``.'''
        if not torch.is_tensor(loss):
            # No log-probability was accumulated, so there is nothing to optimise.
            raise ValueError("cannot update the policy from empty episodes")
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def episode_update_parameters(self, rewards, log_probs):
        '''
        Update the policy from one episode.

        Args:
            rewards: nested list; ``rewards[0][i]`` is the reward at step i.
            log_probs: nested list; ``log_probs[0][i][j]`` is the
                log-probability of action dimension j at step i.

        The loss is averaged over the number of steps in the episode.
        '''
        steps = len(rewards[0])
        if steps == 0:
            raise ValueError("cannot update the policy from empty episodes")
        self._step(self._episode_loss(rewards[0], log_probs[0]) / steps)


    def epoch_update_parameters(self, rewards, log_probs):
        '''
        Update the policy from ``self.episodes`` episodes at once.

        Args:
            rewards: ``rewards[episode][i]`` is the reward at step i.
            log_probs: ``log_probs[episode][i][j]`` is the log-probability of
                action dimension j at step i.

        Each episode contributes the sum of its step losses; the result is
        averaged over episodes. The episode length is taken from the data.
        '''
        total = 0.0
        for episode in range(self.episodes):
            total = total + self._episode_loss(rewards[episode], log_probs[episode])
        self._step(total / self.episodes)
    

In [104]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rollout settings, named once so the environment, the agent and the buffers
# below cannot drift apart.
episodes = 2
horizon = 20
gamma = 0.99

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon = horizon,
    reward_shaping=True
)
obs = env.reset()
# The policy input is the robot proprioception concatenated with the object state.
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim,gamma,episodes)

# One list of per-step entries per episode.
log_probs = [[] for _ in range(episodes)]
rewards = [[] for _ in range(episodes)]
for episode in range(episodes):
    obs=env.reset()
    done=False
    while not done:
        # Rebuild the state from the latest observation on every step;
        # otherwise the policy keeps acting on the initial observation.
        state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
        action, log_prob = agent.select_action(state)
        # Hand the simulator a plain float array rather than a list of tensors.
        obs, reward, done, info = env.step(np.array([float(a) for a in action]))
        log_probs[episode].append(log_prob)
        rewards[episode].append(reward)

agent.epoch_update_parameters(rewards, log_probs)
print('Episode: {}, Rewards: {}'.format(episode, np.mean(rewards)))



Episode: 1, Rewards: 0.008341928133672247


In [116]:
# Average undiscounted return per episode: sum the per-step rewards within each
# episode (axis 1), then average over episodes. Requires the nested
# one-list-per-episode `rewards` layout built by the training cell.
np.sum(rewards,axis=1).mean()

0.16683856267344493

In [106]:
# Number of per-step log-prob entries recorded for the first episode.
len(log_probs[0])

20

In [109]:
# Number of per-step log-prob entries recorded for the first episode.
# NOTE(review): identical to the preceding cell — looks like a leftover; confirm and remove.
len(log_probs[0])

20

In [68]:
# Scratch settings for re-deriving the epoch loss by hand.
episodes, horizon, gamma = 2, 20, 0.99

# One discounted-return accumulator per episode, and one loss slot per
# (episode, column); the 8 columns presumably match env.action_dim — confirm.
R = torch.zeros((episodes,))
loss = torch.zeros((episodes, 8))

In [69]:
# Hand-rolled version of the epoch loss, for debugging.
# Expects `rewards[episode][i]` and `log_probs[episode][i][j]` (one list per
# episode, as built by the training cell) and freshly zeroed `R` / `loss`
# buffers, so re-run the setup cell before re-running this one.
# NOTE(review): the log-probs must come from a rollout whose autograd graph has
# not already been consumed by an earlier backward() call.
for episode in range(episodes):
    for i in reversed(range(horizon)):
        # Discounted return-to-go for step i of this episode.
        R[episode] = gamma * R[episode] + rewards[episode][i]
        for j in range(8):
            step_log_prob = log_probs[episode][i][j]
            # The return is detached: it is a constant weight on the log-prob.
            weight = R[episode].detach().to(device)
            loss[episode][j] = loss[episode][j] - (step_log_prob * weight).sum()
# Average over episodes, then add up the per-action terms into a scalar.
loss = loss.sum(dim=0)/episodes
loss = loss.sum()

# `self` does not exist at cell scope; the optimizer lives on the agent.
agent.optimizer.zero_grad()
loss.backward()
agent.optimizer.step()

tensor([0.5817, 0.7244, 0.6377, 0.7238, 0.6817, 0.7885, 0.6493, 0.6641],
       grad_fn=<DivBackward0>)

In [63]:
# Sum `loss` over its first dimension. With the (episodes, 8) buffer from the
# setup cell this leaves one value per column, matching the 8-entry recorded output.
# NOTE(review): only meaningful before `loss` is collapsed to a scalar — confirm cell order.
loss.sum(dim=0)

tensor([1.2589, 0.8853, 0.9779, 0.7818, 0.9009, 0.9727, 1.4290, 1.0984],
       grad_fn=<SumBackward1>)

In [45]:
# Probe a single entry of `loss`.
# NOTE(review): three indices imply a 3-D `loss`, unlike the 2-D (episodes, 8)
# buffer created in the setup cell — presumably left over from an earlier
# layout; confirm or remove.
loss[0][episode][0]

tensor(0.)

In [40]:
# Display the discounted-return accumulator.
# NOTE(review): the recorded output is 2-D, unlike the 1-D `torch.zeros(episodes)`
# built in the setup cell — the output looks stale; confirm.
R

tensor([[0.0377, 0.0359]])

In [9]:
# Display the collected per-step rewards.
# NOTE(review): the recorded output is a flat list of 40 values (presumably
# 2 episodes x 20 steps), whereas the training cell builds one list per
# episode — confirm which layout the downstream cells expect.
rewards

[0.007720971233944773,
 0.008050656185023443,
 0.007641773727107282,
 0.006408437301586319,
 0.005110664368602277,
 0.004016619519829481,
 0.0032642840746705598,
 0.0028498603091401025,
 0.0024848124678070365,
 0.002174340756667892,
 0.00201399682843152,
 0.0019511273497495472,
 0.0018951895923540677,
 0.001807529591167404,
 0.0017130852195461127,
 0.0015890226342465889,
 0.0014411035413355556,
 0.001303104369399498,
 0.001124100094441823,
 0.0009662002557617733,
 0.008891991511901577,
 0.008312884525828877,
 0.007189653794190873,
 0.005917250284210516,
 0.0046820103640285,
 0.0035081150844539343,
 0.0025019157850621906,
 0.0018251974378898793,
 0.001457994976533477,
 0.001247136435338605,
 0.0010728351479341036,
 0.0009510054918916309,
 0.0008633183783107117,
 0.0007855179169756137,
 0.0007477981487271634,
 0.0007435465526680494,
 0.0007600592000800926,
 0.0007625826452190626,
 0.0008296643391220036,
 0.000999251797593834]

In [4]:
# Persist only the policy weights (its state_dict), not the whole agent object.
torch.save(obj=agent.model.state_dict(), f='model_params.pkl')

## Observing learnt behavior

In [9]:
# Same Lift/Panda task as for training, but with the on-screen renderer
# enabled and a 200-step horizon so the behaviour can be watched.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,
    horizon = 200,
    reward_shaping=True
)
# Put the policy in evaluation mode for the demo rollout.
agent.model.eval()

obs=env.reset()
done=False
log_probs = []
rewards = []
while not done:
    # Rebuild the state from the latest observation on every step; otherwise
    # the policy keeps acting on the initial observation.
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    action, log_prob = agent.select_action(state)
    # Hand the simulator a plain float array rather than a list of tensors.
    obs, reward, done, info = env.step(np.array([float(a) for a in action]))
    log_probs.append(log_prob)
    rewards.append(reward)
    env.render()

Creating window glfw
Creating window glfw


In [16]:
# Scratch check: display a zero tensor of shape (1, 5, 3).
torch.zeros(1,5,3)

tensor([[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]])

# PPO Model Debug

# DDPG Model Debug