# Exploration

In [None]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)

obs = env.reset()
done = False       


# REINFORCE Model Debug

In [2]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pi = Variable(torch.FloatTensor([math.pi])).to(device)

def normal(x, mu, sigma_sq):
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi).sqrt()
    return a*b
    

class REINFORCEPolicy(nn.Module):
    '''
    This class represent our policy parameterization.
    '''
    def __init__(self, state_dim,action_dim):
        super(REINFORCEPolicy, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        
        self.l1 = nn.Linear(self.state_dim, 128, bias = False)
        self.l2 = nn.Linear(128, self.action_dim*2, bias = False)

    def forward(self,x):
        model = nn.Sequential(
            self.l1,
            nn.Dropout(p=0.6),
            nn.ReLU(),
            self.l2,
            nn.Softmax(dim=-1)
         )
        return model(x)
    
    
class REINFORCE:
    '''
    This class encapsulates functionality required to run the REINFORCE algorithm.
    '''
    def __init__(self, state_dim,action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = REINFORCEPolicy(state_dim, action_dim)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = 1e-3)
        self.model.train()
        
    def select_action(self, state):
        actions = []
        log_probs = []
        outputs = self.model(Variable(state).to(device)) 
        for i in range(self.action_dim):
            mu = outputs[i]
            sigma_sq = outputs[i+1]
            sigma_sq = F.softplus(sigma_sq) # ensures that the estimate is always positive

            eps = torch.randn(mu.size())
            action = (mu + sigma_sq.sqrt()*Variable(eps).to(device)).data
            prob = normal(action, mu, sigma_sq)
            log_prob = prob.log()
            actions.append(action)
            log_probs.append(log_prob)
        
        return actions, log_probs
        
        
    def update_parameters(self, rewards, log_probs, gamma):
        R = torch.zeros(1, 1)
        loss = torch.zeros(self.action_dim)
        for i in reversed(range(len(rewards))):
            for j in range(self.action_dim):
                R = gamma * R + rewards[i]
                loss[j] = loss[j] - (log_probs[i][j]*(Variable(R.data.squeeze()).expand_as(log_probs[i][j])).to(device)).sum()
        loss = loss.sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    

In [None]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim)

for episode in range(2000):
    obs=env.reset()
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    done=False
    log_probs = []
    rewards = []
    while done==False: 
        action, log_prob = agent.select_action(state)
        obs, reward, done, info = env.step(action)
        log_probs.append(log_prob)
        rewards.append(reward)
        
    agent.update_parameters(rewards, log_probs, 0.99)
    print('Episode: {}, Rewards: {}'.format(episode, np.sum(rewards)))

    

Episode: 0, Rewards: 0.8161299088762956
Episode: 1, Rewards: 1.0034887065288505
Episode: 2, Rewards: 0.1234028260181447
Episode: 3, Rewards: 0.8832828136465667
Episode: 4, Rewards: 0.24207774591039521
Episode: 5, Rewards: 0.29577398510445174
Episode: 6, Rewards: 5.836262514248209
Episode: 7, Rewards: 1.3219722500175324
Episode: 8, Rewards: 0.8192462597065335
Episode: 9, Rewards: 1.9229083127933617
Episode: 10, Rewards: 0.20409351774887552
Episode: 11, Rewards: 0.3122872629996676
Episode: 12, Rewards: 0.43713395369443414
Episode: 13, Rewards: 0.17118807864737126
Episode: 14, Rewards: 0.9925118525201788
Episode: 15, Rewards: 0.6190223971025846
Episode: 16, Rewards: 0.503117933248256
Episode: 17, Rewards: 0.0658550493941764
Episode: 18, Rewards: 3.339172725999706
Episode: 19, Rewards: 0.3388893043217882
Episode: 20, Rewards: 0.12014717658810288
Episode: 21, Rewards: 0.3970001326124323
Episode: 22, Rewards: 0.14998954342962426
Episode: 23, Rewards: 0.2837681458744577
Episode: 24, Rewards: 

Episode: 198, Rewards: 1.3836913012742658
Episode: 199, Rewards: 0.18060716427376503
Episode: 200, Rewards: 6.493569024973647
Episode: 201, Rewards: 11.906559656263926
Episode: 202, Rewards: 0.5849670885697293
Episode: 203, Rewards: 1.7167602852871853
Episode: 204, Rewards: 0.1770428416283995
Episode: 205, Rewards: 3.8428217630000683
Episode: 206, Rewards: 2.3581941638587667
Episode: 207, Rewards: 0.25608922255041394
Episode: 208, Rewards: 0.2998541721833469
Episode: 209, Rewards: 9.806274977859834
Episode: 210, Rewards: 0.2057147255179743
Episode: 211, Rewards: 1.217339990634767
Episode: 212, Rewards: 0.3328896720084779
Episode: 213, Rewards: 7.892167436065215
Episode: 214, Rewards: 0.7012692396100759
Episode: 215, Rewards: 0.4827171197625272
Episode: 216, Rewards: 3.207971892166296
Episode: 217, Rewards: 0.739682808031088
Episode: 218, Rewards: 0.5467756567301227
Episode: 219, Rewards: 1.196414377245153
Episode: 220, Rewards: 6.278383031780577
Episode: 221, Rewards: 0.525367652296251

Episode: 394, Rewards: 0.08273538864807041
Episode: 395, Rewards: 16.031465401984452
Episode: 396, Rewards: 3.5779534226104834
Episode: 397, Rewards: 0.5373630821923288
Episode: 398, Rewards: 5.660414574339551
Episode: 399, Rewards: 0.6479345538466073
Episode: 400, Rewards: 0.3066872336514906
Episode: 401, Rewards: 4.579015426160005
Episode: 402, Rewards: 4.798503156172149
Episode: 403, Rewards: 0.2687027295994938
Episode: 404, Rewards: 0.5958886994058065
Episode: 405, Rewards: 0.5482929074377321
Episode: 406, Rewards: 28.493525715774837
Episode: 407, Rewards: 6.355765747857415
Episode: 408, Rewards: 0.7547897120416314
Episode: 409, Rewards: 4.241572755075113
Episode: 410, Rewards: 0.48068190002077943
Episode: 411, Rewards: 0.23912308121789205
Episode: 412, Rewards: 12.306217193122503
Episode: 413, Rewards: 4.178524112561984
Episode: 414, Rewards: 0.24073095867955413
Episode: 415, Rewards: 5.557728156576728
Episode: 416, Rewards: 2.5485328406639334
Episode: 417, Rewards: 0.368703456509

## Observing learnt behavior

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)
obs=env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done=False
log_probs = []
rewards = []
while done==False: 
    action, log_prob = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    log_probs.append(log_prob)
    rewards.append(reward)
    env.render()

# PPO Model Debug

# DDPG Model Debug