# Exploration

In [None]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)

obs = env.reset()
done = False       


# REINFORCE Model Debug

In [2]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pi = Variable(torch.FloatTensor([math.pi])).to(device)

def normal(x, mu, sigma_sq):
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi).sqrt()
    return a*b
    

class REINFORCEPolicy(nn.Module):
    '''
    This class represent our policy parameterization.
    '''
    def __init__(self, state_dim,action_dim):
        super(REINFORCEPolicy, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        
        self.l1 = nn.Linear(self.state_dim, 128, bias = False)
        self.l2 = nn.Linear(128, self.action_dim*2, bias = False)

    def forward(self,x):
        model = nn.Sequential(
            self.l1,
            nn.Dropout(p=0.6),
            nn.ReLU(),
            self.l2,
            nn.Softmax(dim=-1)
         )
        return model(x)
    
    
class REINFORCE:
    '''
    This class encapsulates functionality required to run the REINFORCE algorithm.
    '''
    def __init__(self, state_dim,action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = REINFORCEPolicy(state_dim, action_dim)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = 1e-3)
        self.model.train()
        
    def select_action(self, state):
        actions = []
        log_probs = []
        outputs = self.model(Variable(state).to(device)) 
        for i in range(self.action_dim):
            mu = outputs[i]
            sigma_sq = outputs[i+1]
            sigma_sq = F.softplus(sigma_sq) # ensures that the estimate is always positive

            eps = torch.randn(mu.size())
            action = (mu + sigma_sq.sqrt()*Variable(eps).to(device)).data
            prob = normal(action, mu, sigma_sq)
            log_prob = prob.log()
            actions.append(action)
            log_probs.append(log_prob)
        
        return actions, log_probs
        
        
    def update_parameters(self, rewards, log_probs, gamma):
        R = torch.zeros(1, 1)
        loss = torch.zeros(self.action_dim)
        for i in reversed(range(len(rewards))):
            for j in range(self.action_dim):
                R = gamma * R + rewards[i]
                loss[j] = loss[j] - (log_probs[i][j]*(Variable(R.data.squeeze()).expand_as(log_probs[i][j])).to(device)).sum()
        loss = loss.sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    

In [None]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim)

for episode in range(2000):
    obs=env.reset()
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    done=False
    log_probs = []
    rewards = []
    while done==False: 
        action, log_prob = agent.select_action(state)
        obs, reward, done, info = env.step(action)
        log_probs.append(log_prob)
        rewards.append(reward)
        
    agent.update_parameters(rewards, log_probs, 0.99)
    print('Episode: {}, Rewards: {}'.format(episode, np.sum(rewards)))

    

Episode: 0, Rewards: 0.8161299088762956
Episode: 1, Rewards: 1.0034887065288505
Episode: 2, Rewards: 0.1234028260181447
Episode: 3, Rewards: 0.8832828136465667
Episode: 4, Rewards: 0.24207774591039521
Episode: 5, Rewards: 0.29577398510445174
Episode: 6, Rewards: 5.836262514248209
Episode: 7, Rewards: 1.3219722500175324
Episode: 8, Rewards: 0.8192462597065335
Episode: 9, Rewards: 1.9229083127933617
Episode: 10, Rewards: 0.20409351774887552
Episode: 11, Rewards: 0.3122872629996676
Episode: 12, Rewards: 0.43713395369443414
Episode: 13, Rewards: 0.17118807864737126
Episode: 14, Rewards: 0.9925118525201788
Episode: 15, Rewards: 0.6190223971025846
Episode: 16, Rewards: 0.503117933248256
Episode: 17, Rewards: 0.0658550493941764
Episode: 18, Rewards: 3.339172725999706
Episode: 19, Rewards: 0.3388893043217882
Episode: 20, Rewards: 0.12014717658810288
Episode: 21, Rewards: 0.3970001326124323
Episode: 22, Rewards: 0.14998954342962426
Episode: 23, Rewards: 0.2837681458744577
Episode: 24, Rewards: 

Episode: 198, Rewards: 1.3836913012742658
Episode: 199, Rewards: 0.18060716427376503
Episode: 200, Rewards: 6.493569024973647
Episode: 201, Rewards: 11.906559656263926
Episode: 202, Rewards: 0.5849670885697293
Episode: 203, Rewards: 1.7167602852871853
Episode: 204, Rewards: 0.1770428416283995
Episode: 205, Rewards: 3.8428217630000683
Episode: 206, Rewards: 2.3581941638587667
Episode: 207, Rewards: 0.25608922255041394
Episode: 208, Rewards: 0.2998541721833469
Episode: 209, Rewards: 9.806274977859834
Episode: 210, Rewards: 0.2057147255179743
Episode: 211, Rewards: 1.217339990634767
Episode: 212, Rewards: 0.3328896720084779
Episode: 213, Rewards: 7.892167436065215
Episode: 214, Rewards: 0.7012692396100759
Episode: 215, Rewards: 0.4827171197625272
Episode: 216, Rewards: 3.207971892166296
Episode: 217, Rewards: 0.739682808031088
Episode: 218, Rewards: 0.5467756567301227
Episode: 219, Rewards: 1.196414377245153
Episode: 220, Rewards: 6.278383031780577
Episode: 221, Rewards: 0.525367652296251

Episode: 394, Rewards: 0.08273538864807041
Episode: 395, Rewards: 16.031465401984452
Episode: 396, Rewards: 3.5779534226104834
Episode: 397, Rewards: 0.5373630821923288
Episode: 398, Rewards: 5.660414574339551
Episode: 399, Rewards: 0.6479345538466073
Episode: 400, Rewards: 0.3066872336514906
Episode: 401, Rewards: 4.579015426160005
Episode: 402, Rewards: 4.798503156172149
Episode: 403, Rewards: 0.2687027295994938
Episode: 404, Rewards: 0.5958886994058065
Episode: 405, Rewards: 0.5482929074377321
Episode: 406, Rewards: 28.493525715774837
Episode: 407, Rewards: 6.355765747857415
Episode: 408, Rewards: 0.7547897120416314
Episode: 409, Rewards: 4.241572755075113
Episode: 410, Rewards: 0.48068190002077943
Episode: 411, Rewards: 0.23912308121789205
Episode: 412, Rewards: 12.306217193122503
Episode: 413, Rewards: 4.178524112561984
Episode: 414, Rewards: 0.24073095867955413
Episode: 415, Rewards: 5.557728156576728
Episode: 416, Rewards: 2.5485328406639334
Episode: 417, Rewards: 0.368703456509

Episode: 590, Rewards: 5.333693072969226
Episode: 591, Rewards: 10.855864366172185
Episode: 592, Rewards: 1.2529209500977703
Episode: 593, Rewards: 2.8509182782842286
Episode: 594, Rewards: 0.3990011423831623
Episode: 595, Rewards: 6.342597360871405
Episode: 596, Rewards: 0.39649882784671336
Episode: 597, Rewards: 5.7163929572942465
Episode: 598, Rewards: 0.18483048467656651
Episode: 599, Rewards: 11.06594202767779
Episode: 600, Rewards: 2.9756868237511753
Episode: 601, Rewards: 0.37555186702221044
Episode: 602, Rewards: 2.3589365556443633
Episode: 603, Rewards: 0.0634723619520104
Episode: 604, Rewards: 4.084193893489895
Episode: 605, Rewards: 2.753019662358254
Episode: 606, Rewards: 0.2830114899594235
Episode: 607, Rewards: 0.35876150099192317
Episode: 608, Rewards: 24.928881439851686
Episode: 609, Rewards: 17.97294893060277
Episode: 610, Rewards: 2.55189763722579
Episode: 611, Rewards: 14.037015185132775
Episode: 612, Rewards: 13.00333245543137
Episode: 613, Rewards: 0.70582009835240

Episode: 787, Rewards: 0.28813098350655747
Episode: 788, Rewards: 0.7692943200735669
Episode: 789, Rewards: 0.46081168608116807
Episode: 790, Rewards: 1.998892126372686
Episode: 791, Rewards: 1.2131906621628892
Episode: 792, Rewards: 5.728551329445217
Episode: 793, Rewards: 0.2834467353014711
Episode: 794, Rewards: 12.815865047245397
Episode: 795, Rewards: 8.210430796101242
Episode: 796, Rewards: 10.737038016288254
Episode: 797, Rewards: 0.7572247036930743
Episode: 798, Rewards: 1.3442098927650528
Episode: 799, Rewards: 5.074195626983431
Episode: 800, Rewards: 1.6877384959792718
Episode: 801, Rewards: 5.846976631540053
Episode: 802, Rewards: 4.643809292526841
Episode: 803, Rewards: 3.020359170669771
Episode: 804, Rewards: 5.043108538890071
Episode: 805, Rewards: 2.422002315460539
Episode: 806, Rewards: 0.26716762416600753
Episode: 807, Rewards: 0.9483373854544122
Episode: 808, Rewards: 14.990565095735718
Episode: 809, Rewards: 0.4506294094302333
Episode: 810, Rewards: 2.074261851817289

Episode: 984, Rewards: 7.588050579207306
Episode: 985, Rewards: 4.3414797869635455
Episode: 986, Rewards: 4.77605105448119
Episode: 987, Rewards: 0.1463655839035795
Episode: 988, Rewards: 0.39951272925863746
Episode: 989, Rewards: 0.583096455290345
Episode: 990, Rewards: 2.8208575697193865
Episode: 991, Rewards: 0.33719058208179703
Episode: 992, Rewards: 1.8208735038322301
Episode: 993, Rewards: 0.4590322056540927
Episode: 994, Rewards: 10.046063717349405
Episode: 995, Rewards: 3.8246472062390175
Episode: 996, Rewards: 2.7959063480988844
Episode: 997, Rewards: 11.065482826266464
Episode: 998, Rewards: 6.360089451761108
Episode: 999, Rewards: 2.805917942488384
Episode: 1000, Rewards: 3.500372321861231
Episode: 1001, Rewards: 6.55205911711296
Episode: 1002, Rewards: 2.631705674916917
Episode: 1003, Rewards: 5.016975326659102
Episode: 1004, Rewards: 10.667333223994975
Episode: 1005, Rewards: 2.4976762389861094
Episode: 1006, Rewards: 5.100178632850598
Episode: 1007, Rewards: 0.10017597956

Episode: 1177, Rewards: 11.292459775571697
Episode: 1178, Rewards: 0.24653593118577702
Episode: 1179, Rewards: 0.6687572382928133
Episode: 1180, Rewards: 15.284742305125304
Episode: 1181, Rewards: 0.8786894788025967
Episode: 1182, Rewards: 1.8511456243842854
Episode: 1183, Rewards: 0.12082068837864034
Episode: 1184, Rewards: 0.17107551943457702
Episode: 1185, Rewards: 4.292787630051117
Episode: 1186, Rewards: 7.93676489959012
Episode: 1187, Rewards: 2.5990267720803484
Episode: 1188, Rewards: 4.732998324941368
Episode: 1189, Rewards: 14.60625852953235
Episode: 1190, Rewards: 0.9427818618520636
Episode: 1191, Rewards: 1.4942460575546008
Episode: 1192, Rewards: 15.723169955311995
Episode: 1193, Rewards: 3.6450811206516263
Episode: 1194, Rewards: 0.2909715820635142
Episode: 1195, Rewards: 11.851563906525161
Episode: 1196, Rewards: 4.378290401896436
Episode: 1197, Rewards: 0.28254982965940695
Episode: 1198, Rewards: 0.13942739486185454
Episode: 1199, Rewards: 7.320455077205447
Episode: 1200

Episode: 1369, Rewards: 6.6909343975065525
Episode: 1370, Rewards: 1.1243493125610953
Episode: 1371, Rewards: 0.257765192552327
Episode: 1372, Rewards: 7.184968740911581
Episode: 1373, Rewards: 3.2228404685288643
Episode: 1374, Rewards: 0.36030624067755657
Episode: 1375, Rewards: 8.517398861044086
Episode: 1376, Rewards: 0.16034861605318454
Episode: 1377, Rewards: 41.18057369185307
Episode: 1378, Rewards: 3.5917117750782444
Episode: 1379, Rewards: 6.652378435198241
Episode: 1380, Rewards: 1.5954681145115197
Episode: 1381, Rewards: 16.52101850329717
Episode: 1382, Rewards: 2.646333935550635
Episode: 1383, Rewards: 0.5866968718314394
Episode: 1384, Rewards: 1.0675646353580908
Episode: 1385, Rewards: 11.856428033655174
Episode: 1386, Rewards: 2.7875280791878567
Episode: 1387, Rewards: 1.3312969170411546
Episode: 1388, Rewards: 1.0543911747461783
Episode: 1389, Rewards: 0.36990969043983984
Episode: 1390, Rewards: 28.02426359650814
Episode: 1391, Rewards: 8.991754681190956
Episode: 1392, Re

Episode: 1562, Rewards: 4.1914809596648634
Episode: 1563, Rewards: 4.822510723965834
Episode: 1564, Rewards: 5.656237927673579
Episode: 1565, Rewards: 0.3028559264439277
Episode: 1566, Rewards: 5.870090745654187
Episode: 1567, Rewards: 3.4572416795597967
Episode: 1568, Rewards: 1.1715458674685721
Episode: 1569, Rewards: 1.77142517257404
Episode: 1570, Rewards: 0.1680195827571871
Episode: 1571, Rewards: 1.1035642135334236
Episode: 1572, Rewards: 13.444543452122943
Episode: 1573, Rewards: 12.767297470013553
Episode: 1574, Rewards: 6.7139228775609645
Episode: 1575, Rewards: 16.749340018368894
Episode: 1576, Rewards: 1.9022172833715905
Episode: 1577, Rewards: 20.62463850717625
Episode: 1578, Rewards: 0.80816252711098
Episode: 1579, Rewards: 0.23187383191327893
Episode: 1580, Rewards: 0.4316099327533086
Episode: 1581, Rewards: 2.3841812523241814
Episode: 1582, Rewards: 1.5691412788355423
Episode: 1583, Rewards: 2.2340003043461483
Episode: 1584, Rewards: 31.56635824150222
Episode: 1585, Rewa

## Observing learnt behavior

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)
obs=env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done=False
log_probs = []
rewards = []
while done==False: 
    action, log_prob = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    log_probs.append(log_prob)
    rewards.append(reward)
    env.render()

# PPO Model Debug

# DDPG Model Debug