# Exploration

In [None]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)

obs = env.reset()
done = False       


# REINFORCE Model Debug

In [1]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pi = Variable(torch.FloatTensor([math.pi])).to(device)

def normal(x, mu, sigma_sq):
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi).sqrt()
    return a*b
    

class REINFORCEPolicy(nn.Module):
    '''
    This class represent our policy parameterization.
    '''
    def __init__(self, state_dim,action_dim,hidden_size):
        super(REINFORCEPolicy, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_size = hidden_size
        
        self.l1 = nn.Linear(self.state_dim, self.hidden_size)
        self.l2 = nn.Linear(self.hidden_size, self.hidden_size//2)
        self.l3 = nn.Linear(self.hidden_size//2, self.action_dim)
        self.l3_ = nn.Linear(self.hidden_size//2, self.action_dim)
        self.d1 = nn.Dropout(0.5)
        self.d2 = nn.Dropout(0.5)

    def forward(self,x):
        out = F.relu(self.d1(self.l1(x)))
        out = F.relu(self.d2(self.l2(out)))
        mu = self.l3(out)
        sigma_sq = self.l3_(out)
        return mu, sigma_sq
    
    
class REINFORCE:
    '''
    This class encapsulates functionality required to run the REINFORCE algorithm.
    '''
    def __init__(self, state_dim,action_dim, gamma, lr, episodes, horizon, hidden_size):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_size = hidden_size
        self.lr = lr
        self.model = REINFORCEPolicy(state_dim, action_dim,hidden_size)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = self.lr)
        self.model.train()
        
        self.gamma = gamma
        self.episodes = episodes
        self.horizon = horizon
        
        
    def select_action(self, state):
        actions = []
        log_probs = []
        mu , sigma_sq = self.model(Variable(state).to(device)) 
        for i in range(self.action_dim):
            mu_ = mu[i]
            sigma_sq_ = sigma_sq[i]
            sigma_sq_ = F.softplus(sigma_sq_) # ensures that the estimate is always positive

            eps = torch.randn(mu_.size())
            action = (mu_ + sigma_sq_.sqrt()*Variable(eps).to(device)).data
            prob = normal(action, mu_, sigma_sq_)
            log_prob = prob.log()
            actions.append(action)
            log_probs.append(log_prob)
        
        return actions, log_probs
    

    def episode_update_parameters(self, rewards, log_probs):
        R = torch.zeros(1, 1)
        loss = torch.zeros(self.action_dim)
        for i in reversed(range(self.horizon)):
            R = self.gamma * R + rewards[0][i]
            for j in range(self.action_dim):
                loss[j] = loss[j] - (log_probs[0][i][j]*(Variable(R.data.squeeze()).expand_as(log_probs[0][i][j])).to(device)).sum()
        loss = loss.sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        
    def epoch_update_parameters(self, rewards, log_probs):
        R = torch.zeros(self.episodes)
        loss = torch.zeros(self.episodes,self.action_dim)
        for episode in range(self.episodes):
            for i in reversed(range(self.horizon)):
                R[episode] = self.gamma * R[episode] + rewards[episode][i]
                for j in range(self.action_dim):
                    loss[episode][j] = loss[episode][j] - (log_probs[episode][i][j]*(Variable(R[episode].data.squeeze()).expand_as(log_probs[episode][i][j])).to(device)).sum()
        
        loss = loss.sum(dim=0)/self.episodes
        loss = loss.sum()


        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 150, 
    reward_shaping=True                 
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim,0.9,0.001,100,200,256)
agent.model.load_state_dict(torch.load('/Users/peterfagan/Downloads/REINFORCE_3.pkl'))

obs=env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done=False
while done==False: 
    action, log_prob = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    env.render()




Creating window glfw
Creating window glfw
Creating window glfw


In [5]:
#from models.REINFORCE import REINFORCE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 20, 
    reward_shaping=True                 
)
obs = env.reset()
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim,0.9,)

log_probs = [[] for i in range(2)]
rewards = [[] for i in range(2)]
for episode in range(2):
    obs=env.reset()
    state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
    done=False
    while done==False: 
        action, log_prob = agent.select_action(state)
        obs, reward, done, info = env.step(action)
        log_probs[episode].append(log_prob)
        rewards[episode].append(reward)
        
agent.epoch_update_parameters(rewards, log_probs)
print('Episode: {}, Rewards: {}'.format(episode, np.mean(rewards)))

Creating window glfw
Creating window glfw


TypeError: __init__() missing 4 required positional arguments: 'lr', 'episodes', 'horizon', and 'hidden_size'

In [None]:
np.sum(rewards,axis=1).mean()

In [None]:
len(log_probs[0])

In [None]:
len(log_probs[0])

In [None]:
episodes = 2
horizon = 20
gamma = 0.99
R = torch.zeros(episodes)
loss = torch.zeros(episodes,8)

In [None]:
for episode in range(episodes):
            for i in reversed(range(horizon)):
                R[episode] = gamma * R[episode] + rewards[(episode*horizon)+i]
                for j in range(8):
                    loss[episode][j] = loss[episode][j] - (log_probs[(episode*horizon)+i][j]*(Variable(R[episode].data.squeeze()).expand_as(log_probs[(episode*horizon)+i][j])).to(device)).sum()
loss = loss.sum(dim=0)/episodes
loss = loss.sum()

self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()

In [None]:
loss.sum(dim=0)

In [None]:
loss[0][episode][0]

In [None]:
R

In [None]:
rewards

In [None]:
torch.save(agent.model.state_dict(),'model_params.pkl')

## Observing learnt behavior

In [None]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)
obs=env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
done=False
log_probs = []
rewards = []
while done==False: 
    action, log_prob = agent.select_action(state)
    obs, reward, done, info = env.step(action)
    log_probs.append(log_prob)
    rewards.append(reward)
    env.render()

In [None]:
torch.zeros(1,5,3)

# PPO Model Debug

# DDPG Model Debug