# Exploration

In [None]:
import robosuite as suite
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

In [None]:
# Create the "Lift" task with a "Panda" robot. Both renderers and camera
# observations are switched off; object observations and reward shaping are on.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    horizon=200,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
)

# Reset to obtain the first observation and clear the termination flag.
obs, done = env.reset(), False


# REINFORCE Model Debug

In [None]:
# Same "Lift"/"Panda" configuration as the exploration cell: no on-screen or
# off-screen renderer, no camera observations, 200-step horizon.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=False,
    has_offscreen_renderer=False,
    reward_shaping=True,
    horizon=200,
)
# First observation of the episode, plus the loop termination flag.
obs, done = env.reset(), False



In [None]:
# Commented-out attribute setup, left exactly as found in this cell:
#         self.robot_dim = len(env.observation_spec()['robot0_robot-state'])
#         self.object_dim = len(env.observation_spec()['object-state'])
#         self.state_dim = self.robot_dim + self.object_dim
#         self.gamma = gamma

#         self.policy_history = torch.Tensor(Variable())
#         self.reward_episode = []
#         self.reward_history = []
#         self.loss_history = []


class REINFORCEPolicy(nn.Module):
    '''
    Multi-head policy network.

    Two shared bias-free linear layers (with dropout and ReLU in between)
    feed eight independent heads. Each head is a bias-free linear layer with
    two outputs followed by a softmax, registered as ``a_1`` ... ``a_8``.
    '''

    # Number of output heads. NOTE(review): this is fixed at 8 regardless of
    # the action_dim passed to the constructor - confirm that is intended.
    NUM_HEADS = 8

    def __init__(self, state_dim,action_dim):
        '''
        Args:
            state_dim: size of the input feature vector.
            action_dim: stored on the instance; not used to size the network.
        '''
        super(REINFORCEPolicy, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.l1_shared = nn.Linear(self.state_dim, 128, bias = False)
        self.d1 = nn.Dropout(p=0.6)
        self.r1 = nn.ReLU()
        self.l2_shared = nn.Linear(128, 128, bias = False)
        # Heads are created in order a_1..a_8 so parameter names and
        # initialisation order match the hand-written version.
        for k in range(1, self.NUM_HEADS + 1):
            head = nn.Sequential(nn.Linear(128, 2, bias = False),nn.Softmax(dim=-1))
            setattr(self, 'a_{}'.format(k), head)

    def forward(self,x):
        '''
        Run the shared trunk, then every head on the shared features.

        Returns:
            Tuple of eight tensors (a_1, ..., a_8), each the softmax output of
            one head.
        '''
        x = self.l1_shared(x)
        x = self.d1(x)
        x = self.r1(x)
        x = self.l2_shared(x)
        return tuple(
            getattr(self, 'a_{}'.format(k))(x)
            for k in range(1, self.NUM_HEADS + 1)
        )

In [3]:
import robosuite as suite
import numpy as np
import math

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.autograd import Variable

pi = Variable(torch.FloatTensor([math.pi])) #.cuda()

def normal(x, mu, sigma_sq):
    '''
    Evaluate the Gaussian density with mean ``mu`` and variance ``sigma_sq``
    at ``x``:  exp(-(x - mu)^2 / (2 * sigma_sq)) / sqrt(2 * pi * sigma_sq).

    Args:
        x: tensor of points at which to evaluate the density.
        mu: mean (tensor or scalar broadcastable with ``x``).
        sigma_sq: variance (tensor or scalar broadcastable with ``x``).

    Returns:
        Tensor of density values. Because ``pi`` has shape (1,), 0-dim inputs
        produce a result of shape (1,).
    '''
    # The module-level pi lives on the CPU; multiplying it with tensors on
    # another device raises, so use a copy placed next to the inputs.
    pi_local = pi.to(x.device)
    a = (-1*(x-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi_local).sqrt()
    return a*b
    

class REINFORCEPolicy(nn.Module):
    '''
    Policy parameterization: a bias-free linear layer to 128 units, dropout
    (p=0.6), ReLU, a bias-free linear layer to ``2 * action_dim`` units and a
    softmax over the last dimension.
    '''
    def __init__(self, state_dim,action_dim):
        '''
        Args:
            state_dim: size of the input feature vector.
            action_dim: number of action dimensions; the network emits two
                values per action dimension.
        '''
        super(REINFORCEPolicy, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.l1 = nn.Linear(self.state_dim, 128, bias = False)
        self.l2 = nn.Linear(128, self.action_dim*2, bias = False)
        # Registered as a submodule so train()/eval() toggles it. A Dropout
        # built inside forward() is always in training mode.
        self.dropout = nn.Dropout(p=0.6)

    def forward(self,x):
        '''
        Map a state tensor to a softmax-normalised vector of length
        ``2 * action_dim`` along the last dimension.
        '''
        x = self.l1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.l2(x)
        return F.softmax(x, dim=-1)
    
    
class REINFORCE:
    '''
    Wraps a REINFORCEPolicy with an Adam optimizer and provides action
    sampling plus the REINFORCE policy-gradient update.

    Relies on the module-level names ``device``, ``normal`` and
    ``REINFORCEPolicy``.
    '''
    def __init__(self, state_dim,action_dim):
        '''
        Args:
            state_dim: size of the state vector fed to the policy.
            action_dim: number of action dimensions to sample.
        '''
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = REINFORCEPolicy(state_dim, action_dim)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = 1e-3)
        self.model.train()

    def select_action(self, state):
        '''
        Sample one Gaussian action per action dimension.

        The policy emits ``2 * action_dim`` values, read here as interleaved
        (mean, raw variance) pairs: entries ``2*i`` and ``2*i + 1`` belong to
        action dimension ``i``.

        Args:
            state: 1-D state tensor.

        Returns:
            (actions, log_probs): two lists of length ``action_dim``. Actions
            are detached tensors; log_probs keep their autograd graph.
        '''
        actions = []
        log_probs = []
        outputs = self.model(state.to(device))
        for i in range(self.action_dim):
            # Each action dimension gets its own pair of outputs. Reading
            # outputs[i] and outputs[i+1] would reuse the next mean as this
            # variance and never touch the second half of the vector.
            mu = outputs[2*i]
            sigma_sq = F.softplus(outputs[2*i + 1]) # ensures that the estimate is always positive

            eps = torch.randn(mu.size()).to(device)
            # Detach the sample: the gradient flows through log_prob only.
            action = (mu + sigma_sq.sqrt()*eps).detach()
            log_prob = normal(action, mu, sigma_sq).log()
            actions.append(action)
            log_probs.append(log_prob)

        return actions, log_probs


    def update_parameters(self, rewards, log_probs, gamma):
        '''
        Run one REINFORCE gradient step for a finished episode.

        Args:
            rewards: per-step rewards, in time order.
            log_probs: per-step lists of per-action log-probabilities, as
                returned by select_action.
            gamma: discount factor.

        Raises:
            ValueError: if ``rewards`` is empty.
        '''
        if len(rewards) == 0:
            raise ValueError('update_parameters needs at least one reward')

        R = 0.0
        loss = 0.0
        for i in reversed(range(len(rewards))):
            # The discounted return advances once per time step; every action
            # dimension of that step is weighted by the same return.
            R = gamma * R + float(rewards[i])
            for j in range(self.action_dim):
                loss = loss - (log_probs[i][j] * R).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    

In [4]:
#from models.REINFORCE import REINFORCE

# Use the GPU when one is available; REINFORCE reads this module-level name.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    use_object_obs=True,                    
    horizon = 200, 
    reward_shaping=True                 
)
obs = env.reset()
# The policy input is the robot state concatenated with the object state.
state_dim = obs['robot0_robot-state'].shape[0]+obs['object-state'].shape[0]


agent = REINFORCE(state_dim,env.action_dim)

for episode in range(10):
    obs=env.reset()
    done=False
    log_probs = []
    rewards = []
    while not done:
        # Rebuild the state from the latest observation on every step;
        # otherwise the policy only ever sees the reset observation.
        state = torch.Tensor(np.append(obs['robot0_robot-state'],obs['object-state']))
        action, log_prob = agent.select_action(state)
        # select_action returns a list of tensors; hand the environment plain
        # floats so the conversion does not depend on the tensors' device.
        obs, reward, done, info = env.step(np.array([float(a) for a in action]))
        log_probs.append(log_prob)
        rewards.append(reward)

    agent.update_parameters(rewards, log_probs, 0.99)
    print('Episode: {}, Rewards: {}'.format(episode, np.sum(rewards)))

    

Episode: 0, Rewards: 0.7731841640350652
Episode: 1, Rewards: 0.06702735149144877
Episode: 2, Rewards: 0.5133513507462013
Episode: 3, Rewards: 0.15580513649171013
Episode: 4, Rewards: 0.18962087665444205
Episode: 5, Rewards: 0.057104365453782314
Episode: 6, Rewards: 0.29105300958408264
Episode: 7, Rewards: 0.7670369453372128
Episode: 8, Rewards: 0.23440469122971627
Episode: 9, Rewards: 0.29720808879881055


In [None]:
# "Lift"/"Panda" environment with the on-screen renderer enabled, used to
# watch one episode driven by random actions.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    horizon=200,
    reward_shaping=True,
    use_object_obs=True,
    use_camera_obs=False,
    has_renderer=True,
    has_offscreen_renderer=False,
)
obs = env.reset()
state = torch.Tensor(np.append(obs['robot0_robot-state'], obs['object-state']))
done = False
# Both lists are reset here and stay empty: the appends below are disabled.
log_probs, rewards = [], []
while not done:
    # Draw an 8-dimensional standard-normal action and step the simulation.
    action = np.random.randn(8)
    obs, reward, done, info = env.step(action)
#     log_probs.append(log_prob)
#     rewards.append(reward)
    env.render()

In [None]:
# Display entry [1][0] of log_probs; log_probs is not defined in this cell and
# must come from a previously executed cell.
log_probs[1][0]

In [None]:
# Display R with its size-1 dimensions removed; R is not defined in this cell
# and must come from a previously executed cell.
R.data.squeeze()

In [None]:
# Check that the squeezed R can be expanded to the shape of one log-prob entry
# (R and log_probs must already exist in the notebook namespace).
Variable(R.data.squeeze()).expand_as(log_probs[0][0])

In [None]:
# Scratch re-run of the REINFORCE loss and optimizer step outside the class.
# Uses `agent`, `rewards` and `log_probs` from previously executed cells;
# `self` does not exist at cell level, so the agent instance is used instead.
gamma=0.99
R = torch.zeros(1, 1)
loss = torch.zeros(agent.action_dim)
for i in reversed(range(len(rewards))):
    # Advance the discounted return once per time step, not once per action.
    R = gamma * R + rewards[i]
    for j in range(agent.action_dim):
        loss[j] = loss[j] - (log_probs[i][j]*(Variable(R.data.squeeze()).expand_as(log_probs[i][j]))).sum() #.cuda()
loss = loss.sum()
loss = loss / len(rewards)

agent.optimizer.zero_grad()
loss.backward()
agent.optimizer.step()

In [None]:
# Check what indexing a single element of a length-8 zeros tensor returns.
torch.zeros(8)[0]

In [None]:
def _extend_history(history, new_rows):
    '''Return new_rows appended to history, or new_rows alone if history is empty.'''
    if history.nelement() == 0:
        return new_rows
    return torch.cat((history, new_rows))


def select_action(obs):
    '''
    Sample one action per consecutive (mu, std) pair produced by the global
    ``policy`` and append the samples' log-probabilities to
    ``policy.policy_history``.

    Args:
        obs: observation mapping with 'robot0_robot-state' and 'object-state'.

    Returns:
        List with one sampled tensor per (mu, std) pair.
    '''
    features = np.append(obs['robot0_robot-state'], obs['object-state'])
    features = torch.from_numpy(features).type(torch.FloatTensor)
    parameters = policy(features)

    actions = []
    log_probs = torch.Tensor(Variable())
    # Pull two consecutive entries at a time from the same iterator.
    entries = iter(parameters)
    for mu, std in zip(entries, entries):
        gaussian = dist.Normal(mu,std)
        draw = gaussian.sample()
        actions.append(draw)
        log_probs = _extend_history(log_probs, gaussian.log_prob(draw).unsqueeze(0))

    policy.policy_history = _extend_history(policy.policy_history, log_probs)
    return actions

In [None]:
def update_policy():
    '''
    Apply the policy-gradient update for the episode stored on the global
    ``policy`` and reset its per-episode buffers.

    Reads ``policy.reward_episode``, ``policy.policy_history``,
    ``policy.gamma`` and ``policy.action_dim``; steps the global
    ``optimizer``; appends to ``policy.loss_history`` and
    ``policy.reward_history``.
    '''
    R = 0
    # Build the returns newest-first with cheap appends, then flip once.
    # Each step's return is repeated once per action dimension so it lines up
    # element-wise with the flat log-probability history.
    newest_first = []
    for r in policy.reward_episode[::-1]:
        R = r + policy.gamma * R
        newest_first.extend([R] * policy.action_dim)
    rewards = newest_first[::-1]

    loss_steps = torch.mul(policy.policy_history,torch.Tensor(rewards)).mul(-1)
    # One row per time step. The row width must equal the repeat count used
    # above, so use policy.action_dim rather than a literal 8.
    loss_steps = loss_steps.view(-1,policy.action_dim)
    # NOTE(review): the optimizer is stepped once per row while the graph is
    # retained; confirm that backward through the retained graph is still
    # valid after parameters have been updated in place.
    for entry in loss_steps:
        print(entry)
        print(len(entry))
        optimizer.zero_grad()
        entry.sum().backward(retain_graph=True)
        optimizer.step()

    # Save and initialize episode history counters
    policy.loss_history.append(sum(loss_steps))
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode= []

In [None]:
# Run ten episodes with the function-based select_action/update_policy pair.
for episode in range(10):
    obs = env.reset()
    done = False

    while not done:
        action = select_action(obs)
        obs, reward, done, info = env.step(action)
        policy.reward_episode.append(reward)
    # update_policy() empties policy.reward_episode, so read the value to
    # report before calling it; indexing afterwards raises IndexError.
    last_reward = policy.reward_episode[-1]
    update_policy()
    print('Episode {}, Reward {}'.format(episode, last_reward))

In [None]:
def main(episodes):
    '''
    Train for up to ``episodes`` episodes using the global ``env``,
    ``policy``, ``select_action`` and ``update_policy``.

    Each episode runs for at most 1000 steps. A running average of episode
    length is kept, and training stops early once it exceeds
    ``env.spec.reward_threshold``.

    Args:
        episodes: maximum number of episodes to run.
    '''
    running_reward = 10
    for episode in range(episodes):
        state = env.reset() # Reset environment and record the starting state
        done = False

        for step in range(1000):
            action = select_action(state)
            # Step through environment using chosen action. select_action
            # returns a list, which has no `.data`, so pass it as-is.
            state, reward, done, _ = env.step(action)
            # Save reward
            policy.reward_episode.append(reward)
            if done:
                break

        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (step * 0.01)

        update_policy()
        if episode % 50 == 0:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(episode, step, running_reward))
        # NOTE(review): env.spec.reward_threshold is assumed to exist on this
        # environment object - confirm for the environment in use.
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, step))
            break

In [None]:
def train():
    '''Placeholder for the network training routine; currently does nothing.'''
    return None

# PPO Model Debug

# DDPG Model Debug