In [1]:
from env import *

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque

class SoftQNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(SoftQNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class SoftQLearningAgent:
    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99, tau=0.005, 
                 alpha=0.2, buffer_size=10000, batch_size=64):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha  # Entropy temperature parameter
        self.batch_size = batch_size
        
        # Q networks
        self.q_network = SoftQNetwork(state_dim, action_dim)
        self.target_q_network = SoftQNetwork(state_dim, action_dim)
        self.target_q_network.load_state_dict(self.q_network.state_dict())
        
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        
        # Replay buffer
        self.memory = deque(maxlen=buffer_size)
        
    def choose_action(self, state, explore=True):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.q_network(state)
        
        if explore:
            # Soft policy: use softmax with temperature
            action_probs = F.softmax(q_values / self.alpha, dim=1)
            action = torch.multinomial(action_probs, 1).squeeze().numpy()
            
            # Generate a full action array with randomness
            full_action = np.zeros(self.action_dim, dtype=np.float32)
            full_action[:-1] = np.random.uniform(-1, 1, self.action_dim-1)
            full_action[-1] = np.random.uniform(-1, 1)
            
            return full_action
        else:
            # Deterministic action selection
            action = np.zeros(self.action_dim, dtype=np.float32)
            action[:-1] = np.clip(q_values.squeeze().numpy()[:-1], -1, 1)
            action[-1] = np.clip(q_values.squeeze().numpy()[-1], -1, 1)
            return action
    
    def compute_entropy(self, q_values):
        # Compute entropy of action distribution
        action_probs = F.softmax(q_values / self.alpha, dim=1)
        log_probs = F.log_softmax(q_values / self.alpha, dim=1)
        entropy = -(action_probs * log_probs).sum(dim=1)
        return entropy
    
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        
        # Sample batch
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)
        
        # Current Q-values and entropy
        current_q_values = self.q_network(states)
        current_entropy = self.compute_entropy(current_q_values)
        
        # Target Q-values with entropy regularization
        with torch.no_grad():
            next_q_values = self.target_q_network(next_states)
            next_entropy = self.compute_entropy(next_q_values)
            
            # Soft Bellman backup
            soft_target_values = rewards + (1 - dones) * self.gamma * (
                torch.max(next_q_values, dim=1)[0] + self.alpha * next_entropy
            )
        
        # Q-value loss with entropy regularization
        q_loss = F.mse_loss(
            torch.sum(current_q_values * actions, dim=1), 
            soft_target_values
        )
        
        # Optional: Add entropy bonus to encourage exploration
        entropy_loss = torch.mean(current_entropy)
        
        # Combined loss
        loss = q_loss + self.alpha * entropy_loss
        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # Soft update of target network
        for target_param, param in zip(self.target_q_network.parameters(), self.q_network.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

def train_soft_q_learning(env, num_episodes=1000, max_steps=200):
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    
    agent = SoftQLearningAgent(state_dim, action_dim)
    
    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        
        for step in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            
            agent.store_transition(state, action, reward, next_state, done)
            agent.learn()
            
            state = next_state
            total_reward += reward
            
            if done:
                break
        
        print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}")
    
    return agent

# Example usage
if __name__ == "__main__":
    env = ur5GymEnv()  # Your custom environment
    trained_agent = train_soft_q_learning(env)

jointInfo(id=0, name='world_joint', type='FIXED', lowerLimit=0.0, upperLimit=-1.0, maxForce=0.0, maxVelocity=0.0, controllable=False)
jointInfo(id=1, name='shoulder_pan_joint', type='REVOLUTE', lowerLimit=-6.28318530718, upperLimit=6.28318530718, maxForce=150.0, maxVelocity=3.14, controllable=True)
jointInfo(id=2, name='shoulder_lift_joint', type='REVOLUTE', lowerLimit=-6.28318530718, upperLimit=6.28318530718, maxForce=150.0, maxVelocity=3.14, controllable=True)
jointInfo(id=3, name='elbow_joint', type='REVOLUTE', lowerLimit=-3.14159265359, upperLimit=3.14159265359, maxForce=150.0, maxVelocity=3.14, controllable=True)
jointInfo(id=4, name='wrist_1_joint', type='REVOLUTE', lowerLimit=-6.28318530718, upperLimit=6.28318530718, maxForce=28.0, maxVelocity=6.28, controllable=True)
jointInfo(id=5, name='wrist_2_joint', type='REVOLUTE', lowerLimit=-6.28318530718, upperLimit=6.28318530718, maxForce=28.0, maxVelocity=6.28, controllable=True)
jointInfo(id=6, name='wrist_3_joint', type='REVOLUTE',

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  states = torch.FloatTensor(states)
  q_loss = F.mse_loss(


Episode 1/1000, Total Reward: [-442.44053298]
Episode 2/1000, Total Reward: [-1060.83884292]


error: getLinkState failed.