In [2]:
from env import *

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque

class SoftQNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(SoftQNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class SoftQLearningAgent:
    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99, tau=0.005, 
                 alpha=1, buffer_size=10000, batch_size=64, device='cpu'):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha  # Entropy temperature parameter
        self.batch_size = batch_size
        self.device = device  # CUDA or CPU
        
        # Q networks
        self.q_network = SoftQNetwork(state_dim, action_dim).to(device)
        self.target_q_network = SoftQNetwork(state_dim, action_dim).to(device)
        self.target_q_network.load_state_dict(self.q_network.state_dict())
        
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        
        # Replay buffer
        self.memory = deque(maxlen=buffer_size)
        
    def choose_action(self, state, explore=True):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.q_network(state)
        
        if explore:
            # Soft policy: use softmax with temperature
            action_probs = F.softmax(q_values / self.alpha, dim=1)
            action = torch.multinomial(action_probs, 1).squeeze().cpu().numpy()
            
            # Generate a full action array with randomness
            full_action = np.zeros(self.action_dim, dtype=np.float32)
            full_action[:-1] = np.random.uniform(-1, 1, self.action_dim-1)
            full_action[-1] = np.random.uniform(-1, 1)
            
            return full_action
        else:
            # Deterministic action selection
            action = np.zeros(self.action_dim, dtype=np.float32)
            action[:-1] = np.clip(q_values.squeeze().cpu().numpy()[:-1], -1, 1)
            action[-1] = np.clip(q_values.squeeze().cpu().numpy()[-1], -1, 1)
            return action
    
    def compute_entropy(self, q_values):
        # Compute entropy of action distribution
        action_probs = F.softmax(q_values / self.alpha, dim=1)
        log_probs = F.log_softmax(q_values / self.alpha, dim=1)
        entropy = -(action_probs * log_probs).sum(dim=1)
        return entropy
    
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        
        # Sample batch
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        
        # Current Q-values and entropy
        current_q_values = self.q_network(states)
        current_entropy = self.compute_entropy(current_q_values)
        
        # Target Q-values with entropy regularization
        with torch.no_grad():
            next_q_values = self.target_q_network(next_states)
            next_entropy = self.compute_entropy(next_q_values)
            
            # Soft Bellman backup
            soft_target_values = rewards + (1 - dones) * self.gamma * (
                torch.max(next_q_values, dim=1)[0] + self.alpha * next_entropy
            )
        
        # Q-value loss with entropy regularization
        q_loss = F.mse_loss(
            torch.sum(current_q_values * actions, dim=1), 
            soft_target_values
        )
        
        # Optional: Add entropy bonus to encourage exploration
        entropy_loss = torch.mean(current_entropy)
        
        # Combined loss
        loss = q_loss + self.alpha * entropy_loss
        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # Soft update of target network
        for target_param, param in zip(self.target_q_network.parameters(), self.q_network.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

    def save_model(self):
        torch.save(self.q_network.state_dict(), 'saved_rl_models/best_sql.pt')
def train_soft_q_learning(env, num_episodes=100000, max_steps=200, device=torch.device('cpu')):
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    
    agent = SoftQLearningAgent(state_dim, action_dim, device=device)
    
    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        
        for step in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            
            agent.store_transition(state, action, reward, next_state, done)
            agent.learn()
            
            state = next_state
            total_reward += reward
            
            if done:
                print('done')
                agent.save_model()
                break
        
        print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}")
    
    return agent

# Example usage
if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = ur5GymEnv(renders=True)  # Your custom environment
    trained_agent = train_soft_q_learning(env, device=device)


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  states = torch.FloatTensor(states).to(self.device)
  q_loss = F.mse_loss(


done
Episode 1/100000, Total Reward: [-527.9111537]
done
Episode 2/100000, Total Reward: [-581.71515909]
done
Episode 3/100000, Total Reward: [-505.65371377]
done
Episode 4/100000, Total Reward: [-393.8973314]
done
Episode 5/100000, Total Reward: [-872.29759583]
done
Episode 6/100000, Total Reward: [-1217.93712457]
done
Episode 7/100000, Total Reward: [-436.2340415]
done
Episode 8/100000, Total Reward: [-504.23521468]
done
Episode 9/100000, Total Reward: [-420.25886216]
done
Episode 10/100000, Total Reward: [-772.28532407]
done
Episode 11/100000, Total Reward: [-855.41307724]
done
Episode 12/100000, Total Reward: [-855.851159]
done
Episode 13/100000, Total Reward: [-895.14541605]
done
Episode 14/100000, Total Reward: [-1089.83684739]
done
Episode 15/100000, Total Reward: [-527.30338838]
done
Episode 16/100000, Total Reward: [-475.81352165]
done
Episode 17/100000, Total Reward: [-671.35051102]


KeyboardInterrupt: 

In [None]:
# import math
# import random
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# import matplotlib.pyplot as plt
# from env import *
# from torch.distributions.categorical import Categorical

# class ReplayBuffer:
#     def __init__(self, capacity):
#         self.capacity = capacity
#         self.buffer = []
#         self.position = 0

#     def push(self, state, action, reward, next_state, done):
#         if len(self.buffer) < self.capacity:
#             self.buffer.append(None)
#         self.buffer[self.position] = (state, action, reward, next_state, done)
#         self.position = (self.position + 1) % self.capacity

#     def sample(self, batch_size):
#         batch = random.sample(self.buffer, batch_size)
#         state, action, reward, next_state, done = map(np.stack, zip(*batch))
#         return state, action, reward, next_state, done

#     def __len__(self):
#         return len(self.buffer)

# class SoftQNetwork(nn.Module):
#     def __init__(self, state_dim, num_actions, alpha):
#         super(SoftQNetwork, self).__init__()

#         self.linear1 = nn.Linear(state_dim, 128)
#         self.linear2 = nn.Linear(128, 64)
#         self.linear3 = nn.Linear(64, num_actions)
#         self.alpha = alpha

#     def get_Q(self, state):
#         x = F.relu(self.linear1(state))
#         x = F.relu(self.linear2(x))
#         x = self.linear3(x)
#         return x

#     def get_V(self, q):
#         # print(q)
#         # print(q.shape)
#         v = self.alpha*torch.log(torch.mean(torch.exp(q/self.alpha)))
#         return v

# class SoftQ(object):
#     def __init__(self, state_dim, action_dim):
#         self.alpha = 2
#         self.soft_q_net = SoftQNetwork(state_dim, action_dim, self.alpha).to(device)
#         self.v_criterion = nn.MSELoss()
#         self.soft_q_criterion = nn.MSELoss()
#         self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=1e-3)
#         self.gamma = 0.9

#     def get_action(self, state):
#         py_state = torch.from_numpy(state).float()
#         temp_q = self.soft_q_net.get_Q(py_state)
#         # print(temp_q)
#         dist = torch.exp((temp_q-self.soft_q_net.get_V(temp_q))/self.alpha)
#         # print(dist)
#         dist = dist / torch.sum(dist)
#         # print(dist)
#         m = Categorical(dist.squeeze(0))
#         a = m.sample()
#         return dist

#     def train(self, batch):
#         state = batch[0]  # array [64 1 2]
#         action = batch[1]  # array [64, ]
#         reward = batch[2]  # array [64, ]
#         next_state = batch[3]
#         state = torch.from_numpy(state).float().to(device)
#         next_state = torch.from_numpy(next_state).float().to(device)
#         reward = torch.FloatTensor(reward).float().to(device)

#         q = self.soft_q_net.get_Q(state).squeeze(1)
#         est_q = q.clone()
#         print(est_q.shape)
#         next_q = self.soft_q_net.get_Q(next_state).squeeze(1)
#         next_v = self.soft_q_net.get_V(next_q)
#         for i in range(len(action)):
#             print(action[i])
#             est_q[i] = reward[i] + self.gamma * next_v[i]
#         q_loss = F.mse_loss(q, est_q.detach())
#         self.soft_q_optimizer.zero_grad()
#         q_loss.backward()
#         self.soft_q_optimizer.step()

# if __name__ == '__main__':
#     use_cuda = torch.cuda.is_available()
#     device = torch.device("cuda" if use_cuda else "cpu")
#     env = ur5GymEnv(renders=False)
#     state_dim = env.observation_space.shape[0]
#     action_dim = env.action_space.shape[0]
#     agent = SoftQ(state_dim = state_dim, action_dim = action_dim)
#     max_MC_iter = 200
#     max_epi_iter = 500
#     batch_size = 64
#     replay_buffer = ReplayBuffer(10000)
#     train_curve = []
#     for epi in range(max_epi_iter):
#         state = env.reset()
#         # state = state.reshape((1, state_dim))
#         acc_reward = 0
#         for MC_iter in range(max_MC_iter):
#             # print(state)
            
#             action = agent.get_action(state)
            
#             action = np.array(action.squeeze().detach().cpu())
#             print("action: ", action)
#             next_state, reward, done, _ = env.step(action)
#             acc_reward = acc_reward + reward
#             state = next_state
#             replay_buffer.push(state, action, reward, next_state, done)
#             if len(replay_buffer) > batch_size:
#                 agent.train(replay_buffer.sample(batch_size))
#             if done:
#                 break
#         print('Episode', epi, 'reward', acc_reward / MC_iter)
#         train_curve.append(acc_reward)
#     plt.plot(train_curve, linewidth=1, label='SAC')
#     plt.show()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


action:  [0.16830368 0.1689492  0.18071607 0.48203102]
action:  [0.16796921 0.16886152 0.18069643 0.4824728 ]
action:  [0.16763812 0.16837768 0.18005356 0.4839306 ]
action:  [0.1662529  0.16633344 0.17807178 0.48934188]
action:  [0.166573   0.16722819 0.17960267 0.4865962 ]
action:  [0.1660001  0.16640314 0.17845926 0.48913753]
action:  [0.16501935 0.16404252 0.17498639 0.4959518 ]
action:  [0.16483834 0.16248028 0.17172404 0.50095737]
action:  [0.1647227  0.16199186 0.17108999 0.5021955 ]
action:  [0.16480145 0.1616091  0.17099853 0.502591  ]
action:  [0.16410933 0.16088404 0.1693797  0.505627  ]
action:  [0.16445436 0.16052337 0.16973592 0.50528634]
action:  [0.16485702 0.16082394 0.17085885 0.50346017]
action:  [0.1638408  0.16105983 0.17206727 0.5030321 ]
action:  [0.1624472  0.15991025 0.17041087 0.50723165]
action:  [0.16230807 0.15868612 0.16831113 0.5106947 ]
action:  [0.16106854 0.15823193 0.16804372 0.51265585]
action:  [0.16050555 0.15837671 0.16903563 0.51208216]
action:  [

IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number