In [2]:
import torch
import cql.environment as env
import cql.nn.model as model

# ---------------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------------
# Seed PyTorch's default random number generator before anything is built so
# that repeated runs of this cell start from the same RNG state.
# NOTE(review): if cql also draws from Python's `random` or from NumPy, those
# generators need their own seed - confirm against the cql source.
random_seed = 0
torch.manual_seed(random_seed)

# ---------------------------------------------------------------------------
# Training hyperparameters
# ---------------------------------------------------------------------------
max_epochs = 100
max_episodes = 5000

training_batch_size = 200
learning_rate = 0.02
discount_factor = 0.99
# NOTE(review): the two exploration settings below are defined here but are
# never passed to model.run_sim in this cell - confirm whether run_sim is
# supposed to receive them.
exploration_probability = 0.5
exploration_discount = 0.9

critic_training_step = 200
actor_training_step = 200

# Weight parameters for the reward function
# [0] -> X_Difference
# [1] -> Y_Difference
# [2] -> Area_Reward
# [3] -> Boundary_Constraint
weight_params = [-1.0, -1.0, -10.0, -5.0]

# ---------------------------------------------------------------------------
# Environment layout
# ---------------------------------------------------------------------------
starting_pos = (4, 4)
target_pos = (0, 0)
env_dim = (5, 5)
# Each entry is a (coordinate, weight) pair handed to Environment.set_weights.
env_obs = [((1, 1), -9999), ((2, 1), -9999), ((1, 2), -9999), ((2, 2), -9999)]

# NOTE(review): goal_radius is not used anywhere in this cell - confirm
# whether it should be forwarded to the environment or to run_sim.
goal_radius = 0.2

# ---------------------------------------------------------------------------
# Device selection
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Currently using {device} device")

# ---------------------------------------------------------------------------
# Build the environment and show its weight grid
# ---------------------------------------------------------------------------
e = env.Environment(
    dim=env_dim,
    target_pos=target_pos,
    start_pos=starting_pos,
)
e.set_weights(env_obs)
e.display()

# ---------------------------------------------------------------------------
# Run the simulation
# ---------------------------------------------------------------------------
# Create the reward weights directly on the target device rather than
# building them on the CPU and copying afterwards.
weights = torch.tensor(weight_params, dtype=torch.float, device=device)
model.run_sim(
    environment=e,
    weights=weights,
    device=device,
    learning_rate=learning_rate,
    discount_factor=discount_factor,
    max_epochs=max_epochs,
    max_episodes=max_episodes,
    training_batch_size=training_batch_size,
    critic_training_step=critic_training_step,
    actor_training_step=actor_training_step,
)

Currently using cpu device
Setting weight with value: -9999, at coordinate: (1, 1)
Setting weight with value: -9999, at coordinate: (2, 1)
Setting weight with value: -9999, at coordinate: (1, 2)
Setting weight with value: -9999, at coordinate: (2, 2)
tensor([[    0.,     0.,     0.,     0.,     0.],
        [    0., -9999., -9999.,     0.,     0.],
        [    0., -9999., -9999.,     0.,     0.],
        [    0.,     0.,     0.,     0.,     0.],
        [    0.,     0.,     0.,     0.,     0.]])
Current Epoch: 0
Linear Diff: tensor([-4.8390, -3.8234], grad_fn=<NegBackward0>), Area Reward: (tensor(0.), True), Boundary Constraint: -1.0
tensor([4., 4.]) -> tensor([4.8390, 3.8234], grad_fn=<AddBackward0>), action: tensor([ 0.8390, -0.1766], grad_fn=<TanhBackward0>)
Linear Diff: tensor([-5.4926, -3.5278], grad_fn=<NegBackward0>), Area Reward: (tensor(-6.8310), False), Boundary Constraint: 0.0
tensor([4.8390, 3.8234], grad_fn=<AddBackward0>) -> tensor([5.4926, 3.5278], grad_fn=<AddBackward0

KeyboardInterrupt: 