In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
import numpy as np

import random

import gymnasium as gym
from gymnasium import spaces

class TerraBot(gym.Env):
	# [weight, humidity, temperature, light level]
	# TODO find actual optimal weight
	def __init__(self, size=5):
		self.targets = np.array([400, 75, 26, 900])

		# SENSORS------------------------------------  ACTUATORS----------------------------
		# [weight, humidity, temperature, light level, led strength, fan on/off, pump on/off]
		self.observation_space = spaces.Box(0, 1000, shape=(7,), dtype=int)

		# (255 - 0 + 1) + 2 + 2 = 256 + 4 + 260
		self.action_space = spaces.Discrete(260)

		for i in range(256):
			self._action_to_actuator[i] = np.array([i, 0, 0])
		self._action_to_actuator[256] = [0, 0, 0]
		self._action_to_actuator[257] = [0, 1, 0]
		self._action_to_actuator[258] = [0, 0, 0]
		self._action_to_actuator[259] = [0, 0, 1]

	def _get_observations(self):
		return np.concatenate((self._sensors, self._actuators))
	def _get_info(self):
		return np.linalg.norm(self._sensors - self.targets, ord=2)

	def reset(self, seed=None, options=None):
		super().reset(seed=seed)

		self._sensors = np.array([
			random.SystemRandom().randint(0, 1000),
			random.SystemRandom().randint(0, 100),
			random.SystemRandom().randint(10, 40),
			random.SystemRandom().randint(0, 1000)
		])
		self._actuators = np.array([
			random.SystemRandom().randint(0, 255),
			random.SystemRandom().randint(0, 1),
			random.SystemRandom().randint(0, 1)
		])

		return self._get_observations(), self._get_info()

	def step(self, action):
		self._actuators = self._action_to_actuator[action]

		terminated = np.array_equal(self._agent_location, self._target_location)
		reward = 1 if terminated else 0  # Binary sparse rewards
		observation = self._get_observations()
		info = self._get_info()

		if self.render_mode == "human":
			self._render_frame()

		return observation, reward, terminated, False, info


In [4]:
env = GridWorldEnv()

In [5]:
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Define the policy network
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, 32)
        self.fc2 = nn.Linear(32, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return torch.softmax(x, dim=-1)

policy = Policy()


In [6]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(policy.parameters())


In [7]:
def update_policy(rewards, log_probs, optimizer):
    log_probs = torch.stack(log_probs)
    loss = -torch.mean(log_probs * sum(rewards))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [9]:
for episode in range(10000):
	state, _ = env.reset()
	done = False
	rewards = []
	log_probs = []
	
	while not done:
		# Select action
		state = torch.tensor(state, dtype=torch.float32).reshape(1, -1)
		probs = policy(state)
		action = torch.multinomial(probs, 1).item()
		log_prob = torch.log(probs[0, action])

		# Take step
		next_state, reward, done, _, _ = env.step(action)
		rewards.append(reward)
		log_probs.append(log_prob)
		state = next_state
		
	# Update policy
	if episode % 1000 == 0:
		print(f"Episode {episode}: {sum(rewards)}")
	update_policy(rewards, log_probs, optimizer)
	rewards = []
	log_probs = []


Episode 0: 1
Episode 1000: 1
Episode 2000: 1
Episode 3000: 1
Episode 4000: 1
Episode 5000: 1
Episode 6000: 1
Episode 7000: 1
Episode 8000: 1
Episode 9000: 1


In [12]:
state, _ = env.reset()

print(state)

state = torch.tensor(state, dtype=torch.float32).reshape(1, -1)
probs = policy(state)

action = torch.multinomial(probs, 1).item()

action


[0 4 4 1]


2