In [1]:
import numpy as np 
import random 
import torch 
import torch.nn as nn 
import torch.optim as optim 

# Define your neural network architecture. 
# In this case, input_size = 9 (9 eyes), output_size = 4 (4 kinds of movements). 
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Layout: input_size -> 128 -> 64 -> output_size, with ReLU after each of
    the two hidden layers and a linear (un-activated) output layer.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        wide_units, narrow_units = 128, 64
        self.fc1 = nn.Linear(input_size, wide_units)
        self.fc2 = nn.Linear(wide_units, narrow_units)
        self.fc3 = nn.Linear(narrow_units, output_size)

    def forward(self, x):
        """Run the forward pass and return the raw Q-values for ``x``."""
        activations = x
        # Both hidden layers share the same pattern: affine map, then ReLU.
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        # The output layer stays linear so Q-values can take any sign.
        return self.fc3(activations)

# Define the Deep Q-Learning agent. 
class DQNAgent:
    """Epsilon-greedy Deep Q-Learning agent backed by a single QNetwork.

    The agent learns online: each call to ``train`` performs one gradient
    step on one transition and then decays the exploration rate.

    A state may be a 1-D vector of length ``state_size`` or a 2-D array whose
    rows have length ``state_size``.  In the 2-D case the network evaluates
    every row, but both the greedy action and the TD update use the first
    row's Q-values, so the chosen action is always a valid action index.
    """

    def __init__(self, state_size, action_size, gamma = 0.99, epsilon = 1.0, epsilon_decay = 0.995, epsilon_min = 0.01):
        """Create the agent.

        Args:
            state_size: Length of one state vector (network input width).
            action_size: Number of discrete actions (network output width).
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of choosing a random action.
            epsilon_decay: Factor epsilon is multiplied by after each train step.
            epsilon_min: Floor below which decay never pushes epsilon.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.q_network = QNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr = 0.001)

    def _to_batch(self, state):
        """Convert ``state`` to a float32 tensor of shape (rows, state_size)."""
        return torch.tensor(state, dtype = torch.float32).reshape(-1, self.state_size)

    def select_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value in the first state row.

        Returns:
            A Python ``int`` in ``range(action_size)``.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        with torch.no_grad():
            q_values = self.q_network(self._to_batch(state))
        # Arg-max over one row only: a flattened arg-max over several rows
        # could yield an index outside the action space.
        return int(np.argmax(q_values[0].numpy()))

    def train(self, state, action, reward, next_state, done):
        """Perform one Q-learning update on a single transition.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: State observed after the action.
            done: True if the transition ended the episode; the target is
                then the reward alone, with no bootstrapped future value.
        """
        action = int(action)
        q_values = self.q_network(self._to_batch(state))

        # The TD target is a constant for this update, so it is built
        # without tracking gradients.
        with torch.no_grad():
            if done:
                td_target = reward
            else:
                next_q_values = self.q_network(self._to_batch(next_state))
                td_target = reward + self.gamma * next_q_values[0].max()

        # Copy the current predictions and overwrite only the taken action,
        # so every other entry contributes zero error.
        target = q_values.detach().clone()
        target[0, action] = td_target

        loss = nn.MSELoss()(q_values, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration, clamping so epsilon never drops below the floor.
        # The guard leaves an epsilon already at/below the floor untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Define your 2D environment class. 
class Environment:
    """Bounded 2-D grid world with red apples and green poisonous things.

    The agent starts at [50, 50] and moves one cell per step.  Landing on a
    red apple gives +1.0 and removes the apple; landing on a green poisonous
    thing gives -1.0 and ends the episode.  Reaching or pushing against the
    edge of the grid in the direction of travel also ends the episode.

    The observation returned by ``step`` is always the constant
    ``initial_state`` array; it does not encode what the eyes would see.
    """

    # action index -> (label, (dx, dy)).  Any other index is reported as
    # "unknown" and leaves the agent where it is.
    _ACTION_TABLE = {
        0: ("move_left", (-1, 0)),
        1: ("move_right", (1, 0)),
        2: ("move_up", (0, 1)),
        3: ("move_down", (0, -1)),
    }

    def __init__(self, num_red_apples, num_green_poisonous_things):
        """Build the grid and scatter the items.

        Args:
            num_red_apples: Number of red apples to place.
            num_green_poisonous_things: Number of green poisonous things to place.

        Raises:
            ValueError: If either count exceeds the number of grid cells.
        """
        # Sensor layout: 9 eyes, 3 values per eye.
        self.num_eyes = 9
        self.num_values_per_eye = 3
        self.max_visibility_distance = 10.0 # Adjust as needed

        # Inclusive grid boundaries.
        self.x_min = 0
        self.x_max = 100 # Adjust as needed
        self.y_min = 0
        self.y_max = 100 # Adjust as needed

        # 3 x 9 array filled with the maximum visibility distance.
        self.initial_state = np.full((self.num_values_per_eye, self.num_eyes), self.max_visibility_distance)

        self.num_red_apples = num_red_apples
        self.num_green_poisonous_things = num_green_poisonous_things

        # Each kind is placed on distinct cells; the two kinds are drawn
        # independently of each other.
        self.red_apple_positions = self._generate_random_positions(self.num_red_apples)
        self.green_poisonous_thing_positions = self._generate_random_positions(self.num_green_poisonous_things)

        # Snapshots of the starting layout so reset() can restore it.
        self._initial_red_apple_positions = [list(p) for p in self.red_apple_positions]
        self._initial_green_poisonous_thing_positions = [list(p) for p in self.green_poisonous_thing_positions]
        self._start_position = [50, 50]

        self.agent_position = list(self._start_position)

    def reset(self):
        """Restore the starting layout and agent position for a new episode.

        Returns:
            The initial observation (``initial_state``).
        """
        self.agent_position = list(self._start_position)
        self.red_apple_positions = [list(p) for p in self._initial_red_apple_positions]
        self.green_poisonous_thing_positions = [list(p) for p in self._initial_green_poisonous_thing_positions]
        return self.initial_state

    def _generate_random_positions(self, num_positions):
        """Return ``num_positions`` distinct random [x, y] cells inside the grid.

        Raises:
            ValueError: If more positions are requested than the grid has
                cells (rejection sampling could never finish).
        """
        capacity = (self.x_max - self.x_min + 1) * (self.y_max - self.y_min + 1)
        if num_positions > capacity:
            raise ValueError(
                f"cannot place {num_positions} distinct positions on a grid of {capacity} cells"
            )

        taken = set()  # O(1) duplicate check
        positions = []
        while len(positions) < num_positions:
            candidate = (random.randint(self.x_min, self.x_max), random.randint(self.y_min, self.y_max))
            if candidate in taken:
                continue
            taken.add(candidate)
            positions.append(list(candidate))
        return positions

    def _get_agent_positions(self):
        """Return the agent's current position wrapped in a list."""
        return [self.agent_position]

    def step(self, action):
        """Apply ``action`` and report its outcome.

        The agent moves first; reward and termination are then evaluated at
        the new position, so they are credited to the action that caused them.

        Args:
            action: 0 = left, 1 = right, 2 = up, 3 = down.  Any other value
                is labelled "unknown" and does not move the agent.

        Returns:
            Tuple ``(next_state, reward, done, action_string, agent_position)``
            where ``agent_position`` is a copy of the agent's [x, y].
        """
        action_string, (dx, dy) = self._ACTION_TABLE.get(action, ("unknown", (0, 0)))

        # 1. Move, unless the move would leave the grid.
        new_x = self.agent_position[0] + dx
        new_y = self.agent_position[1] + dy
        if self.x_min <= new_x <= self.x_max and self.y_min <= new_y <= self.y_max:
            self.agent_position[0] = new_x
            self.agent_position[1] = new_y

        # 2. The episode ends when the agent is on the edge it moved towards.
        x, y = self.agent_position
        done = (
            (dx < 0 and x <= self.x_min)
            or (dx > 0 and x >= self.x_max)
            or (dy > 0 and y >= self.y_max)
            or (dy < 0 and y <= self.y_min)
        )

        # 3. Reward for what the agent is now standing on.
        reward = 0
        for index, apple in enumerate(self.red_apple_positions):
            if apple == self.agent_position:  # eaten apples are None and never match
                reward = reward + 1.0
                self.red_apple_positions[index] = None  # remove the collected apple
                break

        if self.agent_position in self.green_poisonous_thing_positions:
            reward = reward - 1.0
            done = True  # poison terminates the episode

        # 4. Observation (constant in this simple example).
        next_state = self.initial_state

        # Return a copy so callers are not affected by later moves.
        return next_state, reward, done, action_string, list(self.agent_position)

# Build the shared 2D grid world used for training:
# 351 red apples and 289 green poisonous things.
env = Environment(num_red_apples=351, num_green_poisonous_things=289)
    
# Define the training loop that runs a DQN agent in the given 2D environment
def train_dqn(num_episodes, env):
    """Train a fresh DQNAgent on ``env`` and log every step.

    At the start of each episode the environment's ``reset`` method is
    called if it has one, so an episode does not begin where the previous
    one ended; environments without ``reset`` are used as they are.  The
    starting observation is always ``env.initial_state``.

    Args:
        num_episodes: Number of episodes to run.
        env: Environment exposing ``initial_state`` and
            ``step(action) -> (next_state, reward, done, action_string, agent_position)``.

    Returns:
        List with the total reward collected in each episode.
    """
    state_size = 9 # Width of one state row (9 eyes)
    action_size = 4 # Number of possible actions (4 kinds of movements)

    agent = DQNAgent(state_size, action_size)
    episode_rewards = []

    for episode in range(num_episodes):
        # Start every episode from a clean environment when it supports it.
        if callable(getattr(env, "reset", None)):
            env.reset()
        state = env.initial_state # 3 * 9 array, elements: 10.0
        total_reward = 0
        done = False

        print(f"Episode: {episode + 1}")

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, action_string, agent_position = env.step(action)

            # Learn online from the transition that was just observed.
            agent.train(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            print(f"Action: {action_string}, Done: {done}, Reward: {reward}, Agent Position: {agent_position}")

        print(f"Total Reward: {total_reward}\n")
        episode_rewards.append(total_reward)

    return episode_rewards

# Main training loop 
if __name__ == "__main__":
    # Episode budget for this run; raise or lower it as needed.
    num_episodes = 100
    train_dqn(num_episodes=num_episodes, env=env)

Episode: 1
Action: move_right, Done: False, Reward: 0, Agent Position: [51, 50]
Action: move_right, Done: False, Reward: 0, Agent Position: [52, 50]
Action: move_down, Done: False, Reward: 0, Agent Position: [52, 49]
Action: move_up, Done: False, Reward: 0, Agent Position: [52, 50]
Action: move_right, Done: False, Reward: 0, Agent Position: [53, 50]
Action: move_down, Done: False, Reward: 0, Agent Position: [53, 49]
Action: move_right, Done: False, Reward: 0, Agent Position: [54, 49]
Action: move_up, Done: False, Reward: 0, Agent Position: [54, 50]
Action: move_down, Done: False, Reward: 0, Agent Position: [54, 49]
Action: move_down, Done: False, Reward: 0, Agent Position: [54, 48]
Action: move_up, Done: False, Reward: 0, Agent Position: [54, 49]
Action: move_left, Done: False, Reward: 0, Agent Position: [53, 49]
Action: move_right, Done: False, Reward: 0, Agent Position: [54, 49]
Action: move_left, Done: False, Reward: 0, Agent Position: [53, 49]
Action: move_right, Done: False, Rewar