Step 1: Set up the environment
The first task is to create a grid world where agents can move between two locations A and B. Here's a simple way to implement the environment:

In [None]:
import numpy as np

# --- Grid-world configuration -------------------------------------------
grid_size = 5  # side length of the square grid (5 x 5 cells)
A = (0, 0)  # cell used as location A
B = (4, 4)  # cell used as location B
actions = ['up', 'down', 'left', 'right']  # action names understood by Agent.move

# All-zero float array with one entry per grid cell
grid = np.zeros(shape=(grid_size, grid_size))

# Define agent class
class Agent:
    """A grid-world agent: a cell position plus an item-carrying flag."""

    def __init__(self, start_position):
        # Two-element position; the first coordinate changes with up/down,
        # the second with left/right (see move()).
        self.position = start_position
        # Every agent starts without an item.
        self.carrying_item = False

    def move(self, action):
        """Step one cell in the direction named by ``action``.

        The position is left unchanged when the step would leave the grid
        (bounds are 0 .. grid_size - 1 on both coordinates) or when
        ``action`` is not one of 'up', 'down', 'left', 'right'.
        """
        first, second = self.position
        if action == 'up':
            if first > 0:
                self.position = (first - 1, second)
        elif action == 'down':
            if first < grid_size - 1:
                self.position = (first + 1, second)
        elif action == 'left':
            if second > 0:
                self.position = (first, second - 1)
        elif action == 'right':
            if second < grid_size - 1:
                self.position = (first, second + 1)


Step 2: Set up Q-learning (Tabular Q-learning)
In Q-learning, the agent learns a policy by updating a Q-table. We'll use the Q-table to store the action-value function. Here's how to implement Q-learning:

In [None]:
import random

class QLearningAgent:
    """Tabular Q-learner that acts with an epsilon-greedy policy."""

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.alpha = alpha      # step size applied to each Q-value correction
        self.gamma = gamma      # discount applied to the next state's best value
        self.epsilon = epsilon  # probability threshold for taking a random action
        self.q_table = {}       # maps state -> {action: Q-value}

    def _row(self, state):
        """Return the action-value dict for ``state``, creating it with zeros if absent."""
        if state not in self.q_table:
            self.q_table[state] = {a: 0 for a in actions}
        return self.q_table[state]

    def get_state(self, agent):
        """Build the state key: the agent's position extended with its carrying flag."""
        cell = agent.position
        return cell + (agent.carrying_item,)

    def choose_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule."""
        # Explore: a uniform draw below epsilon selects a random action.
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(actions)

        # Exploit: take the action with the highest stored value
        # (an unseen state is first given an all-zero row).
        values = self._row(state)
        return max(values, key=values.get)

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition."""
        # Make sure both rows exist (current state first, then next state).
        current = self._row(state)
        upcoming = self._row(next_state)

        previous = current[action]
        best_next_value = max(upcoming.values())

        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        current[action] = previous + self.alpha * (
            reward + self.gamma * best_next_value - previous)


Step 3: Define the task and rewards
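Now we need to define the reward function and the criterion for a head-on collision. In the code below, a collision is penalized when two agents occupy the same cell and exactly one of them is carrying an item.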

In [None]:
def get_reward(agent, other_agents):
    """Score the agent's current cell and apply any pick-up or drop-off.

    Returns:
        -10 if some agent in ``other_agents`` is on the same cell and exactly
            one of the two is carrying an item (treated as a head-on collision);
        10  if the agent is on A while carrying (the item is dropped);
        0   if the agent is on B while not carrying (an item is picked up);
        -1  in every other case (per-step cost).

    Side effect: ``agent.carrying_item`` is flipped on pick-up and drop-off.
    """
    here = agent.position
    loaded = agent.carrying_item

    # Same cell + opposite carrying states => collision penalty.
    collided = any(
        here == other.position and bool(loaded) != bool(other.carrying_item)
        for other in other_agents
    )
    if collided:
        return -10

    # Drop-off at A: the only positively rewarded event.
    if here == A and loaded:
        agent.carrying_item = False
        return 10

    # Pick-up at B: no reward, but the agent is now carrying.
    if here == B and not loaded:
        agent.carrying_item = True
        return 0

    return -1


Step 4: Training the agents
Now, we can set up the environment, initialize the agents, and train them using Q-learning.

In [None]:
# Initialize agents
# Four agents, all starting on cell A. Each agent is paired by index with its
# own independent Q-learner, so every agent has a separate Q-table.
agents = [Agent(A) for _ in range(4)]
q_learning_agents = [QLearningAgent() for _ in range(4)]

# Training loop
# NOTE: each "episode" advances every agent by exactly one step. Positions and
# carrying flags are never reset, so the run is one continuous trajectory of
# `episodes` steps per agent.
episodes = 1000
for episode in range(episodes):
    # Agents act one after another, so an agent later in the list sees the
    # already-updated positions of the agents before it.
    for agent, q_agent in zip(agents, q_learning_agents):
        # Capture the state before acting; it is the "s" of the Q-update.
        state = q_agent.get_state(agent)
        action = q_agent.choose_action(state)
        
        # Move the agent
        agent.move(action)
        
        # Get reward and next state
        # The full agent list (including `agent` itself) is passed in.
        # get_reward may flip agent.carrying_item, so next_state must be read
        # after this call to reflect a pick-up or drop-off.
        reward = get_reward(agent, agents)
        next_state = q_agent.get_state(agent)
        
        # Update Q-table
        q_agent.update_q_table(state, action, reward, next_state)

    # Optionally print progress
    # Fires on episodes 0, 100, 200, ... (the index printed is zero-based).
    if episode % 100 == 0:
        print(f"Episode {episode}/{episodes} complete")


Step 5: Testing the learned policy
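After training, the agents should have learned to move from A to B and back while avoiding collisions. The snippet below is a minimal check of the learned behavior: it lets each agent take a single step from its current position and prints where it ends up.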

In [None]:
def test_agents():
    """Let every agent take one step with its own learner and print the result.

    Reads the module-level ``agents`` and ``q_learning_agents`` lists and
    mutates each agent's position. Actions come from ``choose_action``, which
    is epsilon-greedy, so a step may still be a random exploratory move.
    """
    # Pair each agent with the learner at the same index, matching the pairing
    # used during training. Driving every agent from q_learning_agents[0]
    # would apply the first agent's Q-table to all of them.
    for agent, q_agent in zip(agents, q_learning_agents):
        print(f"Agent started at {agent.position}")
        # Query the agent's own Q-table for its current state
        state = q_agent.get_state(agent)
        action = q_agent.choose_action(state)
        agent.move(action)
        print(f"Agent moved to {agent.position}")
