In [1]:
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import time

try:
    plt.switch_backend('TkAgg') 
except ImportError:
    pass

random.seed(42)
np.random.seed(42)

In [2]:
class ChaseEnv:
    """A 10x10 pursuit environment with fixed Traps."""
    
    def __init__(self, size=10, gamma=0.9):
        self.SIZE = size
        self.nA = 4  # 0: Up, 1: Down, 2: Left, 3: Right
        self.gamma = gamma
        
        self.TRAPS = self._generate_traps()
        
        self.reset()

    def _generate_traps(self):
        """Creates a static set of traps/obstacles."""
        traps = set()
        for r in [2, 7]:
            for c in [2, 7]:
                traps.add((r, c))
        return traps

    def reset(self):
        """Resets agent and prey to random non-trap locations."""
        self.agent_pos = self._get_safe_random_pos()
        self.prey_pos = self._get_safe_random_pos()
        self.is_terminal = False
        self.score = 0
        return self._get_state()

    def _get_safe_random_pos(self):
        """Gets a random position that is not a trap."""
        while True:
            pos = (random.randint(0, self.SIZE - 1), random.randint(0, self.SIZE - 1))
            if pos not in self.TRAPS:
                return pos
    
    def _get_state(self):
        """State Vector (6 features): [Distance_N, Danger_B, P_U, P_D, P_L, P_R]"""
        r_agent, c_agent = self.agent_pos
        r_prey, c_prey = self.prey_pos

        dist_to_prey = np.sqrt((r_agent - r_prey)**2 + (c_agent - c_prey)**2) / (self.SIZE * 1.5)
        
        danger_ahead = 0
        current_direction = random.randint(0,3) 
        dr, dc = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}[current_direction]
        next_r, next_c = r_agent + dr, c_agent + dc
        
        if next_r < 0 or next_r >= self.SIZE or next_c < 0 or next_c >= self.SIZE or (next_r, next_c) in self.TRAPS:
            danger_ahead = 1
            
        prey_direction = [r_prey < r_agent, r_prey > r_agent, c_prey < c_agent, c_prey > c_agent]
        
        state_vector = [dist_to_prey, danger_ahead] + prey_direction
        return np.array(state_vector)

    def _get_state_index(self, state_vector):
        """Discretizes the 6-feature vector into a single index (0-63)."""
        Food_Close = 1 if state_vector[0] < 0.2 else 0 
        Danger_Ahead = int(state_vector[1])
        F_U, F_D, F_L, F_R = [int(x) for x in state_vector[2:]]
        idx = (Food_Close * 32) + (Danger_Ahead * 16) + (F_U * 8) + (F_D * 4) + (F_L * 2) + F_R
        return idx

    def _move_prey(self):
        """The prey moves randomly every step."""
        r, c = self.prey_pos
        action = random.randint(0, 3)
        dr, dc = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}[action]
        
        nr, nc = r + dr, c + dc
        
        if 0 <= nr < self.SIZE and 0 <= nc < self.SIZE and (nr, nc) not in self.TRAPS:
            self.prey_pos = (nr, nc)

    def step(self, action):
        reward = -0.1 
        done = False
        
        r, c = self.agent_pos
        dr, dc = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}[action]
        
        next_r, next_c = r + dr, c + dc
        next_pos = (next_r, next_c)
        
        if next_r < 0 or next_r >= self.SIZE or next_c < 0 or next_c >= self.SIZE or next_pos in self.TRAPS:
            reward = -20.0 
            done = True 
        else:
            self.agent_pos = next_pos
            
            if self.agent_pos == self.prey_pos:
                reward = 20.0 
                self.score += 1
                self.prey_pos = self._get_safe_random_pos() 
                
            old_pos_array = np.array([r, c]) 
            new_pos_array = np.array(next_pos)
            
            old_dist = np.linalg.norm(old_pos_array - np.array(self.prey_pos))
            new_dist = np.linalg.norm(new_pos_array - np.array(self.prey_pos))
            
            if new_dist < old_dist:
                 reward += 0.2 

            self._move_prey()
        
        next_s_vector = self._get_state()
        return self._get_state_index(next_s_vector), reward, done, {}


In [3]:
class QAgent:
    def __init__(self, env, alpha=0.1, gamma=0.9):
        self.nS = 64
        self.nA = env.nA
        self.Q = np.zeros((self.nS, self.nA))
        self.alpha = alpha
        self.gamma = gamma

    def get_action(self, s_vector, epsilon, env):
        s = env._get_state_index(s_vector) 
        if random.random() < epsilon:
            return random.randint(0, self.nA - 1)
        else:
            return np.argmax(self.Q[s]) 
            
    def get_greedy_action(self, s_vector, env):
        s = env._get_state_index(s_vector)
        return np.argmax(self.Q[s])

    def update_Q(self, s_vector, a, r, ns_vector, env):
        s = env._get_state_index(s_vector)
        ns = env._get_state_index(ns_vector)
        old_value = self.Q[s, a]
        next_max = np.max(self.Q[ns])
        new_value = (1 - self.alpha) * old_value + self.alpha * (r + self.gamma * next_max)
        self.Q[s, a] = new_value


In [4]:
def train_chase_agent(env, agent, num_episodes=5000):
    epsilon = 1.0
    epsilon_min = 0.05
    epsilon_decay = 0.999 
    
    total_rewards = []
    print(f"Starting ChaseRL Training for {num_episodes} episodes...")

    for episode in tqdm(range(num_episodes), desc="Training"):
        s_vector = env.reset()
        s = env._get_state_index(s_vector)
        done = False
        episode_reward = 0
        
        for step in range(100): 
            a = agent.get_action(s_vector, epsilon, env)
            s, reward, done, _ = env.step(a)
            ns_vector = env._get_state()
            
            agent.update_Q(s_vector, a, reward, ns_vector, env)
            
            s_vector = ns_vector
            episode_reward += reward
            
            if done: break 
            
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        total_rewards.append(episode_reward)
        
    return total_rewards, agent

In [5]:
def visualize_chase(env, agent, num_runs=3, delay=0.1):
    """Animates the agent's learned greedy policy."""
    plt.ion() 
    fig, ax = plt.subplots(figsize=(6, 6))
    plt.show(block=False)

    for run in range(1, num_runs + 1):
        run_seed = random.randint(0, 1000)
        random.seed(run_seed) 
        np.random.seed(run_seed)
        env = ChaseEnv(size=10, gamma=0.9)
        
        s_vector = env.reset() 
        done = False
        
        print(f"\n--- Running Chase Demo {run}/{num_runs} ---")

        for step in range(200):
            
            s_vector = env._get_state()
            action = agent.get_greedy_action(s_vector, env)
            
            s, reward, done, _ = env.step(action)
            
            ax.cla() 
            grid = np.zeros((env.SIZE, env.SIZE))
            
            grid[env.agent_pos[0], env.agent_pos[1]] = 3.0 
            grid[env.prey_pos[0], env.prey_pos[1]] = 4.0 
            for r, c in env.TRAPS:
                grid[r, c] = 1.0 

            ax.imshow(grid, cmap='viridis', vmin=0, vmax=4)
            ax.set_title(f"Run {run} | Score: {env.score} | Step: {step}")
            ax.set_xticks(np.arange(env.SIZE)), ax.set_yticks(np.arange(env.SIZE))
            ax.grid(color='white', linestyle='-', linewidth=0.5)
            
            fig.canvas.draw()
            fig.canvas.flush_events()
            time.sleep(delay) 

            if done:
                print(f"Run {run} finished! Agent crashed. Final Score: {env.score}")
                break
        
        if not done:
            print(f"Run {run} finished! Max steps reached. Final Score: {env.score}")
            
        if run < num_runs:
            ax.cla()
            ax.set_title(f"Run {run} Complete. Score: {env.score}. Click to start Run {run+1}.")
            fig.canvas.draw()
            fig.canvas.flush_events()
            plt.waitforbuttonpress() 

    plt.ioff()
    plt.close(fig)



In [9]:
if __name__ == '__main__':
    ENV = ChaseEnv(size=10, gamma=0.9)
    AGENT = QAgent(ENV)
    
    REWARDS, FINAL_AGENT = train_chase_agent(ENV, AGENT, num_episodes=7000)
    
    window = 100
    smoothed_rewards = np.convolve(REWARDS, np.ones(window)/window, mode='valid')
    
    plt.figure(figsize=(10, 5))
    plt.plot(smoothed_rewards)
    plt.title("ChaseRL: Agent Learning Curve (Cumulative Reward)")
    plt.xlabel(f"Episode (Smoothed over {window} episodes)")
    plt.ylabel("Avg. Cumulative Reward (Higher = Better Policy)")
    plt.grid(True, alpha=0.3)
    plt.show(block=False) 
    
    visualize_chase(ENV, FINAL_AGENT, num_runs=3, delay=0.1)

Starting ChaseRL Training for 7000 episodes...


Training: 100%|██████████| 7000/7000 [00:01<00:00, 3622.47it/s]



--- Running Chase Demo 1/3 ---
Run 1 finished! Agent crashed. Final Score: 0

--- Running Chase Demo 2/3 ---
Run 2 finished! Agent crashed. Final Score: 0

--- Running Chase Demo 3/3 ---
Run 3 finished! Agent crashed. Final Score: 1
