In [20]:
from DQN.model import DQN
from DQN.utils import select_action_dqn
from PPO.models.ppo_cnn import PPOCNN
from cathedral_rl import cathedral_v0
from tqdm import tqdm
import numpy as np
import torch
import pygame
from PIL import Image
import os

In [2]:
# PPO hyperparameters. They are read as module-level globals wherever a
# PPOCNN agent is constructed in this notebook.

# Optimiser step sizes.
lr_actor = 5e-3     # learning rate handed to PPOCNN as `lr_actor`
lr_critic = 2e-3    # learning rate handed to PPOCNN as `lr_critic`

# Policy-update settings.
K_epochs = 40       # update the policy for K epochs
eps_clip = 0.2      # PPO clip parameter
gamma = 0.95        # discount factor
gae_lambda = 0.95   # defined here but not passed to PPOCNN in the visible cells

In [13]:
def dqn_vs_ppo(model_dqn, model_ppo, board_size, device, num_episodes_eval=1000, epsilon=0.15, temperature=0.01):
    """Play a DQN agent against a PPO agent and report rewards and win rates.

    Both checkpoints are read from ``models/<name>.pth``. The DQN agent always
    moves as ``player_0`` and the PPO agent as ``player_1``. The PPO agent is
    built with the module-level hyperparameters ``lr_actor``, ``lr_critic``,
    ``gamma``, ``K_epochs`` and ``eps_clip``.

    Args:
        model_dqn: File name (without ``.pth``) of the DQN checkpoint; it must
            contain a ``'model_state_dict'`` entry.
        model_ppo: File name (without ``.pth``) of the PPO checkpoint; either a
            dict with a ``'model_state_dict'`` entry or a bare state dict.
        board_size: Board size forwarded to ``cathedral_v0.env``.
        device: Torch device the DQN network is moved to and onto which both
            checkpoints are mapped when loaded.
        num_episodes_eval: Number of games to play.
        epsilon: Unused; kept only so existing callers keep working.
        temperature: Last positional argument forwarded to
            ``select_action_dqn`` (presumably the Boltzmann temperature --
            confirm against ``DQN.utils``).

    Returns:
        Tuple ``(avg_reward_dqn, winrate_dqn, avg_reward_ppo, winrate_ppo,
        winrate_draw)``. Averages and rates are ``0.0`` when nothing was played.
    """
    env = cathedral_v0.env(board_size=board_size, render_mode="text", per_move_rewards=True, final_reward_score_difference=False)
    env.reset()

    player_0 = 'player_0'  # controlled by the DQN agent
    player_1 = 'player_1'  # controlled by the PPO agent
    n_actions = env.action_space(player_1).n
    obs_shape = env.observe(player_1)["observation"].shape

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    dqn_net = DQN(obs_shape, n_actions).to(device)
    checkpoint_dqn = torch.load(f"models/{model_dqn}.pth", map_location=device, weights_only=False)
    dqn_net.load_state_dict(checkpoint_dqn['model_state_dict'])
    dqn_net.eval()  # inference only: disable any train-time layer behaviour

    checkpoint_ppo = torch.load(f"models/{model_ppo}.pth", map_location=device, weights_only=False)
    ppo_agent = PPOCNN(
        obs_shape=obs_shape,
        action_dim=n_actions,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        gamma=gamma,
        K_epochs=K_epochs,
        eps_clip=eps_clip
    )
    if 'model_state_dict' in checkpoint_ppo:
        ppo_agent.policy.load_state_dict(checkpoint_ppo['model_state_dict'])
    else:
        # Direct state dict (from .save() method)
        ppo_agent.policy.load_state_dict(checkpoint_ppo)

    list_reward_dqn = []
    list_reward_ppo = []
    win_count_dqn = 0
    win_count_ppo = 0
    win_count_draw = 0
    episodes_played = 0

    for episode in tqdm(range(num_episodes_eval)):
        env.reset()
        # Snapshot the agent order: the loop below runs until env.agents is
        # empty, so the live list cannot be relied on for the winner lookup.
        list_agents = list(env.agents)

        while env.agents:
            current_agent = env.agent_selection
            observation = env.observe(current_agent)
            state = observation["observation"]
            action_mask = observation["action_mask"]
            legal_moves = [i for i, valid in enumerate(action_mask) if valid]

            # NOTE: the reward is read before the agent's own step, i.e. it is
            # whatever env.rewards holds for that agent when its turn starts.
            if current_agent == player_0:  # dqn plays
                _, action = select_action_dqn(dqn_net, state, action_mask, legal_moves, device, 'boltzmann', 0, temperature)
                list_reward_dqn.append(env.rewards[current_agent])
            else:  # ppo plays
                with torch.no_grad():
                    action = ppo_agent.select_action_evaluation(state, action_mask)
                list_reward_ppo.append(env.rewards[current_agent])

            env.step(action)

        episodes_played += 1
        winner = env.winner
        if winner == -1:  # -1 is counted as a draw
            win_count_draw += 1
        elif list_agents[winner] == player_0:  # dqn wins
            win_count_dqn += 1
        else:
            win_count_ppo += 1

    def _mean(values):
        # Avoid ZeroDivisionError when an agent never recorded a reward.
        return sum(values) / len(values) if values else 0.0

    # Guard the denominators so a zero-episode run returns zeros instead of raising.
    denominator = max(episodes_played, 1)
    avg_reward_dqn = _mean(list_reward_dqn)
    avg_reward_ppo = _mean(list_reward_ppo)
    winrate_dqn = win_count_dqn / denominator
    winrate_ppo = win_count_ppo / denominator
    winrate_draw = win_count_draw / denominator
    print(f" {num_episodes_eval} episodes:\nAvg Reward DQN: {avg_reward_dqn:.4f} // Winrate DQN: {winrate_dqn:.4f}\nAvg Reward PPO: {avg_reward_ppo:.4f} // Winrate PPO: {winrate_ppo:.4f}\nWinrate Draw: {winrate_draw:.4f}")
    return avg_reward_dqn, winrate_dqn, avg_reward_ppo, winrate_ppo, winrate_draw

In [14]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Head-to-head evaluation. The function prints its own summary, so the five
# returned metrics are discarded here.
_, _, _, _, _ = dqn_vs_ppo(
    model_dqn="True_False_model_DQN_episodes10000_buffer40000_prioritizedTrue_batch_size64_gamma0.95_target_update30_treshold_penalize0_boltzmann",
    model_ppo="cathedral_ppo_self_play_adversarial_final",
    board_size=8,
    device=device,
    num_episodes_eval=100,
)

Loading PPO CNN with 5, height 8, width 8
Loading PPO CNN with 5, height 8, width 8


  action_mask_tensor = torch.tensor(action_mask, dtype=torch.bool)
100%|██████████| 100/100 [00:42<00:00,  2.35it/s]

 100 episodes:
Avg Reward DQN: 0.0920 // Winrate DQN: 0.9500
Avg Reward PPO: -1.4098 // Winrate PPO: 0.0400
Winrate Draw: 0.0100





Temperature=1 for DQN:
Avg Reward DQN: -0.7753 // Winrate DQN: 0.7840
Avg Reward PPO: -1.4818 // Winrate PPO: 0.1720
Winrate Draw: 0.0440

Temperature=5 for DQN
100 episodes:
Avg Reward DQN: -1.2714 // Winrate DQN: 0.5100
Avg Reward PPO: -1.5022 // Winrate PPO: 0.3900
Winrate Draw: 0.1000

Temperature=0.3 for DQN
100 episodes:
Avg Reward DQN: -0.0661 // Winrate DQN: 0.9600
Avg Reward PPO: -1.4289 // Winrate PPO: 0.0400
Winrate Draw: 0.0000

In [None]:
def _grab_display_frame():
    """Return the current pygame display surface as a PIL image."""
    pixels = pygame.surfarray.array3d(pygame.display.get_surface())
    # Swap the first two axes so the image comes out in the correct orientation.
    return Image.fromarray(np.transpose(pixels, (1, 0, 2)))


def create_game_gif(episode_num, ppo_agent, dqn_model, gif_dir, board_size, device=None, temperature=0.1):
    """Play one DQN-vs-PPO game with Pygame rendering and save it as a GIF.

    The DQN model moves as ``player_0`` and the PPO agent as ``player_1``. One
    frame is captured for the initial board and one after every move.

    Args:
        episode_num: Number used in the output file name
            ``game_episode_<episode_num>.gif``.
        ppo_agent: Agent exposing ``select_action_evaluation(state, action_mask)``.
        dqn_model: DQN network handed to ``select_action_dqn``.
        gif_dir: Directory the GIF is written to; created if it does not exist.
        board_size: Board size forwarded to ``cathedral_v0.env``.
        device: Torch device passed to ``select_action_dqn``. When ``None`` the
            device of ``dqn_model``'s first parameter is used.
        temperature: Last positional argument forwarded to
            ``select_action_dqn`` (presumably the Boltzmann temperature --
            confirm against ``DQN.utils``).

    Returns:
        None. The path of the saved GIF is printed.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        device = next(dqn_model.parameters()).device

    player_0 = 'player_0'  # controlled by the DQN model

    # Initialize rendering environment for this game
    gif_env = cathedral_v0.env(board_size=board_size, render_mode="human",
                               per_move_rewards=True, final_reward_score_difference=False)
    try:
        gif_env.reset()

        # Initialize pygame if not already done
        if not pygame.get_init():
            pygame.init()

        # Clock used to pace the on-screen playback
        clock = pygame.time.Clock()

        # Capture initial board state
        gif_env.render()
        pygame.display.flip()
        frames = [_grab_display_frame()]

        # Game loop
        while gif_env.agents:
            current_agent = gif_env.agent_selection
            observation = gif_env.observe(current_agent)
            state = observation["observation"]
            action_mask = observation["action_mask"]
            legal_moves = [i for i, valid in enumerate(action_mask) if valid]

            # DQN moves for player_0, PPO for the other player
            if current_agent == player_0:
                _, action = select_action_dqn(dqn_model, state, action_mask, legal_moves, device, 'boltzmann', 0, temperature)
            else:
                with torch.no_grad():
                    action = ppo_agent.select_action_evaluation(state, action_mask)

            # Take step
            gif_env.step(action)

            # Render and capture the displayed frame
            gif_env.render()
            pygame.display.flip()
            frames.append(_grab_display_frame())

            # Cap the live loop at one move per second; the GIF's own frame
            # timing is set independently by `duration` when saving.
            clock.tick(1)

        # Save frames as a GIF
        os.makedirs(gif_dir, exist_ok=True)
        gif_path = os.path.join(gif_dir, f"game_episode_{episode_num}.gif")
        frames[0].save(
            gif_path,
            format='GIF',
            append_images=frames[1:],
            save_all=True,
            duration=500,  # Duration between frames in milliseconds
            loop=0  # Loop indefinitely
        )
    finally:
        # Close environment even if rendering or saving failed
        gif_env.close()

    print(f"Game animation saved to {gif_path}")

In [None]:
# Environment used only to read the action-space size and observation shape
# needed to build the two networks.
env = cathedral_v0.env(board_size=8, render_mode="text", per_move_rewards=True, final_reward_score_difference=False)
env.reset()

player_0 = 'player_0' # player_0 or player_1 (does not matter)
player_1 = 'player_1'
n_actions = env.action_space(player_1).n
obs_shape = env.observe(player_1)["observation"].shape

# PPO agent, built with the hyperparameters from the configuration cell.
ppo_agent = PPOCNN(
    obs_shape=obs_shape,
    action_dim=n_actions,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    gamma=gamma,
    K_epochs=K_epochs,
    eps_clip=eps_clip
)
# `device` is the global set in the evaluation cell above. map_location lets a
# checkpoint saved on a GPU be loaded on a CPU-only host.
checkpoint_ppo = torch.load("models/cathedral_ppo_self_play_adversarial_final.pth", map_location=device, weights_only=False)

if 'model_state_dict' in checkpoint_ppo:
    ppo_agent.policy.load_state_dict(checkpoint_ppo['model_state_dict'])
else:
    # Direct state dict (from .save() method)
    ppo_agent.policy.load_state_dict(checkpoint_ppo)

# DQN network, restored from its training checkpoint.
dqn_net = DQN(obs_shape, n_actions).to(device)
checkpoint_dqn = torch.load("models/True_False_model_DQN_episodes10000_buffer40000_prioritizedTrue_batch_size64_gamma0.95_target_update30_treshold_penalize0_boltzmann.pth", map_location=device, weights_only=False)
dqn_net.load_state_dict(checkpoint_dqn['model_state_dict'])
dqn_net.eval()  # inference only: disable any train-time layer behaviour

# Make sure the output directory exists before any GIF is written.
gif_output_dir = "PPO_vs_DQN_gif"
os.makedirs(gif_output_dir, exist_ok=True)

for episode in range(3):
    create_game_gif(episode_num=episode,
                    ppo_agent=ppo_agent,
                    dqn_model=dqn_net,
                    gif_dir=gif_output_dir,
                    board_size=8)

Loading PPO CNN with 5, height 8, width 8
Loading PPO CNN with 5, height 8, width 8


  action_mask_tensor = torch.tensor(action_mask, dtype=torch.bool)


Game animation saved to PPO_vs_DQN_gif/game_episode_0.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_1.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_2.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_3.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_4.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_5.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_6.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_7.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_8.gif
Game animation saved to PPO_vs_DQN_gif/game_episode_9.gif
