In [112]:
import gymnasium as gym
import ale_py

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import cv2


In [113]:
gym.register_envs(ale_py)

In [114]:
BATCH_SIZE = 32
GAMMA = 0.99
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 1000000
TARGET_UPDATE = 1000
MEMORY_SIZE = 10000
LEARNING_RATE = 1e-4

In [115]:
def preprocess_observation(obs):
    obs = obs[35:195]  # Crop
    obs = cv2.resize(obs, (84, 84))  # Resize
    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)  # Convert to grayscale
    _, obs = cv2.threshold(obs, 1, 255, cv2.THRESH_BINARY)  # Binary
    return obs / 255.0  # Normalize


In [116]:
# Define the DQN model
class DQN(nn.Module):
    def __init__(self, action_space):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc = nn.Linear(7 * 7 * 64, 512)
        self.out = nn.Linear(512, action_space)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc(x))
        return self.out(x)

In [117]:
memory = deque(maxlen=MEMORY_SIZE)

In [118]:
def select_action(state, epsilon, action_space):
    if random.random() < epsilon:
        return random.randrange(action_space)
    else:
        if isinstance(state, np.ndarray):
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            return torch.argmax(policy_net(state)).item()

In [119]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return

    transitions = random.sample(memory, BATCH_SIZE)
    batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

    # Conversion en tenseurs PyTorch
    batch_state = torch.cat([torch.tensor(s, dtype=torch.float32).unsqueeze(0) for s in batch_state])
    batch_action = torch.tensor(batch_action)
    batch_reward = torch.tensor(batch_reward)
    batch_next_state = torch.cat([torch.tensor(s, dtype=torch.float32).unsqueeze(0) for s in batch_next_state])
    batch_done = torch.tensor(batch_done, dtype=torch.bool)

    current_q_values = policy_net(batch_state).gather(1, batch_action.unsqueeze(1))
    next_q_values = target_net(batch_next_state).max(1)[0].detach()
    expected_q_values = batch_reward + (GAMMA * next_q_values) * (~batch_done)

    loss = nn.MSELoss()(current_q_values, expected_q_values.unsqueeze(1))
    # Ajout de prints pour surveiller l'évolution
    print(f"Loss: {loss.item():.4f} | Batch Reward Mean: {batch_reward.mean().item():.2f} | "f"Q-Value Mean: {current_q_values.mean().item():.4f} | Next Q-Value Mean: {next_q_values.mean().item():.4f}")
    
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



In [None]:
env = gym.make("PongNoFrameskip-v4", difficulty=1)
policy_net = DQN(env.action_space.n)
target_net = DQN(env.action_space.n)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

In [None]:
num_episodes = 5000
epsilon = EPSILON_START

for episode in range(num_episodes):
    obs, _ = env.reset()
    state = preprocess_observation(obs)
    state = np.stack([state] * 4, axis=0)
    state = torch.tensor(state, dtype=torch.float32)

    total_reward = 0
    done = False
    steps = 0

    while not done:
        action = select_action(state, epsilon, env.action_space.n)
        next_obs, reward, done, truncated, _ = env.step(action)
        total_reward += reward
        next_state = preprocess_observation(next_obs)
        next_state = np.concatenate((state[1:, :, :], np.expand_dims(next_state, 0)), axis=0)
        memory.append((state, action, reward, next_state, done))
        state = next_state

        optimize_model()

        if steps % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

        steps += 1
        epsilon = max(EPSILON_END, EPSILON_START - steps / EPSILON_DECAY)

    print(f"Episode {episode}, Total Reward: {total_reward}")

  batch_state = torch.cat([torch.tensor(s, dtype=torch.float32).unsqueeze(0) for s in batch_state])


Loss: 0.0077 | Batch Reward Mean: 0.00 | Q-Value Mean: -0.0080 | Next Q-Value Mean: 0.0743
Loss: 0.0033 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0214 | Next Q-Value Mean: 0.0743
Loss: 0.0010 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0489 | Next Q-Value Mean: 0.0743
Loss: 0.0004 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0747 | Next Q-Value Mean: 0.0743
Loss: 0.0005 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0888 | Next Q-Value Mean: 0.0743
Loss: 0.0006 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0963 | Next Q-Value Mean: 0.0743
Loss: 0.0005 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0954 | Next Q-Value Mean: 0.0743
Loss: 0.0003 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0877 | Next Q-Value Mean: 0.0743
Loss: 0.0002 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0785 | Next Q-Value Mean: 0.0743
Loss: 0.0001 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0708 | Next Q-Value Mean: 0.0743
Loss: 0.0002 | Batch Reward Mean: 0.00 | Q-Value Mean: 0.0656 | Next Q-Value Mean: 0.0743
Loss: 0.0

KeyboardInterrupt: 

In [None]:
import time

# Réinitialiser epsilon pour exploitation totale
epsilon = 0.05  # Fixé bas pour maximiser l'exploitation des actions apprises

# Fonction pour voir l'agent jouer un épisode
def play_episode(env, policy_net):
    obs, _ = env.reset()
    state = preprocess_observation(obs)
    state = np.stack([state] * 4, axis=0)
    state = torch.tensor(state, dtype=torch.float32)

    done = False
    total_reward = 0

    while not done:
        # Afficher l'environnement
        env.render()
        time.sleep(0.02)  # Pour ralentir l'affichage et mieux voir le jeu

        # Sélectionner l'action
        action = select_action(state, epsilon, env.action_space.n)

        # Exécuter l'action
        next_obs, reward, done, truncated, _ = env.step(action)
        total_reward += reward

        # Mettre à jour l'état
        next_state = preprocess_observation(next_obs)
        next_state = np.append(state[1:, :, :], np.expand_dims(next_state, 0), axis=0)
        state = torch.tensor(next_state, dtype=torch.float32)

    env.close()
    print(f"Score obtenu : {total_reward}")

# Lancer l'agent pour jouer un épisode
play_episode(env, policy_net)
