# Apprentissage par renforcement

In [14]:
# %%capture
# !uv pip install -r requirements.txt

import torch
import gymnasium as gym
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo
import numpy as np
from copy import deepcopy
from collections import namedtuple, deque
import random

# Essayez les jeux

Vous pouvez essayer un jeu, vous avez le choix entre "Car", "Mountain" et "Lunar"

In [25]:
!python play.py --game="Lunar"

  from pkg_resources import resource_stream, resource_exists


# Deep Q-Learning

<img src="./images/deep-q-learning.jpg" width="70%" class="center"/>

[source](https://huggingface.co/learn/deep-rl-course/en/unit3/deep-q-algorithm)

In [48]:
def update_model(model, frozen_model, batch_size, memory, gamma):
    with torch.no_grad():
        batch = Transition(*zip(*memory.sample(batch_size)))
        all_states = torch.tensor(batch.state, device=model.device)
        all_next_states = batch.next_state
        rewards = torch.tensor(batch.rewards, device=model.device)
        
        mask = torch.tensor([next_state is not None for next_state in all_next_states], dtype=torch.int32)
        
        targets = rewards + gamma * mask * frozen_model(all_states).max(1)
        ys = model(all_states).gather(1, batch.action)
        loss = (targets - ys).pow(2).mean()

        # optimize and clip the gradient

SyntaxError: expected ':' (3186880370.py, line 8)

In [47]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory():
    def __init__(self, max_capacity):
        self.memory = []
        self.max_capacity = max_capacity

    def push(self, state, action, next_state, reward):
        transition = Transition(state, action, next_state, reward)
        self.memory.append(transition)
        if len(self.memory) > self.max_capacity:
            val = self.memory.pop(0)
            assert len(self.memory) == self.max_capacity
        
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [None]:
class DQN(torch.nn.Module):
    def __init__(self, n_obs, n_actions):
        super().__init__()
        self.n_obs = n_obs
        self.n_actions = n_actions
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(self.n_obs, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, self.n_actions)
        )

    def foward(self, input):
        return self.layers(input)

In [41]:
def train_model(env, num_eval_episodes):
    epsilon = 0.01
    obs, info = env.reset()
    size_observation_space = len(obs)
    n_actions = env.action_space.n

    n_step_update = 50
    max_capacity = 1500
    batch_size = 64
    
    # Initialize replay memory D to capacity N
    memory = ReplayMemory(max_capacity=max_capacity)
    # Initialize action-value function Q with random weights theta
    Q_model = DQN(size_observation_space, n_actions)
    
    # Initialize target action-value function \hat{Q} with weights theta^-1 = theta
    Q_hat_model = deepcopy(Q_model)
    
    for episode_num in range(num_eval_episodes):
        obs, info = env.reset()
        episode_reward = 0
        step_count = 0
        epsilon_greedy = torch.distributions.bernoulli.Bernoulli(torch.tensor([epsilon]))
    
        for n_step in range(env._max_episode_steps):
            # Add epsilon decay
            if bool(m.sample().item()):
                action = env.action_space.sample()  # Random policy for demonstration
            else:
                action = np.argmax(Q_model(obs))
    
            new_obs, reward, terminated, truncated, info = env.step(action)

            episode_terminated = terminated or truncated
            episode_reward += reward
            step_count += 1
            
            if episode_terminated:
                memory.push(obs, action, None, reward)
            else:
                memory.push(obs, action, new_obs, reward)
            obs = new_obs

            if len(memory) >= batch_size:
                ...

            if n_step % n_step_update == 0:
                Q_hat_model = deepcopy(Q_model)
                
            if episode_terminated:
                break
    
        print(f"Episode {episode_num + 1}: {step_count} steps, reward = {episode_reward}")

In [42]:
# Configuration
num_eval_episodes = 4
env_name = "MountainCar-v0"

# Create environment with recording capabilities
env = gym.make(env_name, render_mode="rgb_array")  # rgb_array needed for video recording

# Add video recording for every episode
env = RecordVideo(
    env,
    video_folder=env_name + "_agent",    # Folder to save videos
    name_prefix="eval",               # Prefix for video filenames
    episode_trigger=lambda x: x % 50 == 0    # Record every episode
)

# Add episode statistics tracking
env = RecordEpisodeStatistics(env, buffer_length=num_eval_episodes)

print(f"Starting evaluation for {num_eval_episodes} episodes...")


train_model(env, num_eval_episodes)

env.close()

# Print summary statistics
print(f'\nEvaluation Summary:')
print(f'Episode durations: {list(env.time_queue)}')
print(f'Episode rewards: {list(env.return_queue)}')
print(f'Episode lengths: {list(env.length_queue)}')

# Calculate some useful metrics
avg_reward = np.sum(env.return_queue)
avg_length = np.sum(env.length_queue)
std_reward = np.std(env.return_queue)

print(f'\nAverage reward: {avg_reward:.2f} ± {std_reward:.2f}')
print(f'Average episode length: {avg_length:.1f} steps')
print(f'Success rate: {sum(1 for r in env.return_queue if r > 0) / len(env.return_queue):.1%}')

  logger.warn(


Starting evaluation for 4 episodes...
Episode 1: 200 steps, reward = -200.0
Episode 2: 200 steps, reward = -200.0
Episode 3: 200 steps, reward = -200.0
Episode 4: 200 steps, reward = -200.0

Evaluation Summary:
Episode durations: [0.174245, 0.001743, 0.001544, 0.001404]
Episode rewards: [-200.0, -200.0, -200.0, -200.0]
Episode lengths: [200, 200, 200, 200]

Average reward: -800.00 ± 0.00
Average episode length: 800.0 steps
Success rate: 0.0%


# PPO

In [47]:
batch = env.step(0)

In [56]:
env.action_space.n

3

In [5]:
# Configuration
num_eval_episodes = 4
env_name = "LunarLander-v3"  # Replace with your environmentLunar Lander

# Create environment with recording capabilities
env = gym.make(env_name, render_mode="rgb_array")  # rgb_array needed for video recording

# Add video recording for every episode
env = RecordVideo(
    env,
    video_folder=env_name + "_agent",    # Folder to save videos
    name_prefix="eval",               # Prefix for video filenames
    episode_trigger=lambda x: x % 50 == 0    # Record every episode
)

# Add episode statistics tracking
env = RecordEpisodeStatistics(env, buffer_length=num_eval_episodes)

print(f"Starting evaluation for {num_eval_episodes} episodes...")

for episode_num in range(num_eval_episodes):
    obs, info = env.reset()
    episode_reward = 0
    step_count = 0

    episode_over = False
    while not episode_over:
        # Replace this with your trained agent's policy
        action = env.action_space.sample()  # Random policy for demonstration

        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        step_count += 1

        episode_over = terminated or truncated

    print(f"Episode {episode_num + 1}: {step_count} steps, reward = {episode_reward}")

env.close()

# Print summary statistics
print(f'\nEvaluation Summary:')
print(f'Episode durations: {list(env.time_queue)}')
print(f'Episode rewards: {list(env.return_queue)}')
print(f'Episode lengths: {list(env.length_queue)}')

# Calculate some useful metrics
avg_reward = np.sum(env.return_queue)
avg_length = np.sum(env.length_queue)
std_reward = np.std(env.return_queue)

print(f'\nAverage reward: {avg_reward:.2f} ± {std_reward:.2f}')
print(f'Average episode length: {avg_length:.1f} steps')
print(f'Success rate: {sum(1 for r in env.return_queue if r > 0) / len(env.return_queue):.1%}')

Starting evaluation for 4 episodes...
Episode 1: 58 steps, reward = -113.3143971493336
Episode 2: 96 steps, reward = -302.28691844428306
Episode 3: 115 steps, reward = -106.40357398727691
Episode 4: 70 steps, reward = -86.86919655459616

Evaluation Summary:
Episode durations: [0.081562, 0.005922, 0.007073, 0.003183]
Episode rewards: [-113.3143971493336, -302.28691844428306, -106.40357398727691, -86.86919655459616]
Episode lengths: [58, 96, 115, 70]

Average reward: -608.87 ± 87.18
Average episode length: 339.0 steps
Success rate: 0.0%
