# Q-Learning

### Exercice 1: Exploitation de l'environement FrozenLake

In [5]:
import gymnasium as gym

env = gym.make("FrozenLake-v1", is_slippery=False, render_mode = "human")

print("Espace d'états :", env.observation_space.n)
print("Espace d'actions :", env.action_space.n) 

observation, info = env.reset()

episode = 0
episode_max = 100
while episode < episode_max:
    action = env.action_space.sample()
    observation, reward, done, _, _ = env.step(action)
    
    print("episode:", episode,"action: ", action, "observation: ", observation, "reward:", reward, "info:", info)
    if done:
        print("fin de l'episode")
        env.reset()
    episode  += 1
env.close() 

Espace d'états : 16
Espace d'actions : 4
episode: 0 action:  1 observation:  4 reward: 0.0 info: {'prob': 1}
episode: 1 action:  2 observation:  5 reward: 0.0 info: {'prob': 1}
fin de l'episode
episode: 2 action:  3 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 3 action:  3 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 4 action:  2 observation:  1 reward: 0.0 info: {'prob': 1}
episode: 5 action:  1 observation:  5 reward: 0.0 info: {'prob': 1}
fin de l'episode
episode: 6 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 7 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 8 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 9 action:  2 observation:  1 reward: 0.0 info: {'prob': 1}
episode: 10 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 11 action:  3 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 12 action:  1 observation:  4 reward: 0.0 info: {'prob': 1}
episode: 13 action:  2 observation:  5

### Exercice 2: Implementation de la Q-Table et Initialisation

In [10]:
# Q-table
import numpy as np
q_table = np.zeros((env.observation_space.n, env.action_space.n))

q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

###  Exercice 3: Implementation du Q-Learning avec Mise à jour

In [26]:
env = gym.make("FrozenLake-v1", is_slippery=False, render_mode = "human")
print("Espace d'états :", env.observation_space.n)
print("Espace d'actions :", env.action_space.n) 

observation, info = env.reset()


# hyperparametres
alpha = 0.01
gamma = 0.99
epsilon = 0.5 # exploiation seulement
num_episodes = 100
done = False
# apprentissage

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:

        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
    
        next_state, reward, done, _, _ = env.step(action)
        
        best_next_action = np.max(q_table[next_state])
        q_table[state, action] += alpha * (reward + gamma * best_next_action - q_table[state, action])
        print("episode:", episode,"action: ", action, "observation: ", state, "reward:", reward, "info:", info)
        state = next_state

env.close() 
print(q_table)

Espace d'états : 16
Espace d'actions : 4
episode: 0 action:  1 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  1 observation:  4 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  8 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  8 reward: 0.0 info: {'prob': 1}
episode: 0 action:  3 observation:  8 reward: 0.0 info: {'prob': 1}
episode: 0 action:  1 observation:  4 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  8 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  8 reward: 0.0 info: {'prob': 1}
episode: 0 action:  1 observation:  8 reward: 0.0 info: {'prob': 1}
episode: 1 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 1 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 1 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 1 action:  3 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 1 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode

### Exercice 4: Evaluation des performances de l'Agent

In [27]:
env = gym.make("FrozenLake-v1", is_slippery=False, render_mode = "human")
print("Espace d'états :", env.observation_space.n)
print("Espace d'actions :", env.action_space.n) 

# hyperparametres
alpha = 0.01
gamma = 0.99
epsilon = 1.0 # exploiation seulement
num_episodes = 100
done = False
success = 0
# apprentissage

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:

        if np.random.rand() > epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
    
        next_state, reward, done, _, _ = env.step(action)
        
        best_next_action = np.max(q_table[next_state])
        q_table[state, action] += alpha * (reward + gamma * best_next_action - q_table[state, action])
        print("episode:", episode,"action: ", action, "observation: ", state, "reward:", reward, "info:", info)
        state = next_state

        if done and reward == 1.0:
            success += 1

env.close() 
print(q_table)

success_taux = success / num_episodes
print(f"\nTaux de succès sur {test_episodes} épisodes : {success_taux * 100:.2f}%")

Espace d'états : 16
Espace d'actions : 4
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode: 0 action:  0 observation:  0 reward: 0.0 info: {'prob': 1}
episode

KeyboardInterrupt: 