### **TP 7 Julie Chapdelaine et Cleo Daguin**
Le but de ce TP est de comprendre le reinforcement learning

In [0]:
!pip install gym



In [0]:
import gym
from time import sleep
import numpy as np
import random
from IPython.display import clear_output

#### **I) Basic reinforcement learning example, openAI Gym environment**
Utilisation du problème de Taxi-v3. Source : https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

**Sans reinforcement learning**

In [0]:
# Create the Taxi-v3 environment and keep the object exposed by the `.env`
# attribute of what gym.make returns.
env = gym.make("Taxi-v3").env

# Overwrite the environment's `s` attribute with 328.
# NOTE(review): presumably `s` is the encoded current state, so this pins the
# configuration the random-policy demo starts from — confirm.
env.s = 328

In [0]:
# Start the step counter, the penalty counter and the last reward at zero.
epochs = penalties = reward = 0

In [0]:
# Run random actions until the environment reports the episode as done,
# recording every step in a list so it can be replayed afterwards.
frames = []
done = False

while not done:
    # Let the action space pick the action; no learning is involved here.
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    penalties += 1 if reward == -10 else 0

    # One record per step: the text rendering plus the transition data.
    frames.append({'frame': env.render(mode='ansi'),
                   'state': state,
                   'action': action,
                   'reward': reward})

    epochs += 1

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))


Timesteps taken: 378
Penalties incurred: 109


In [0]:
# Replay the recorded episode, one frame at a time, in the cell output.
def print_frames(frames):
    """Animate a recorded episode.

    The helper is deliberately NOT named `frames`: defining a function with
    that name rebinds the global list of recorded steps, so the call would
    receive the function itself and `enumerate` would raise a TypeError.

    Args:
        frames: iterable of dicts with the keys 'frame', 'state', 'action'
            and 'reward', one dict per recorded step.
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        rendered = frame['frame']
        # The recorded rendering is read through .getvalue(); fall back to
        # printing it directly when the object has no such method
        # (NOTE(review): some gym versions reportedly return a plain str
        # from render(mode='ansi') — confirm for the installed version).
        text = rendered.getvalue() if hasattr(rendered, 'getvalue') else rendered
        print(text)
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        # Short pause so each frame is visible before being cleared.
        sleep(.1)


print_frames(frames)

Avec cette méthode, les actions sont choisies au hasard et on peut voir que pour atteindre le but il y a énormément d'actions faites mais surtout beaucoup trop de pénalités.

**Avec q-learning**

Cette méthode va permettre de calculer pour chaque état et pour chaque action possible, son indice de favorabilité à l'atteinte de l'objectif.

In [0]:
# Fresh Taxi-v3 environment for the Q-learning experiment.
env = gym.make("Taxi-v3").env

# Q-table filled with zeros: one row per state, one column per action.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Hyperparameters used by the training loop:
#   alpha   - weight given to the new estimate in the Q-value update
#   gamma   - factor applied to the best Q-value of the next state
#   epsilon - threshold below which a random action is taken during training
alpha, gamma, epsilon = 0.1, 0.6, 0.1

# Empty bookkeeping lists.
all_epochs, all_penalties = [], []

In [0]:
# Training: fill in the Q-table with epsilon-greedy Q-learning.
for i in range(1, 100001):
    state = env.reset()

    # Per-episode counters (tracked here but not reported by this cell).
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            # Explore: take a random action from the action space.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest learned value.
            action = np.argmax(q_table[state])

        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Blend the previous estimate with the new target
        # (reward plus the discounted best value of the next state).
        new_value = (1 - alpha) * old_value + alpha * \
            (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    if i % 100 == 0:
        clear_output(wait=True)
        # f-string so the episode number is interpolated; without the `f`
        # prefix the literal text "{i}" was printed.
        print(f"Episode: {i}")

print("Training finished.")


Episode: {i}
Training finished.


In [0]:
# Display the learned Q-values (one per action) for state 328, the same
# index that was assigned to env.s earlier in the notebook.
q_table[328]

array([ -2.40474003,  -2.27325184,  -2.39254947,  -2.34728348,
       -11.00109089, -10.5969929 ])

**Q-learning : choix de la prochaine action avec la meilleure Q-value**

In [0]:
# Evaluate the trained agent: always take the action with the highest
# Q-value and average the step / penalty counts over all episodes.
episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    done = False

    while not done:
        # Pure exploitation of the learned table.
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        penalties += 1 if reward == -10 else 0
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 13.14
Average penalties per episode: 0.0


**Q-learning choix random du next action**

In [0]:
# Baseline evaluation: ignore the Q-table and take a random action at every
# step, then average the step / penalty counts over all episodes.
total_epochs, total_penalties = 0, 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done:
        # Sample from the environment's own action space instead of
        # hard-coding the number of actions (previously a literal 6), as the
        # other random-action cells of this notebook already do.
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 2445.38
Average penalties per episode: 798.64


On peut voir que choisir la meilleure action grâce à la table de Q-values est bien plus rentable que de choisir l'action au hasard. Le seul problème de cette technique est qu'on peut atteindre un extremum local et rester bloqué en un point sans atteindre l'objectif. Pour contrer ce problème, plusieurs options existent, notamment introduire, lors du choix de la prochaine action, un choix aléatoire dans un faible pourcentage des cas au lieu du choix le plus logique.
On peut quand même observer que le choix de la meilleure Q-value est la stratégie la plus rapide et la plus efficace dans la plupart des cas.

**Q-learning avec choix de l'action ayant la meilleure Q-value dans la plupart des cas**

In [0]:
# Evaluation with a mostly-greedy policy: follow the best Q-value with
# probability `probaQValue`, otherwise take a random action.
total_epochs, total_penalties = 0, 0
episodes = 100
probaQValue = 0.95

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done:
        if random.random() <= probaQValue:
            # Exploit the learned table.
            action = np.argmax(q_table[state])
        else:
            # Random action drawn from the environment's own action space
            # rather than from a hard-coded count of 6.
            action = env.action_space.sample()

        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 14.15
Average penalties per episode: 0.39


#### **II)Frozen lake problem**

Q-learning pour un problème moins complexe, FrozenLake

In [0]:
# FrozenLake-v0 environment for the second Q-learning experiment.
env = gym.make("FrozenLake-v0").env

# Q-table filled with zeros: one row per state, one column per action.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Hyperparameters used by the training loop:
#   alpha   - weight given to the new estimate in the Q-value update
#   gamma   - factor applied to the best Q-value of the next state
#   epsilon - threshold below which a random action is taken during training
alpha, gamma, epsilon = 0.1, 0.6, 0.1

# Empty bookkeeping lists.
all_epochs, all_penalties = [], []

In [0]:
# Training: fill in the Q-table with epsilon-greedy Q-learning.
for i in range(1, 100001):
    state = env.reset()

    # Per-episode counters (tracked here but not reported by this cell).
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            # Explore: take a random action from the action space.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest learned value.
            action = np.argmax(q_table[state])

        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Blend the previous estimate with the new target
        # (reward plus the discounted best value of the next state).
        new_value = (1 - alpha) * old_value + alpha * \
            (reward + gamma * next_max)
        q_table[state, action] = new_value

        # NOTE(review): this -10 check mirrors the Taxi cells; FrozenLake
        # presumably never emits that reward, so the counter likely stays at
        # 0 — confirm against the environment's reward scheme.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    if i % 100 == 0:
        clear_output(wait=True)
        # f-string so the episode number is interpolated; without the `f`
        # prefix the literal text "{i}" was printed.
        print(f"Episode: {i}")

print("Training finished.")

Episode: {i}
Training finished.


In [0]:
# Display the learned Q-values (one per action) for state 4.
q_table[4]

array([0.00230967, 0.0011711 , 0.00113283, 0.00094806])

In [0]:
# Evaluate the trained FrozenLake agent: always take the action with the
# highest Q-value and average the step / penalty counts over all episodes.
episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    done = False

    while not done:
        # Pure exploitation of the learned table.
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        penalties += 1 if reward == -10 else 0
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 10.89
Average penalties per episode: 0.0
