In [236]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
# Shared environment used by every cell below: FrozenLake-v1 created with
# is_slippery=False (per the Gym FrozenLake docs this makes moves deterministic).
env = gym.make('FrozenLake-v1', is_slippery=False)

In [237]:
def epsilon_greedy_policy(state, epsilon, i, q_table=None):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a random action is drawn from
    ``env.action_space``. Otherwise the action with the highest value in the
    Q-table row for ``state`` is returned, after adding Gaussian noise scaled
    by ``epsilon`` to the row (which breaks ties between equal values).

    Parameters
    ----------
    state : int
        Index of the current state (row of the Q-table).
    epsilon : float
        Exploration probability; also the scale of the noise added to the
        action values in the greedy branch.
    i : int
        Episode index. Not used by the body; kept so existing callers that
        pass it positionally keep working.
    q_table : numpy.ndarray, optional
        Q-table to act on, indexed as ``q_table[state, action]``. When
        omitted, the module-level ``Q`` is used, which preserves the behaviour
        of the original three-argument call (and raises ``NameError`` if no
        such global exists).

    Returns
    -------
    int
        The selected action index.
    """
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    # Resolve the table only in the greedy branch, so exploration never
    # touches the (possibly undefined) notebook-level ``Q``.
    table = Q if q_table is None else q_table
    action_values = table[state, :]
    # Size the noise from the row itself so it always matches the table used.
    noise = np.random.randn(1, action_values.shape[0]) * epsilon
    return np.argmax(action_values + noise)

In [293]:
def _epsilon_greedy_action(q_table, state, epsilon):
    """Pick an epsilon-greedy action for ``state`` from the given ``q_table``."""
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    # Noise scaled by epsilon breaks ties between equally valued actions.
    noise = np.random.randn(1, q_table.shape[1]) * epsilon
    return np.argmax(q_table[state, :] + noise)


def Q_learning_algorithm(alpha, gamma, epsilon, number_episodes):
    """Train a tabular Q-function on the module-level ``env`` with Q-learning.

    Parameters
    ----------
    alpha : float
        Learning rate applied to the temporal-difference error.
    gamma : float
        Discount factor applied to the best value of the next state.
    epsilon : float
        Exploration probability (and tie-breaking noise scale) of the
        epsilon-greedy behaviour policy.
    number_episodes : int
        Number of training episodes to run.

    Returns
    -------
    tuple
        ``(Q, total_rewards)``: the learned table of shape
        ``(env.observation_space.n, env.action_space.n)`` and a list with the
        summed reward of each episode.
    """
    total_rewards = []
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    print("Q-Table before training\n")
    print(Q, "\n")
    for _ in range(number_episodes):
        # Reset the environment and fetch the initial state.
        state, _info = env.reset()
        sum_reward = 0
        while True:
            # Act on the table being trained, not on a notebook-level ``Q``
            # that may be stale or missing on a fresh kernel.
            action = _epsilon_greedy_action(Q, state, epsilon)
            # Get the next state and reward from the environment.
            state_next, reward, terminated, truncated, _info = env.step(action)
            # Q-learning update towards reward + discounted best next value.
            td_target = reward + gamma * np.max(Q[state_next, :])
            Q[state, action] += alpha * (td_target - Q[state, action])

            sum_reward += reward
            state = state_next
            # Stop on a terminal state or on truncation (e.g. a step limit);
            # ignoring truncation lets an episode run past that limit.
            if terminated or truncated:
                break

        total_rewards.append(sum_reward)
    return Q, total_rewards

In [294]:
# Training: run Q-learning with the hyperparameters below, then show the
# resulting table. `Q_table` is reused by the evaluation cell.
Q_table, final_reward = Q_learning_algorithm(
    alpha=0.1,
    gamma=0.9999,
    epsilon=0.015,
    number_episodes=1000,
)
print("Q-Table after training \n")
print(Q_table)

Q-Table before training

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]] 

Q-Table after training 

[[0.45080908 0.45399731 0.43191475 0.45033646]
 [0.44107408 0.         0.37622538 0.41178619]
 [0.35434512 0.39160004 0.2990488  0.37291397]
 [0.31982313 0.         0.23198409 0.25089438]
 [0.44561586 0.45651943 0.         0.442965  ]
 [0.         0.         0.         0.        ]
 [0.         0.41476251 0.         0.29336956]
 [0.         0.         0.         0.        ]
 [0.40777073 0.         0.46994836 0.4196112 ]
 [0.3306548  0.52932674 0.43423655 0.        ]
 [0.25925306 0.62061645 0.         0.21159674]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.41274568 0.62612347 0.32647724]
 [0.30134786 0.6138046  0.81469798 0.377462  ]
 [

In [295]:
episodes = 100
nb_success = 0

# Test: roll out the greedy policy derived from Q_table and count rewards.
for _ in range(episodes):
    state, _info = env.reset()
    done = False

    while not done:
        # Choose the action with the highest value in the current state.
        action = np.argmax(Q_table[state])

        # Execute that action and move the agent in the chosen direction.
        new_state, reward, terminated, truncated, _info = env.step(action)

        # End the episode on a terminal state or on truncation; a purely
        # greedy policy that loops would otherwise never leave this loop.
        done = terminated or truncated

        # Update the current state.
        state = new_state

        # The reward is accumulated as the success count for this run.
        nb_success += reward

# Success rate
print (f"Success rate = {nb_success/episodes*100}%")

Success rate = 100.0%
