In [None]:
import gym
import numpy as np
import random

# FrozenLake-v0

In [None]:
env = gym.make('FrozenLake-v0')

In [None]:
# Initialize the Q-value table with zeros: one row per state, one column per action
q_table = np.zeros((env.observation_space.n, env.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:
# Hyperparameters
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
learning_rate = 0.1  # step size of each Q-table update
max_epsilon = 1.0  # exploration probability used for episode 0
min_epsilon = 0.01  # floor that the exploration probability decays towards
epsilon_decay_rate = 0.005  # exponential decay rate of epsilon, applied per episode

num_episodes = 20000  # number of training episodes
num_steps_per_episode = 100  # cap on the number of steps taken within one training episode

## Thuật toán Q-Learning

In [None]:
def q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular Q-learning agent with an exponentially decaying epsilon-greedy policy.

    The environment is used through the classic gym interface: ``env.reset()``
    returns the initial state and ``env.step(action)`` returns a
    ``(next_state, reward, done, info)`` 4-tuple. Both the observation and the
    action space must expose ``.n``.

    Args:
        env: environment with discrete observation and action spaces.
        num_episodes: number of training episodes; ``0`` returns an all-zero table.
        num_steps_per_episode: maximum number of steps taken in one episode.
        learning_rate: step size of the Q-table update.
        gamma: discount factor for the bootstrapped next-state value.
        max_epsilon: exploration probability at episode 0.
        min_epsilon: value the exploration probability decays towards.
        epsilon_decay_rate: exponential decay rate of epsilon per episode.

    Returns:
        ``(q_table, rewards_all)`` where ``q_table`` has shape
        ``(n_states, n_actions)`` and ``rewards_all`` holds the undiscounted
        return of every episode.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []
    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)

        for _ in range(num_steps_per_episode):
            # Behaviour policy: explore with probability epsilon, otherwise act greedily.
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, _info = env.step(action)

            # Off-policy target: bootstrap from the best action in the next state.
            td_target = reward + gamma * np.max(q_table[next_state, :])
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * td_target

            reward_episode += reward
            state = next_state

            if done:
                break
        rewards_all.append(reward_episode)

    # Report the index of the last episode; skipped when nothing was trained
    # (the loop variable would not exist in that case).
    if num_episodes > 0:
        print(f'Episode {num_episodes - 1} finished')
    return q_table, rewards_all

In [None]:
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [None]:
q_table

array([[5.36331837e-01, 4.81409614e-01, 4.64682415e-01, 4.64442318e-01],
       [2.98728299e-01, 2.41669230e-01, 2.00757206e-01, 4.47369971e-01],
       [3.66546947e-01, 2.45479442e-01, 2.54281218e-01, 2.53388556e-01],
       [1.15347764e-09, 2.16081710e-01, 5.24862455e-09, 7.08909471e-09],
       [5.46987040e-01, 3.05162902e-01, 3.38287645e-01, 3.79289482e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.32502817e-01, 4.77710670e-02, 2.30210196e-01, 1.36851580e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.79439258e-01, 4.09676492e-01, 3.29712404e-01, 5.73842284e-01],
       [3.99523590e-01, 5.74871826e-01, 4.29975476e-01, 4.46647633e-01],
       [5.24731471e-01, 3.71512284e-01, 2.56017956e-01, 2.78572840e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.08894849e-01, 4.63898405e-01, 7.65806530e

## Cài đặt thuật toán SARSA

In [None]:
def epsilon_greedy(Q, epsilon, n_actions, s):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Args:
        Q: Q-table indexed as ``Q[state, action]``.
        epsilon: probability threshold for taking a random action.
        n_actions: number of actions to draw from when exploring.
        s: current state (row index into ``Q``).

    Returns:
        A uniformly random action index in ``[0, n_actions)`` when the drawn
        number is ``<= epsilon``, otherwise the index of the largest value in
        ``Q[s, :]``.
    """
    explore = np.random.random() <= epsilon
    if not explore:
        # Exploit: take the action with the highest estimated value.
        return np.argmax(Q[s, :])
    # Explore: uniform random action.
    return np.random.randint(n_actions)
    

# Sizes of the state and action spaces of the environment currently bound to `env`.
# They are captured once when this cell runs and are not refreshed when `env` is
# rebound to another environment, so re-run this cell after switching environments.
n_states = env.observation_space.n
n_actions = env.action_space.n

def SARSA(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular SARSA (on-policy TD control) agent.

    The action used in the update target is the same action that is executed
    on the following step, which is what makes the update on-policy. Actions
    are chosen by ``epsilon_greedy`` with an exponentially decaying epsilon.

    The environment is used through the classic gym interface: ``env.reset()``
    returns the initial state and ``env.step(action)`` returns a
    ``(next_state, reward, done, info)`` 4-tuple.

    Args:
        env: environment with discrete observation and action spaces.
        num_episodes: number of training episodes; ``0`` returns an all-zero table.
        num_steps_per_episode: maximum number of steps taken in one episode.
        learning_rate: step size of the Q-table update.
        gamma: discount factor for the bootstrapped next state-action value.
        max_epsilon: exploration probability at episode 0.
        min_epsilon: value the exploration probability decays towards.
        epsilon_decay_rate: exponential decay rate of epsilon per episode.

    Returns:
        ``(q_table, rewards_all)`` where ``q_table`` has shape
        ``(n_states, n_actions)`` and ``rewards_all`` holds the undiscounted
        return of every episode.
    """
    # Read the action count from the env passed in, not from a module-level
    # value that may have been computed for a different environment.
    n_actions = env.action_space.n
    q_table = np.zeros((env.observation_space.n, n_actions))
    rewards_all = []
    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)

        # First action of the episode; later actions are carried over from the update.
        action = epsilon_greedy(q_table, epsilon, n_actions, state)

        for _ in range(num_steps_per_episode):
            next_state, reward, done, _info = env.step(action)
            next_action = epsilon_greedy(q_table, epsilon, n_actions, next_state)

            # On-policy target: bootstrap from the action that will actually be taken next.
            td_target = reward + gamma * q_table[next_state, next_action]
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * td_target

            reward_episode += reward
            state, action = next_state, next_action

            if done:
                break
        rewards_all.append(reward_episode)

    # Report the index of the last episode; skipped when nothing was trained
    # (the loop variable would not exist in that case).
    if num_episodes > 0:
        print(f'Episode {num_episodes - 1} finished')
    return q_table, rewards_all

In [None]:
q_table_2, rewards_all_2 = SARSA(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [None]:
q_table_2

array([[5.25675544e-01, 4.11083779e-01, 4.29636609e-01, 4.32238514e-01],
       [2.70815687e-01, 2.90799418e-01, 1.62256536e-01, 3.86886259e-01],
       [3.38517543e-01, 2.12720254e-01, 2.51491508e-01, 2.71528497e-01],
       [1.29245139e-01, 4.17136214e-05, 6.25209101e-05, 4.15791938e-03],
       [5.41682025e-01, 3.35349962e-01, 3.28077023e-01, 2.50103488e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.32640014e-01, 9.68976397e-02, 3.42854457e-01, 1.44902581e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.24212271e-01, 2.75126872e-01, 2.23356826e-01, 5.82032320e-01],
       [4.26492011e-01, 6.22671263e-01, 4.59677448e-01, 3.94299787e-01],
       [6.10422173e-01, 3.85323540e-01, 3.80762138e-01, 2.80780980e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.32412656e-01, 4.98403798e-01, 7.24092708e

In [None]:
sum(rewards_all)

13217.0

In [None]:
sum(rewards_all[0:1000])

300.0

In [None]:
sum(rewards_all[1000:2000])

668.0

In [None]:
sum(rewards_all[2000:3000])

696.0

In [None]:
sum(rewards_all[9000:10000])

678.0

In [None]:
sum(rewards_all_2[9000:10000])

664.0

In [None]:
def play(env, q_table, render=False):
    """Run one greedy episode with the given Q-table.

    The loop runs until ``env.step`` reports ``done``; there is no step cap
    here, so termination relies on the environment.

    Args:
        env: environment using the classic gym interface (``reset()`` returns
            the state, ``step()`` returns ``(next_state, reward, done, info)``).
        q_table: array indexed as ``q_table[state, action]``.
        render: when true, render every step, pause 0.2 s between frames and
            clear the previous frame from the notebook output.

    Returns:
        ``(total_reward, steps)`` for the episode.
    """
    if render:
        # Imported here because only the rendering path needs them and the
        # notebook's import cell does not provide them.
        import time
        try:
            from IPython import display
        except ImportError:
            # Outside IPython there is no cell output to clear.
            display = None

    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    while not done:
        action = np.argmax(q_table[state, :])
        state, reward, done, _info = env.step(action)
        total_reward += reward
        steps += 1
        if render:
            env.render()
            time.sleep(0.2)
            # Keep the final frame visible; only clear between intermediate frames.
            if not done and display is not None:
                display.clear_output(wait=True)

    return (total_reward, steps)

In [None]:
def play_multiple_times(env, q_table, max_episodes):
    """Play ``max_episodes`` greedy episodes and print summary statistics.

    An episode counts as a success when its total reward is strictly positive.
    The average number of steps is taken over successful episodes only and is
    printed as ``nan`` when there were none.

    Args:
        env: environment passed through to ``play``.
        q_table: Q-table passed through to ``play``.
        max_episodes: number of episodes to play.
    """
    success = 0
    list_of_steps = []
    for _ in range(max_episodes):
        total_reward, steps = play(env, q_table)

        if total_reward > 0:
            success += 1
            list_of_steps.append(steps)

    # np.mean([]) emits a RuntimeWarning; produce the same nan without the warning.
    average_steps = np.mean(list_of_steps) if list_of_steps else float('nan')

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'Average number of steps: {average_steps}')

In [None]:
play_multiple_times(env, q_table, 1000)

Number of successes: 718/1000
Average number of steps: 36.82451253481894


In [None]:
play_multiple_times(env, q_table_2, 1000)

Number of successes: 721/1000
Average number of steps: 37.0748959778086


Nhận xét trên môi trường FrozenLake-v0:
  + Number of successes của SARSA tốt hơn 1 chút so với Q-Learning (721 > 718)
  + Average number of steps của 2 thuật toán cũng gần như nhau 36.82 và 37.07

# FrozenLake8x8-v0

In [None]:
env = gym.make('FrozenLake8x8-v0')

In [None]:
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [None]:
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


In [None]:
sum(rewards_all[9000:10000])

0.0

In [None]:
q_table_2, rewards_all_2 = SARSA(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [None]:
q_table_2

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.26554826, -2.16866161, -2.17858438, -3.0402459 ,  8.2570751 ,
        -4.91916341],
       [ 2.6033253 ,  0.77275178,  1.39888425,  0.81978018, 14.01219276,
        -5.26296194],
       ...,
       [-1.39524658, -0.8484833 , -1.4015808 , -1.40763279, -5.46149344,
        -4.843336  ],
       [-3.49399978, -3.46024302, -3.40498748, -0.74361837, -7.24099505,
        -7.40639733],
       [-0.2694952 , -0.1999    , -0.1999    , 10.93590646, -1.        ,
        -1.91881   ]])

In [None]:
sum(rewards_all_2[9000:10000])

7434.0

In [None]:
play_multiple_times(env, q_table, 1000)

Number of successes: 1000/1000
Average number of steps: 13.028


In [None]:
play_multiple_times(env, q_table_2, 1000)

Number of successes: 1000/1000
Average number of steps: 13.068


Nhận xét trên môi trường FrozenLake8x8-v0:
+ Number of successes của SARSA và Q-Learning bằng nhau và đạt tối đa (1000/1000).
+ Average number of steps của 2 thuật toán cũng gần như nhau: 13.028 và 13.068

# Taxi-v3

In [None]:
env = gym.make('Taxi-v3')

In [None]:
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [None]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.49520654,  2.00056796, -2.42844336, -1.38806626,  9.6220697 ,
        -3.41469895],
       [ 3.59281756,  2.03283477, -0.87354049,  3.07536457, 14.11880599,
        -4.04622411],
       ...,
       [-1.25102892,  4.63471853, -1.28373394, -1.21798638, -1.9197901 ,
        -4.87414035],
       [-2.43667219, -2.44563813, -2.47022874,  1.0742471 , -3.57497861,
        -4.51745465],
       [-0.1999    , -0.1999    , -0.1999    ,  7.80873722, -1.        ,
        -1.        ]])

In [None]:
sum(rewards_all[9000:10000])

7313.0

In [None]:
q_table_2, rewards_all_2 = SARSA(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [None]:
q_table_2

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-4.25090989, -1.85770899, -2.03229882, -2.44126085,  9.32857538,
        -5.4554385 ],
       [ 1.13454761, -0.8735832 , -1.40701798,  3.83943155, 13.92323866,
        -1.03001101],
       ...,
       [-1.50881601, -1.27640085, -1.50117238, -1.52673594, -2.95306419,
        -4.18372556],
       [-2.58589572, -2.64346028, -2.62329993, -0.0825706 , -6.3945349 ,
        -3.5891608 ],
       [-0.2614762 , -0.20988713, -0.20881   , 14.50148227, -1.9099    ,
        -2.7199    ]])

In [None]:
sum(rewards_all_2[9000:10000])

7302.0

In [None]:
play_multiple_times(env, q_table, 1000)

Number of successes: 1000/1000
Average number of steps: 13.166


In [None]:
play_multiple_times(env, q_table_2, 1000)

Number of successes: 1000/1000
Average number of steps: 13.085


Nhận xét trên môi trường Taxi-v3:

+ Number of successes của SARSA và Q-Learning như nhau: 1000/1000 - tối đa.
+ Average number of steps của 2 thuật toán cũng gần như nhau: 13.166 và 13.085

# Nhận xét chung

+ Thử nghiệm thuật toán Q-learning và SARSA qua 3 môi trường FrozenLake-v0, FrozenLake8x8-v0, Taxi-v3 thì Number of successes được cải thiện rõ rệt và Average number of steps được rút ngắn rất nhiều. 
+ Có thể thấy FrozenLake8x8 và Taxi-v3 là 2 môi trường mà Q-learning và SARSA chạy trên đó vượt trội rất nhiều so với FrozenLake-v0