<a href="https://colab.research.google.com/github/chi-yan/notebooks/blob/master/Reinforcement_Learning_OpenAI_Gym_and_Q_learning_for_%22Frozen_Lake%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on the "Tutorial: An Introduction to Reinforcement Learning Using OpenAI Gym" by Joy Zhang

The original tutorial uses the "Taxi" environment; the code below has been adapted for the "Frozen Lake" environment.

https://www.gocoder.one/blog/rl-tutorial-with-openai-gym

https://colab.research.google.com/drive/1gS2aJo711XJodqqPIVIbzgX1ktZzS8d8?usp=sharing

In [14]:
import numpy as np
import gym
import random

def _train_q_table(env, num_episodes, max_steps, learning_rate,
                   discount_rate, decay_rate):
    """Learn a Q-table for ``env`` with epsilon-greedy tabular Q-learning.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n``, ``action_space.sample()``, ``reset()``
            (returning a state index) and ``step(action)`` (returning
            ``(new_state, reward, done, info)``).
        num_episodes: Number of training episodes to run.
        max_steps: Maximum number of steps taken in a single episode.
        learning_rate: Weight given to each new temporal-difference error.
        discount_rate: Discount applied to the best next-state value.
        decay_rate: Rate of the exponential epsilon decay per episode.

    Returns:
        The learned table, one row per state and one column per action.
    """
    qtable = np.zeros((env.observation_space.n, env.action_space.n))
    epsilon = 1

    for episode in range(num_episodes):
        state = env.reset()

        for _ in range(max_steps):
            # exploration-exploitation tradeoff
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # explore
            else:
                action = np.argmax(qtable[state, :])  # exploit

            new_state, reward, done, _info = env.step(action)

            # Q-learning update: move the estimate towards the observed
            # reward plus the discounted best value of the next state.
            td_target = reward + discount_rate * np.max(qtable[new_state, :])
            qtable[state, action] += learning_rate * (
                td_target - qtable[state, action]
            )

            state = new_state
            if done:
                break

        # Epsilon shrinks exponentially with the episode index, so later
        # episodes exploit the table more often than they explore.
        epsilon = np.exp(-decay_rate * episode)

    return qtable


def _watch_episode(env, qtable, max_steps):
    """Play one greedy episode, printing and rendering every step.

    Args:
        env: Environment to play in (same interface as for training).
        qtable: Table indexed ``[state, action]`` used to pick actions.
        max_steps: Maximum number of steps to play.
    """
    # The goal is taken to be the highest-numbered state instead of the
    # original hard-coded 15.
    # NOTE(review): assumes the goal is always the last state - confirm
    # before using a custom map layout.
    goal_state = qtable.shape[0] - 1

    state = env.reset()
    rewards = 0

    for s in range(max_steps):
        print("TRAINED AGENT")
        print(f"Step {s + 1}")

        action = np.argmax(qtable[state, :])
        print("State: ", state)
        print("Action: ", action)
        new_state, reward, done, _info = env.step(action)
        print("New State: ", new_state)

        rewards += reward

        env.render()
        print(f"score: {rewards}")
        state = new_state

        if done:
            if new_state == goal_state:
                print("Reached goal")
            else:
                print("Fell into hole")
            break
    else:
        # The loop ended without the environment reporting `done`; say so
        # rather than finishing silently.
        print("Ran out of steps before the episode ended")


def main():
    """Train a Q-learning agent on FrozenLake and replay one greedy episode.

    Prints the learned Q-table, waits for the user to press Enter, then
    steps through a single episode using the greedy policy.
    """
    # hyperparameters
    learning_rate = 0.9
    discount_rate = 0.95
    decay_rate = 0.00005

    # training variables
    num_episodes = 50000
    max_steps = 99  # per episode

    # create FrozenLake environment
    env = gym.make('FrozenLake-v0')
    try:
        qtable = _train_q_table(
            env, num_episodes, max_steps,
            learning_rate, discount_rate, decay_rate,
        )

        print(qtable)
        print(f"Training completed over {num_episodes} episodes")
        input("Press Enter to watch trained agent...")

        # watch trained agent
        _watch_episode(env, qtable, max_steps)
    finally:
        # Close the environment even if training or the prompt is
        # interrupted.
        env.close()

# Run training and the replay only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[[1.20122338e-01 1.21462371e-01 1.18914159e-01 8.26207754e-02]
 [5.53661973e-03 1.71112713e-02 1.54090577e-02 5.72216330e-02]
 [4.32662070e-02 4.97717083e-02 4.38507792e-02 5.66620404e-02]
 [5.20101650e-03 7.19695931e-03 2.30884708e-02 4.53446474e-02]
 [1.73971841e-01 3.57729856e-02 2.58022726e-02 1.60294975e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.15987876e-03 3.86168596e-05 4.01584264e-02 1.79504738e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.64277692e-01 1.31247795e-02 4.09835225e-03 1.51198238e-01]
 [5.57797698e-04 2.59010257e-01 5.90020218e-02 1.30634591e-02]
 [1.59961364e-02 8.00504352e-02 8.99638811e-05 4.20651807e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.13183568e-03 7.97133615e-01 2.84836208e-01 6.51564845e-02]
 [2.84432800e-01 6.23151537e-01 7.77025364e-01 6.64766639e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [19]:
import numpy as np
import gym
import random

def _train_q_table(env, num_episodes, max_steps, learning_rate,
                   discount_rate, decay_rate):
    """Learn a Q-table for ``env`` with epsilon-greedy tabular Q-learning.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n``, ``action_space.sample()``, ``reset()``
            (returning a state index) and ``step(action)`` (returning
            ``(new_state, reward, done, info)``).
        num_episodes: Number of training episodes to run.
        max_steps: Maximum number of steps taken in a single episode.
        learning_rate: Weight given to each new temporal-difference error.
        discount_rate: Discount applied to the best next-state value.
        decay_rate: Rate of the exponential epsilon decay per episode.

    Returns:
        The learned table, one row per state and one column per action.
    """
    qtable = np.zeros((env.observation_space.n, env.action_space.n))
    epsilon = 1

    for episode in range(num_episodes):
        state = env.reset()

        for _ in range(max_steps):
            # exploration-exploitation tradeoff
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # explore
            else:
                action = np.argmax(qtable[state, :])  # exploit

            new_state, reward, done, _info = env.step(action)

            # Q-learning update: move the estimate towards the observed
            # reward plus the discounted best value of the next state.
            td_target = reward + discount_rate * np.max(qtable[new_state, :])
            qtable[state, action] += learning_rate * (
                td_target - qtable[state, action]
            )

            state = new_state
            if done:
                break

        # Epsilon shrinks exponentially with the episode index, so later
        # episodes exploit the table more often than they explore.
        epsilon = np.exp(-decay_rate * episode)

    return qtable


def _evaluate_policy(env, qtable, trials, max_steps):
    """Play ``trials`` greedy episodes and record how each one ended.

    Args:
        env: Environment to play in (same interface as for training).
        qtable: Table indexed ``[state, action]`` used to pick actions.
        trials: Number of episodes to play.
        max_steps: Maximum number of steps per episode.

    Returns:
        A ``(turns, results)`` pair of lists with one entry per trial.
        ``turns`` holds the zero-based index of the last step taken and
        ``results`` holds 1 when the episode ended on the goal state,
        otherwise 0.
    """
    # The goal is taken to be the highest-numbered state instead of the
    # original hard-coded 15.
    # NOTE(review): assumes the goal is always the last state - confirm
    # before using a custom map layout.
    goal_state = qtable.shape[0] - 1

    turns = []
    results = []

    for _ in range(trials):
        state = env.reset()

        for s in range(max_steps):
            action = np.argmax(qtable[state, :])
            state, _reward, done, _info = env.step(action)

            if done:
                turns.append(s)
                results.append(1 if state == goal_state else 0)
                break
        else:
            # The step budget ran out before the environment reported
            # `done`. Record the trial as not won so every trial
            # contributes exactly one entry instead of being dropped.
            turns.append(max_steps - 1)
            results.append(0)

    return turns, results


def main():
    """Train a Q-learning agent on FrozenLake and score it over many games.

    Prints the learned Q-table, waits for the user to press Enter, then
    plays 1000 greedy episodes and prints the per-game step indices and
    the number of games won.
    """
    # hyperparameters
    learning_rate = 0.9
    discount_rate = 0.95
    decay_rate = 0.00005

    # training variables
    num_episodes = 50000
    max_steps = 99  # per episode

    # evaluation variables
    trials = 1000

    # create FrozenLake environment
    env = gym.make('FrozenLake-v0')
    try:
        qtable = _train_q_table(
            env, num_episodes, max_steps,
            learning_rate, discount_rate, decay_rate,
        )

        print(qtable)
        print(f"Training completed over {num_episodes} episodes")
        input("Press Enter to watch trained agent...")

        # evaluate trained agent
        turns, results = _evaluate_policy(env, qtable, trials, max_steps)
        print(turns)
        print("Games won: ", sum(results))
    finally:
        # Close the environment even if training or the prompt is
        # interrupted.
        env.close()

# Run training and the evaluation only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[[4.38913142e-01 1.38564315e-01 1.21989625e-01 9.69939959e-02]
 [3.13435477e-02 4.98928512e-02 4.78817730e-04 3.82748805e-02]
 [1.79671057e-03 1.06278496e-02 3.63302541e-02 3.75980182e-02]
 [3.58016920e-03 6.60611953e-07 3.32299448e-02 3.53775038e-02]
 [5.57064712e-01 6.45881800e-02 4.79903343e-02 4.21381287e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.03029915e-02 8.65254806e-06 2.05249355e-04 1.80063427e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.26139494e-01 5.71522657e-01 1.91137389e-05 5.54814324e-01]
 [4.54540367e-01 5.29685828e-01 6.50416188e-04 4.33323407e-02]
 [8.98994203e-01 8.20963986e-02 1.89258867e-08 2.26425337e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.36613243e-01 2.78713993e-01 7.33564195e-01 5.41496509e-06]
 [4.11887433e-01 9.75772360e-01 1.20591784e-01 9.38693190e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000