# Deep Reinforcement Learning : Q Learning

In [2]:
import numpy as np
import gym
import random

In [3]:
# Build the Gym environment the agent will be trained and evaluated on.
ENV_ID = "FrozenLake-v0"
env = gym.make(ENV_ID)


In [4]:
# Q-table dimensions, read from the environment's discrete spaces:
# one row per observable state, one column per available action.
state_size = env.observation_space.n
action_size = env.action_space.n


In [5]:
# The Q-table holds one value per (state, action) pair; every entry starts at 0.
q_shape = (state_size, action_size)
qtable = np.zeros(q_shape)
print(qtable)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [6]:
# ---- Training hyper-parameters ----
episodes = 10000        # number of training episodes
steps = 99              # upper bound on the number of steps in one episode
learning_rate = 0.8     # alpha: step size of each Q-value update
gamma = 0.95            # factor applied to the best next-state Q-value

# ---- Exploration (epsilon-greedy) parameters ----
max_epsilon = 1.0       # exploration probability at the start of training
min_epsilon = 0.01      # floor for the exploration probability
decay_rate = 0.01       # exponential decay rate of the exploration probability
# Start fully exploratory: nothing is known about the environment yet.
epsilon = max_epsilon



# The Q-Learning Algorithm

# Update rule: Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]

In [8]:
# Train the agent with tabular Q-learning under an epsilon-greedy policy.
# `rewards` collects the total reward obtained in each episode; `qtable` and
# `epsilon` are updated in place as training progresses.
rewards = []
for episode in range(episodes):
    # Every episode starts from a freshly reset environment.
    state = env.reset()
    total_rewards = 0

    for step in range(steps):
        # Exploration/exploitation trade-off: draw a number in [0, 1].
        # Above epsilon -> exploit (take the action with the largest Q-value
        # for this state); otherwise -> explore by sampling a random action.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r).
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of every action available in s'.
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward
        state = new_state

        # The environment signalled the end of the episode.
        if done:
            break

    # Reduce epsilon (less and less exploration is needed). The exponent uses
    # the number of completed episodes, hence `episode + 1`; the loop variable
    # itself is left untouched.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / episodes))
print(qtable)

        


Score over time: 0.4893
[[9.70269484e-02 3.68888315e-02 6.28365070e-02 2.41343950e-02]
 [1.56377284e-02 7.50618427e-03 1.21929087e-02 6.14046073e-02]
 [1.33762271e-02 1.31608635e-02 6.43752586e-03 5.83978157e-02]
 [9.93020263e-03 5.51384956e-03 6.89697225e-04 2.45063094e-02]
 [8.83389833e-02 3.85891431e-02 1.55328252e-02 5.95353822e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.98386964e-02 2.54956041e-08 3.19648675e-05 2.43098834e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.60344001e-02 6.96952900e-02 7.70178065e-03 1.59021318e-01]
 [1.88018692e-02 3.62851984e-01 6.57140951e-05 1.58357544e-02]
 [9.86838568e-03 1.53674287e-02 8.02293169e-03 1.38603988e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.19083236e-01 1.39791309e-02 5.71207485e-01 1.45333418e-02]
 [5.01863364e-01 9.18957286e-01 3.13601624e-01 1.82567520e-01]
 [0.00000000e+00 0.00000000e+00

In [10]:
# Watch the trained agent play a few episodes. Actions are always the greedy
# choice from the learned Q-table (no exploration, no further learning).
for episode in range(5):
    # Each episode resets the environment itself, so no extra reset is needed
    # before the loop.
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(steps):
        env.render()
        # Take the action (index) with the largest Q-value for this state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            # Render once more so the state the episode ended in is visible,
            # then report how many steps were taken and the last reward.
            env.render()
            print("Number of steps", step + 1, "| final reward", reward)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  