In [1]:
import gym
import numpy as np
import math
import random

In [2]:
env = gym.make('CartPole-v0')

In [3]:
print(env.action_space)

Discrete(2)


In [4]:
print(env.observation_space)

Box(4,)


In [5]:
print(env.observation_space.low)

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]


In [6]:
print(env.observation_space.high)

[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [7]:
NUM_BUCKETS = (1, 1, 6, 3)

In [8]:
NUM_ACTIONS = env.action_space.n

In [9]:
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))

In [10]:
STATE_BOUNDS[1] = [-0.5, 0.5]

In [11]:
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]

In [12]:
print(STATE_BOUNDS)

[(-4.8, 4.8), [-0.5, 0.5], (-0.41887903, 0.41887903), [-0.8726646259971648, 0.8726646259971648]]


In [13]:
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))

In [14]:
print(q_table.shape)

(1, 1, 6, 3, 2)


In [15]:
print(q_table)

[[[[[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]]]]


In [16]:
EXPLORE_RATE_MIN = 0.01

In [17]:
LEARNING_RATE_MIN = 0.1

In [18]:
def get_explore_rate(t):
    return max(EXPLORE_RATE_MIN, min(1, 1.0 - math.log10((t+1)/25)))

In [19]:
def get_learning_rate(t):
    return max(LEARNING_RATE_MIN, min(0.5, 1.0 - math.log10((t+1)/25)))

In [21]:
def select_action(state, explore_rate):
    if random.random() < explore_rate:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[state])
        
    return action

In [24]:
def state_to_bucket(state):
    
    bucket_indices = []
    
    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            bucket_index = 0
            
        elif state[i] >= STATE_BOUNDS[i][1]:
            bucket_index = NUM_BUCKETS[i] - 1
            
        else:
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            
            offset = (NUM_BUCKETS[i] - 1) * STATE_BOUNDS[i][0] / bound_width
            scaling = (NUM_BUCKETS[i] - 1) / bound_width
            
            bucket_index = int(round(scaling * state[i] - offset))
            
        bucket_indices.append(bucket_index)
        
    return tuple(bucket_indices)

In [28]:
def simulate():
    
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    
    discount_factor = 0.99
    num_streaks = 0
    
    for episode in range(1000):
        
        observ = env.reset()
        
        state_0 = state_to_bucket(observ)
        
        for t in range(250):
            
            env.render()
            
            action = select_action(state_0, explore_rate)
            
            observ, reward, done, _ = env.step(action)
            
            state = state_to_bucket(observ)
            
            best_q = np.amax(q_table[state])
            
            q_table[state_0 + (action,)] += \
                        learning_rate * (reward + discount_factor * (best_q) - q_table[state_0 + (action,)])
            
            state_0 = state
            
            print("\nEpisode = %d" % episode)
            print("t = %d" % t)
            print("Action: %d" % action)
            print("State: %s" % str(state))
            print("Reward: %f" % reward)
            print("Best Q: %f" % best_q)
            print("Explore Rate: %f" % explore_rate)
            print("Learning Rate: %f" % learning_rate)
            print("Streaks: %d" % num_streaks)
            
            print('')
            
            if done:
                print("Episode %d finished after %f time steps" % (episode, t))
                
                if (t >= 199):
                    num_streaks +=1
                else:
                    num_streaks = 0
                break
                
                if num_streaks > 120:
                    break
                
                explore_rate = get_explore_rate(episode)
                learning_rate = get_learning_rate(episode)
                    

In [29]:
simulate()


Episode = 0
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.360911
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 0
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.419896
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 0
t = 2
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.419896
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 0
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.419896
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 0
t = 4
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 75.207919
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 0
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 75.207919
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 0
t = 6
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 75.207919
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0



Episode = 3
t = 15
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.947928
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 3
t = 16
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 75.388722
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 3
t = 17
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.947928
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 3
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 76.283585
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 3
t = 19
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 76.283585
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 3
t = 20
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.947928
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 3
t = 21
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.947928
Explore Rate: 1.000000
Learning Rate: 0.500000
Stre


Episode = 6
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 77.197087
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 6
t = 1
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 77.215288
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 6
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 77.320111
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 6
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.115901
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 6
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 77.389126
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 6
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.365568
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 6
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 77.490519
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0



Episode = 8
t = 29
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 65.737168
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 8
t = 30
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 65.737168
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 8
t = 31
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 65.908482
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 8
t = 32
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 65.908482
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 8
t = 33
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 65.908482
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 8
t = 34
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 31.530137
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 8
t = 35
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 31.530137
Explore Rate: 1.000000
Learning Rate: 0.500000
Stre


Episode = 10
t = 41
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 71.467294
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 10
t = 42
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 71.467294
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 10
t = 43
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 71.467294
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 10
t = 44
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 32.141508
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 10
t = 45
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 32.141508
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 10
t = 46
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 32.480801
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 10 finished after 46.000000 time steps

Episode = 11
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.90238


Episode = 14
t = 7
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.470746
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 14
t = 8
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.588392
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 14
t = 9
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.705450
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 14
t = 10
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.821923
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 14
t = 11
Action: 0
State: (0, 0, 1, 1)
Reward: 1.000000
Best Q: 52.162193
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 14
t = 12
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 56.369601
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 14
t = 13
Action: 0
State: (0, 0, 1, 1)
Reward: 1.000000
Best Q: 54.484049
Explore Rate: 1.000000
Learning Rate: 0.500000



Episode = 17
t = 9
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 75.925247
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 17
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 75.925247
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 17
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 75.925247
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 17
t = 12
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 56.574528
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 17
t = 13
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 56.574528
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 17 finished after 13.000000 time steps

Episode = 18
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.473569
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 18
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 76.364289



Episode = 21
t = 17
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 57.436545
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 21
t = 18
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 57.436545
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 21
t = 19
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 57.649362
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 21 finished after 19.000000 time steps

Episode = 22
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.914615
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 22
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 76.471257
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 22
t = 2
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 76.471257
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 22
t = 3
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.972987
E

Best Q: 76.533328
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 25
t = 8
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.650661
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 25
t = 9
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.650661
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 25
t = 10
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.767408
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 25
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.883571
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 25
t = 12
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 33.941817
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 25
t = 13
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 33.941817
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 25 finished after 13.000000 time steps

Episode


Episode = 27
t = 34
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.544379
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 27
t = 35
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.544379
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 27
t = 36
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.651657
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 27
t = 37
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.242608
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 27
t = 38
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.385141
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 27
t = 39
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.921949
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 27
t = 40
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.921949
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 30
t = 9
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 78.474105
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 30
t = 10
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 78.474105
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 30
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 78.581734
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 30
t = 12
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 58.071810
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 30
t = 13
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 58.071810
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 30
t = 14
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 58.174682
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 30
t = 15
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 58.332372
Explore Rate: 1.000000
Learning Rate: 0.50000


Episode = 33
t = 4
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 79.049486
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 33
t = 5
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 79.049486
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 33
t = 6
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 79.154238
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 33
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 74.707501
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 33
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 74.707501
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 33
t = 9
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 74.833964
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 33
t = 10
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 79.154238
Explore Rate: 1.000000
Learning Rate: 0.500000
Str


Episode = 36
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.406999
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 36
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.406999
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 36
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.509964
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 36
t = 5
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.509964
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 36
t = 6
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.612415
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 36
t = 7
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.633112
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 36
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.724597
Explore Rate: 1.000000
Learning Rate: 0.500000
Stre

Streaks: 0


Episode = 39
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 79.144797
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 39
t = 2
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 79.144797
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 39
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 79.144797
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 39
t = 4
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.486174
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 39
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.486174
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 39
t = 6
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.486174
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 39
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 77.486174
Explore Rate: 1.000000
Learning Rate: 0


Episode = 42
t = 29
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 80.147030
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 42
t = 30
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 80.246295
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 42
t = 31
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 80.246295
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 42
t = 32
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 79.748004
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 42
t = 33
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 79.748004
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 42
t = 34
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 36.868584
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 42
t = 35
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 33.705655
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 45
t = 34
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 74.361992
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 45
t = 35
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 74.361992
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 45
t = 36
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 74.361992
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 45
t = 37
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 74.361992
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 45
t = 38
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 74.490182
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 45
t = 39
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 59.651286
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 45
t = 40
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 59.651286
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 49
t = 7
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.866135
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 49
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.675887
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 49
t = 9
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.866135
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 49
t = 10
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.866135
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 49
t = 11
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.966805
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 49
t = 12
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.966805
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 49
t = 13
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.066971
Explore Rate: 1.000000
Learning Rate: 0.500000


Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 6
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 80.530285
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 7
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 80.627634
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 8
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.158820
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 9
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.158820
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 10
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.248026
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 11
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.248026
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 52
t = 12
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q:


Episode = 55
t = 19
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.140805
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 55
t = 20
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.140805
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 55
t = 21
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.230100
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 55
t = 22
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 81.700998
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 55
t = 23
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.121136
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 55
t = 24
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.121136
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 55
t = 25
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.210530
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 57
t = 7
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.861211
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 57
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.861211
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 57
t = 9
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.956905
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 57
t = 10
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.052120
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 57
t = 11
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.052120
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 57
t = 12
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 80.200010
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 57
t = 13
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.052120
Explore Rate: 1.000000
Learning Rate: 0.500000



Episode = 59
t = 9
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 60.946111
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 59
t = 10
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 61.141380
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 59 finished after 10.000000 time steps

Episode = 60
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.003874
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 60
t = 1
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.003874
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 60
t = 2
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.093855
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 60
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.183385
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 60
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.183385
Exp


Episode = 63
t = 18
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.989135
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 63
t = 19
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 80.460557
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 63
t = 20
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.989135
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 63
t = 21
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 81.809901
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 63
t = 22
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 81.809901
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 63
t = 23
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 81.809901
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 63
t = 24
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 81.900851
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 66
t = 6
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.184866
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 66
t = 7
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 83.029192
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 66
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.337059
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 66
t = 9
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 81.580020
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 66
t = 10
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.337059
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 66
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 81.580020
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 66
t = 12
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.337059
Explore Rate: 1.000000
Learning Rate: 0.500000
S


Episode = 69
t = 1
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 81.239536
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 69
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.196146
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 69
t = 3
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.196146
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 69
t = 4
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.280165
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 69
t = 5
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.280165
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 69
t = 6
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.363764
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 69
t = 7
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.363764
Explore Rate: 1.000000
Learning Rate: 0.500000
Stre


Episode = 70
t = 24
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 41.393075
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 70
t = 25
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 41.393075
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 70 finished after 25.000000 time steps

Episode = 71
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.982949
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 71
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 82.882679
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 71
t = 2
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 82.882679
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 71
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 72.122649
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 71
t = 4
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 72.122649
Ex


Episode = 73
t = 41
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 71.938219
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 73
t = 42
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 72.078528
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 73
t = 43
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 72.218135
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 73
t = 44
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 72.357044
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 73
t = 45
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 42.222972
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 73
t = 46
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 42.222972
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 73 finished after 46.000000 time steps

Episode = 74
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.08878


Episode = 76
t = 26
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.915157
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 76
t = 27
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 42.448826
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 76
t = 28
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 42.448826
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 76 finished after 28.000000 time steps

Episode = 77
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.487523
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 77
t = 1
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.915157
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 77
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.487523
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 77
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 80.283903
E


Episode = 80
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 69.686999
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 80
t = 9
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 69.686999
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 80
t = 10
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 69.686999
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 80
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 69.838564
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 80
t = 12
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 43.322320
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 80
t = 13
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 43.322320
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 80 finished after 13.000000 time steps

Episode = 81
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.570086


Learning Rate: 0.500000
Streaks: 0


Episode = 83
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.252766
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 83
t = 6
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.252766
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 83
t = 7
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.252766
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 83
t = 8
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 43.605708
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 83
t = 9
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 43.605708
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 83 finished after 9.000000 time steps

Episode = 84
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.102800
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 84
t = 1
Action: 0
State: (0, 0, 3, 2)
Rewa


Episode = 86
t = 12
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.597445
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 86
t = 13
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.674458
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 86
t = 14
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.751086
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 86
t = 15
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.751086
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 86
t = 16
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.827330
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 86
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.827330
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 86
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 83.776239
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 90
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 84.656126
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 90
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.959361
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 90
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 83.829866
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 90
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 83.829866
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 90
t = 5
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 83.868379
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 90
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 83.929781
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 90
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.739664
Explore Rate: 1.000000
Learning Rate: 0.500000
Stre


Episode = 93
t = 14
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.457808
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 93
t = 15
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.462478
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 93
t = 16
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.535166
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 93
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.568811
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 93
t = 18
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.010132
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 93
t = 19
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.010132
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 93
t = 20
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.010132
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 95
t = 15
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 46.194600
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 95 finished after 15.000000 time steps

Episode = 96
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.341727
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 96
t = 1
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.420019
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 96
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.420019
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 96
t = 3
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.461122
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 96
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.518265
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 96
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 81.554876
Exp


Episode = 98
t = 16
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 46.731309
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 98
t = 17
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 46.731309
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 98
t = 18
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 46.997653
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 98 finished after 18.000000 time steps

Episode = 99
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.399205
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 99
t = 1
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.851102
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 99
t = 2
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.851102
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 99
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.851102
E


Episode = 103
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.730738
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 103
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 84.830245
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 103
t = 2
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 84.830245
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 103
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 84.906093
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 103
t = 4
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 83.961715
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 103
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 83.961715
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 103
t = 6
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 83.961715
Explore Rate: 1.000000
Learning Rate: 0.5000

Best Q: 85.165864
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 3
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.165864
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.165864
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 79.186774
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 6
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 79.186774
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 79.186774
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 8
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 79.186774
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 107
t = 9
Action: 0
State: (0, 0, 2, 0)
Re


Episode = 110
t = 13
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 85.288718
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 110
t = 14
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 85.288718
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 110
t = 15
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 85.288718
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 110
t = 16
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 80.312249
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 110
t = 17
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 80.312249
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 110
t = 18
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 80.410687
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 110
t = 19
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 80.410687
Explore Rate: 1.000000
Learning Rate:


Episode = 113
t = 13
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 84.877413
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 113
t = 14
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 84.953026
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 113
t = 15
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 65.203386
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 113
t = 16
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 65.203386
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 113
t = 17
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 65.203386
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 113
t = 18
Action: 0
State: (0, 0, 1, 1)
Reward: 1.000000
Best Q: 63.146749
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 113 finished after 18.000000 time steps

Episode = 114
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 


Episode = 117
t = 1
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.594848
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 117
t = 2
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.594848
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 117
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.671874
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 117
t = 4
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.748515
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 117
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.748515
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 117
t = 6
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.824772
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 117
t = 7
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 49.539679
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 120
t = 11
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.183984
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 120
t = 12
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.253064
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 120
t = 13
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.253064
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 120
t = 14
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.190299
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 120
t = 15
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.253064
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 120
t = 16
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.253064
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 120
t = 17
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.321799
Explore Rate: 1.000000
Learning Rate:


Episode = 123
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.720781
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 123
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.625848
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 123
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.720781
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 123
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.739711
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 123
t = 6
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.739711
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 123
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.739711
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 123
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.739711
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 125
t = 15
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.078094
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 125
t = 16
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.147704
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 125
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.216965
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 125
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 85.291326
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 125
t = 19
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 85.291326
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 125
t = 20
Action: 0
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 85.364870
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 125
t = 21
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.862147
Explore Rate: 1.000000
Learning Rate:


Episode = 126
t = 34
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.231750
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 126
t = 35
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.231750
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 126
t = 36
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.231750
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 126
t = 37
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.231750
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 126
t = 38
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.231750
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 126
t = 39
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 66.352009
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 126
t = 40
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 66.352009
Explore Rate: 1.000000
Learning Rate:


Episode = 130
t = 17
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 51.317371
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 130
t = 18
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 44.807843
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 130 finished after 18.000000 time steps

Episode = 131
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.583811
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 131
t = 1
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.583811
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 131
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.650892
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 131
t = 3
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.681486
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 131
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 86.7


Episode = 133
t = 16
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.007014
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 133
t = 17
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.007014
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 133
t = 18
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.437209
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 133
t = 19
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.437209
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 133
t = 20
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 82.525023
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 133
t = 21
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.007014
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 133
t = 22
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.007014
Explore Rate: 1.000000
Learning Rate:

State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.127111
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 135
t = 24
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 51.802980
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 135
t = 25
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 51.802980
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 135
t = 26
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 52.043965
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 135
t = 27
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 44.807843
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 135 finished after 27.000000 time steps

Episode = 136
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.427915
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 136
t = 1
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.485775
Explore Rate: 1.000000



Episode = 138
t = 35
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.165934
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 138
t = 36
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.235104
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 138
t = 37
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.235104
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 138
t = 38
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.235104
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 138
t = 39
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.303929
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 138
t = 40
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 67.460590
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 138
t = 41
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 67.460590
Explore Rate: 1.000000
Learning Rate:


Episode = 141
t = 37
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 44.807843
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 141
t = 38
Action: 0
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 44.807843
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 141
t = 39
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 45.083804
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 141
t = 40
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 45.083804
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 141
t = 41
Action: 0
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 45.083804
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 141
t = 42
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 52.283745
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 141
t = 43
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 52.283745
Explore Rate: 1.000000
Learning Rate:


Episode = 144
t = 1
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.808731
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 144
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.851129
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 144
t = 3
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.911873
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 144
t = 4
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.920743
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 144
t = 5
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.976704
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 144
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.008840
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 144
t = 7
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 88.240033
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 146
t = 10
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 88.200206
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 146
t = 11
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 88.259205
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 146
t = 12
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 88.262491
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 146
t = 13
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 88.321178
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 146
t = 14
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 52.940383
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 146
t = 15
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 52.940383
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 146
t = 16
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 53.085347
Explore Rate: 1.000000
Learning Rate:


Episode = 149
t = 23
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 67.623287
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 149 finished after 23.000000 time steps

Episode = 150
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.338154
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 150
t = 1
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.923426
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 150
t = 2
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.923426
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 150
t = 3
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.923426
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 150
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.338154
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 150
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.22


Episode = 153
t = 12
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.142997
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 153
t = 13
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.142997
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 153
t = 14
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 68.133449
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 153
t = 15
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 68.133449
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 153
t = 16
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 68.133449
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 153
t = 17
Action: 0
State: (0, 0, 1, 1)
Reward: 1.000000
Best Q: 65.241826
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 153 finished after 17.000000 time steps

Episode = 154
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 


Episode = 155
t = 13
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.374792
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 155
t = 14
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.374792
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 155
t = 15
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.437918
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 155
t = 16
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.437918
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 155
t = 17
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.500729
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 155
t = 18
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.500729
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 155
t = 19
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 68.292782
Explore Rate: 1.000000
Learning Rate:


Episode = 158
t = 31
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.753672
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 158
t = 32
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.753672
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 158
t = 33
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.753672
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 158
t = 34
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.824903
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 158
t = 35
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.824903
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 158
t = 36
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.824903
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 158
t = 37
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 68.516839
Explore Rate: 1.000000
Learning Rate:


Episode = 160
t = 23
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.602902
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 160
t = 24
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.639435
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 160
t = 25
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 49.059646
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 160
t = 26
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 54.694302
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 160
t = 27
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 52.103502
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 160
t = 28
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 52.103502
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 160
t = 29
Action: 0
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 52.103502
Explore Rate: 1.000000
Learning Rate:


Episode = 163
t = 15
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 81.393206
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 163
t = 16
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.066846
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 163
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.066846
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 163
t = 18
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 85.557670
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 163
t = 19
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 85.557670
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 163
t = 20
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 85.557670
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 163
t = 21
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 54.920830
Explore Rate: 1.000000
Learning Rate:


Episode = 167
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.003597
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 167
t = 1
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.208974
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 167
t = 2
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.208974
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 167
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.208974
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 167
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.262929
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 167
t = 5
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.316614
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 167
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.316614
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 169
t = 21
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.077817
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 169
t = 22
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.147427
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 169
t = 23
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.147427
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 169
t = 24
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.216690
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 169
t = 25
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.285607
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 169
t = 26
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 56.256413
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 169
t = 27
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 56.256413
Explore Rate: 1.000000
Learning Rate:


Episode = 173
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.254388
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 173
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.254388
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 173
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.308117
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 173
t = 5
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.361576
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 173
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.361576
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 173
t = 7
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.234164
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 173
t = 8
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.234164
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 174
t = 45
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 56.849853
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 174 finished after 45.000000 time steps

Episode = 175
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.787280
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 175
t = 1
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.838344
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 175
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.838344
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 175
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.691930
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 175
t = 4
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.691930
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 175
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 86.75

Best Q: 90.139408
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.774557
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.139408
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.774557
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.774557
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 9
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.835685
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 10
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.896506
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 178
t = 11
Action: 1
State: (0, 0, 3, 2)



Episode = 183
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.713726
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 183
t = 3
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.713726
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 183
t = 4
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.765157
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 183
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.600379
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 183
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.765157
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 183
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.600379
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 183
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.600379
Explore Rate: 1.000000
Learning Rate: 0.5000


Episode = 186
t = 18
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.284308
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 186
t = 19
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.337886
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 186
t = 20
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.391197
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 186
t = 21
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.391197
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 186
t = 22
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.391197
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 186
t = 23
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.444241
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 186
t = 24
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.497020
Explore Rate: 1.000000
Learning Rate:


Episode = 188
t = 16
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 70.765333
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 188 finished after 16.000000 time steps

Episode = 189
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.820740
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 189
t = 1
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.871637
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 189
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.871637
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 189
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.377273
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 189
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.702889
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 189
t = 5
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.70


Episode = 190
t = 4
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.950948
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 190
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.950948
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 190
t = 6
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.950948
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 190
t = 7
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.950948
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 190
t = 8
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 71.056948
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 190
t = 9
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 71.056948
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0

Episode 190 finished after 9.000000 time steps

Episode = 191
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.1764


Episode = 194
t = 9
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 88.921507
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 194
t = 10
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.102642
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 194
t = 11
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.102642
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 194
t = 12
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 85.411716
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 194
t = 13
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.102642
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 194
t = 14
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.311666
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 194
t = 15
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.311666
Explore Rate: 1.000000
Learning Rate: 


Episode = 197
t = 9
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.238658
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 197
t = 10
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.454253
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 197
t = 11
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.454253
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 197
t = 12
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.501982
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 197
t = 13
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.505180
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 197
t = 14
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.551055
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 197
t = 15
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.410520
Explore Rate: 1.000000
Learning Rate: 


Episode = 199
t = 14
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.238658
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 199
t = 15
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.287465
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 199
t = 16
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.336028
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 199
t = 17
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.384347
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 199
t = 18
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.384347
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 199
t = 19
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.432426
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 199
t = 20
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 71.423792
Explore Rate: 1.000000
Learning Rate:


Episode = 202
t = 17
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.521985
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 202
t = 18
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.569375
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 202
t = 19
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.569375
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 202
t = 20
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.555822
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 202
t = 21
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.555822
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 202
t = 22
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 90.599732
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 202
t = 23
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 90.633396
Explore Rate: 1.000000
Learning Rate:


Episode = 205
t = 15
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.496306
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 205
t = 16
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.496306
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 205
t = 17
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.496306
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 205
t = 18
Action: 1
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.539988
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 205
t = 19
Action: 1
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.565447
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 205
t = 20
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.612620
Explore Rate: 1.000000
Learning Rate: 0.500000
Streaks: 0


Episode = 205
t = 21
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 88.806310
Explore Rate: 1.000000
Learning Rate:


Episode = 208

KeyboardInterrupt: 

In [30]:
env.close()