## Libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from collections import deque
import skimage.measure
import numpy as np
import gym

## Functions

In [2]:
def preprocess(rgb_array):
    '''
    Turn a raw 3D RGB frame into a cropped, down-sampled 2D gray-scale frame.

    Steps: keep rows 30..193, collapse the first three colour channels into a
    single weighted channel, then max-pool over non-overlapping 2x2 blocks.
    '''
    channel_weights = [0.2989, 0.5870, 0.1140]  ## weights applied to channels 0, 1, 2
    cropped = rgb_array[30:194, :, :]  ## crop rows, keep every column and channel
    grayscale = np.dot(cropped[..., :3], channel_weights)  ## weighted sum over the colour axis
    pooled = skimage.measure.block_reduce(grayscale, (2, 2), np.max)  ## 2x2 max pooling
    return pooled

def experience_replay(C, DQ, seq_init, action, reward, seq_update, gamestatus):
    '''
    Record one transition in the replay memory, evicting the oldest entry
    when the memory has grown past its capacity.

    Inputs:
        C = capacity of experience replay (max number of stored transitions)
        DQ = deque object holding the transitions (mutated in place)
        seq_init = preprocessed frame before the action was taken
        action = action taken
        reward = reward received for the action
        seq_update = preprocessed frame after the action (new observation)
        gamestatus = whether end of game or not
    Output:
        the same DQ object, after the append (and at most one eviction)
    '''
    transition = (seq_init, action, reward, seq_update, gamestatus)
    DQ.append(transition)
    over_capacity = len(DQ) > C
    if over_capacity:
        DQ.popleft()  ## oldest transition sits at the left end
    return DQ

def epsilon_generator(start=1, stop=0.1, num=10):
    '''
    a generator that linearly anneals epsilon from start down to stop

    Inputs:
        start = first epsilon value
        stop = final (smallest) epsilon value
        num = number of annealed values to produce (integer)
    Output:
        yields num evenly spaced values running from start to stop, the last
        of which is exactly stop, followed by one extra stop; the generator is
        then exhausted (num + 1 values in total).
        If num < 1 or start < stop, only the single trailing stop is yielded.
    '''
    if num >= 1 and start >= stop:
        if num == 1:
            # a single annealed value: there is no interval to divide up
            yield start
        else:
            step = (start - stop) / (num - 1)
            # derive each value from its index instead of repeatedly subtracting,
            # so rounding error does not accumulate over long schedules
            for i in range(num - 1):
                yield max(stop, start - i * step)
            # the end point is emitted exactly, independent of rounding
            yield stop
    yield stop

## DQN

Initialize replay memory D to capacity N  
Initialize action-value function Q with random weights  
for episode = 1, M do  
    Initialise sequence $s_1 = \lbrace x_1 \rbrace$ and preprocessed sequence $\phi_1 = \phi(s_1)$  
    for t = 1, T do  
        With probability $\epsilon$ select a random action $a_t$  
        otherwise select $a_t = \arg\max_a Q^*(\phi(s_t), a; \theta)$  
        Execute action $a_t$ in emulator and observe reward $r_t$ and image $x_{t+1}$  
        Set $s_{t+1} = s_t, a_t, x_{t+1}$ and preprocess $\phi_{t+1} = \phi(s_{t+1})$  
        Store transition $(\phi_t, a_t, r_t, \phi_{t+1})$ in D  
        Sample random minibatch of transitions $(\phi_j, a_j, r_j, \phi_{j+1})$ from D  
        Set $y_j = r_j$ (for terminal $\phi_{j+1}$)  
        or $y_j = r_j + \gamma \max_{a'} Q(\phi_{j+1}, a'; \theta)$ (for non-terminal $\phi_{j+1}$)  
        Perform a gradient descent step on $(y_j - Q(\phi_j, a_j; \theta))^2$ according to equation 3  
    end for  
end for

In [3]:
# Atari emulator
env = gym.make('Breakout-v0')

# game variables
nb_games = 5  ## number of games to play
time_steps = 500  ## max number of time steps per game

# experience replay variables
N = int(1e6)  ## capacity
D = deque()  ## deque object

# RL vars
anneal_tracker = 0  ## tally of how many total iterations have passed
anneal_stop = 1000  ## nb of steps until annealing stops
epsilon_min = 0.1  ## value epsilon is annealed down to and then held at
gen_epsilon = epsilon_generator(start=1, stop=epsilon_min, num=anneal_stop)  ## prob of choosing random action w/linear annealing
discount = 0.9  ## on future rewards

# CNN setup
# To be completed here

# algorithm
for episode in range(nb_games):
    gamestatus = 'nonterminal'
    raw_frame = env.reset()  ## raw initial frame
    seq_init = preprocess(raw_frame)  ## preprocessed initial sequence

    for t in range(time_steps):

        # show game in real-time
        env.render()

        # linearly anneal epsilon; the default keeps an exhausted generator
        # from raising StopIteration and simply holds epsilon at its floor
        if anneal_tracker <= anneal_stop:
            epsilon = next(gen_epsilon, epsilon_min)
        print('epsilon:', epsilon)
        anneal_tracker += 1

        # take agent-based action every 4 time steps; otherwise push action forward w/out agent computing
        if t % 4 == 0:
            # TODO: once the Q-network exists, make this epsilon-greedy:
            # random action with probability epsilon, else the action w/max Q-value
            action = env.action_space.sample()  # take a random action

        # feedback from emulator
        observation, reward, done, info = env.step(action)

        # preprocess new observation after action
        seq_update = preprocess(observation)

        # flag the transition that ended the game before it is stored
        if done:
            gamestatus = 'terminal'

        # update experience replay (same call whether or not the game ended)
        experience_replay(C=N, DQ=D, seq_init=seq_init,
                          action=action, reward=reward,
                          seq_update=seq_update, gamestatus=gamestatus)

        # diagnostic: number of pixels that differ between the stored before/after frames
        changed_pixels = (D[-1][0] != D[-1][3]).sum()

        # stop if out of lives
        if done:
            print('*step: ', t, '| gamestatus: ', gamestatus, '| len(D):', len(D),
                  '| init != update:', changed_pixels)
            break
        print('step:', t, '| gamestatus:', gamestatus, '| len(D):', len(D),
              '| init != update:', changed_pixels)

        # mini-batch sample of experience replay for ConvNet
        D_size = len(D)
        idx = np.random.choice(D_size, size=min(D_size, 32), replace=False)

        # calculate target
        for i in idx:
            if D[i][4] == 'terminal':
                # terminal transition: target is the reward alone (y_j = r_j in the algorithm above)
                target = D[i][2]
            else:
                # TODO: target = D[i][2] + discount * max_a' Q(D[i][3], a') once the network is in place
                target = D[i][2]

        # SGD update
        # TODO: update weights

        # set new observation as initial sequence
        seq_init = seq_update

[2017-08-23 11:23:23,152] Making new env: Breakout-v0


epsilon: 1
step: 0 | gamestatus: nonterminal | len(D): 1 | init != update: 0
epsilon: 0.9990990990990991
step: 1 | gamestatus: nonterminal | len(D): 2 | init != update: 0
epsilon: 0.9981981981981982
step: 2 | gamestatus: nonterminal | len(D): 3 | init != update: 0
epsilon: 0.9972972972972973
step: 3 | gamestatus: nonterminal | len(D): 4 | init != update: 0
epsilon: 0.9963963963963964
step: 4 | gamestatus: nonterminal | len(D): 5 | init != update: 9
epsilon: 0.9954954954954955
step: 5 | gamestatus: nonterminal | len(D): 6 | init != update: 48
epsilon: 0.9945945945945946
step: 6 | gamestatus: nonterminal | len(D): 7 | init != update: 33
epsilon: 0.9936936936936938
step: 7 | gamestatus: nonterminal | len(D): 8 | init != update: 36
epsilon: 0.9927927927927929
step: 8 | gamestatus: nonterminal | len(D): 9 | init != update: 33
epsilon: 0.991891891891892
step: 9 | gamestatus: nonterminal | len(D): 10 | init != update: 14
epsilon: 0.9909909909909911
step: 10 | gamestatus: nonterminal | len(D):

step: 86 | gamestatus: nonterminal | len(D): 87 | init != update: 8
epsilon: 0.9216216216216224
step: 87 | gamestatus: nonterminal | len(D): 88 | init != update: 17
epsilon: 0.9207207207207215
step: 88 | gamestatus: nonterminal | len(D): 89 | init != update: 52
epsilon: 0.9198198198198206
step: 89 | gamestatus: nonterminal | len(D): 90 | init != update: 62
epsilon: 0.9189189189189197
step: 90 | gamestatus: nonterminal | len(D): 91 | init != update: 44
epsilon: 0.9180180180180189
step: 91 | gamestatus: nonterminal | len(D): 92 | init != update: 56
epsilon: 0.917117117117118
step: 92 | gamestatus: nonterminal | len(D): 93 | init != update: 17
epsilon: 0.9162162162162171
step: 93 | gamestatus: nonterminal | len(D): 94 | init != update: 35
epsilon: 0.9153153153153162
step: 94 | gamestatus: nonterminal | len(D): 95 | init != update: 63
epsilon: 0.9144144144144153
step: 95 | gamestatus: nonterminal | len(D): 96 | init != update: 59
epsilon: 0.9135135135135144
step: 96 | gamestatus: nontermin

step: 172 | gamestatus: nonterminal | len(D): 173 | init != update: 0
epsilon: 0.8441441441441457
step: 173 | gamestatus: nonterminal | len(D): 174 | init != update: 0
epsilon: 0.8432432432432448
*step:  174 | gamestatus:  terminal | len(D): 175 | init != update: 0
epsilon: 0.842342342342344
step: 0 | gamestatus: nonterminal | len(D): 176 | init != update: 11
epsilon: 0.8414414414414431
step: 1 | gamestatus: nonterminal | len(D): 177 | init != update: 53
epsilon: 0.8405405405405422
step: 2 | gamestatus: nonterminal | len(D): 178 | init != update: 66
epsilon: 0.8396396396396413
step: 3 | gamestatus: nonterminal | len(D): 179 | init != update: 62
epsilon: 0.8387387387387404
step: 4 | gamestatus: nonterminal | len(D): 180 | init != update: 37
epsilon: 0.8378378378378395
step: 5 | gamestatus: nonterminal | len(D): 181 | init != update: 4
epsilon: 0.8369369369369386
step: 6 | gamestatus: nonterminal | len(D): 182 | init != update: 4
epsilon: 0.8360360360360377
step: 7 | gamestatus: nontermi

step: 83 | gamestatus: nonterminal | len(D): 259 | init != update: 54
epsilon: 0.766666666666669
step: 84 | gamestatus: nonterminal | len(D): 260 | init != update: 3
epsilon: 0.7657657657657682
step: 85 | gamestatus: nonterminal | len(D): 261 | init != update: 0
epsilon: 0.7648648648648673
step: 86 | gamestatus: nonterminal | len(D): 262 | init != update: 0
epsilon: 0.7639639639639664
step: 87 | gamestatus: nonterminal | len(D): 263 | init != update: 0
epsilon: 0.7630630630630655
step: 88 | gamestatus: nonterminal | len(D): 264 | init != update: 27
epsilon: 0.7621621621621646
step: 89 | gamestatus: nonterminal | len(D): 265 | init != update: 51
epsilon: 0.7612612612612637
step: 90 | gamestatus: nonterminal | len(D): 266 | init != update: 36
epsilon: 0.7603603603603628
step: 91 | gamestatus: nonterminal | len(D): 267 | init != update: 48
epsilon: 0.7594594594594619
step: 92 | gamestatus: nonterminal | len(D): 268 | init != update: 30
epsilon: 0.758558558558561
step: 93 | gamestatus: non

step: 169 | gamestatus: nonterminal | len(D): 345 | init != update: 5
epsilon: 0.6891891891891924
step: 170 | gamestatus: nonterminal | len(D): 346 | init != update: 4
epsilon: 0.6882882882882915
step: 171 | gamestatus: nonterminal | len(D): 347 | init != update: 8
epsilon: 0.6873873873873906
step: 172 | gamestatus: nonterminal | len(D): 348 | init != update: 12
epsilon: 0.6864864864864897
step: 173 | gamestatus: nonterminal | len(D): 349 | init != update: 8
epsilon: 0.6855855855855888
step: 174 | gamestatus: nonterminal | len(D): 350 | init != update: 4
epsilon: 0.6846846846846879
step: 175 | gamestatus: nonterminal | len(D): 351 | init != update: 8
epsilon: 0.683783783783787
step: 176 | gamestatus: nonterminal | len(D): 352 | init != update: 32
epsilon: 0.6828828828828861
step: 177 | gamestatus: nonterminal | len(D): 353 | init != update: 53
epsilon: 0.6819819819819852
step: 178 | gamestatus: nonterminal | len(D): 354 | init != update: 66
epsilon: 0.6810810810810843
step: 179 | games

step: 16 | gamestatus: nonterminal | len(D): 431 | init != update: 48
epsilon: 0.6117117117117157
step: 17 | gamestatus: nonterminal | len(D): 432 | init != update: 36
epsilon: 0.6108108108108148
step: 18 | gamestatus: nonterminal | len(D): 433 | init != update: 36
epsilon: 0.6099099099099139
step: 19 | gamestatus: nonterminal | len(D): 434 | init != update: 33
epsilon: 0.609009009009013
step: 20 | gamestatus: nonterminal | len(D): 435 | init != update: 27
epsilon: 0.6081081081081121
step: 21 | gamestatus: nonterminal | len(D): 436 | init != update: 3
epsilon: 0.6072072072072112
step: 22 | gamestatus: nonterminal | len(D): 437 | init != update: 0
epsilon: 0.6063063063063103
step: 23 | gamestatus: nonterminal | len(D): 438 | init != update: 0
epsilon: 0.6054054054054094
step: 24 | gamestatus: nonterminal | len(D): 439 | init != update: 9
epsilon: 0.6045045045045085
step: 25 | gamestatus: nonterminal | len(D): 440 | init != update: 54
epsilon: 0.6036036036036077
step: 26 | gamestatus: no

epsilon: 0.534234234234239
step: 103 | gamestatus: nonterminal | len(D): 518 | init != update: 0
epsilon: 0.5333333333333381
step: 104 | gamestatus: nonterminal | len(D): 519 | init != update: 0
epsilon: 0.5324324324324372
step: 105 | gamestatus: nonterminal | len(D): 520 | init != update: 0
epsilon: 0.5315315315315363
step: 106 | gamestatus: nonterminal | len(D): 521 | init != update: 0
epsilon: 0.5306306306306354
step: 107 | gamestatus: nonterminal | len(D): 522 | init != update: 0
epsilon: 0.5297297297297345
step: 108 | gamestatus: nonterminal | len(D): 523 | init != update: 0
epsilon: 0.5288288288288336
step: 109 | gamestatus: nonterminal | len(D): 524 | init != update: 0
epsilon: 0.5279279279279327
step: 110 | gamestatus: nonterminal | len(D): 525 | init != update: 0
epsilon: 0.5270270270270319
step: 111 | gamestatus: nonterminal | len(D): 526 | init != update: 0
epsilon: 0.526126126126131
step: 112 | gamestatus: nonterminal | len(D): 527 | init != update: 2
epsilon: 0.52522522522

epsilon: 0.4567567567567623
step: 189 | gamestatus: nonterminal | len(D): 604 | init != update: 37
epsilon: 0.4558558558558614
step: 190 | gamestatus: nonterminal | len(D): 605 | init != update: 52
epsilon: 0.4549549549549605
step: 191 | gamestatus: nonterminal | len(D): 606 | init != update: 56
epsilon: 0.45405405405405963
step: 192 | gamestatus: nonterminal | len(D): 607 | init != update: 24
epsilon: 0.45315315315315874
step: 193 | gamestatus: nonterminal | len(D): 608 | init != update: 12
epsilon: 0.45225225225225785
step: 194 | gamestatus: nonterminal | len(D): 609 | init != update: 8
epsilon: 0.45135135135135696
step: 195 | gamestatus: nonterminal | len(D): 610 | init != update: 8
epsilon: 0.45045045045045606
step: 196 | gamestatus: nonterminal | len(D): 611 | init != update: 8
epsilon: 0.44954954954955517
step: 197 | gamestatus: nonterminal | len(D): 612 | init != update: 8
epsilon: 0.4486486486486543
step: 198 | gamestatus: nonterminal | len(D): 613 | init != update: 8
epsilon: 

epsilon: 0.3792792792792856
step: 275 | gamestatus: nonterminal | len(D): 690 | init != update: 47
epsilon: 0.3783783783783847
step: 276 | gamestatus: nonterminal | len(D): 691 | init != update: 10
epsilon: 0.37747747747748384
step: 277 | gamestatus: nonterminal | len(D): 692 | init != update: 41
epsilon: 0.37657657657658294
step: 278 | gamestatus: nonterminal | len(D): 693 | init != update: 10
epsilon: 0.37567567567568205
step: 279 | gamestatus: nonterminal | len(D): 694 | init != update: 8
epsilon: 0.37477477477478116
step: 280 | gamestatus: nonterminal | len(D): 695 | init != update: 6
epsilon: 0.37387387387388027
step: 281 | gamestatus: nonterminal | len(D): 696 | init != update: 6
epsilon: 0.3729729729729794
step: 282 | gamestatus: nonterminal | len(D): 697 | init != update: 6
epsilon: 0.3720720720720785
step: 283 | gamestatus: nonterminal | len(D): 698 | init != update: 4
epsilon: 0.3711711711711776
step: 284 | gamestatus: nonterminal | len(D): 699 | init != update: 5
epsilon: 0.

epsilon: 0.30180180180180893
step: 6 | gamestatus: nonterminal | len(D): 776 | init != update: 0
epsilon: 0.30090090090090804
step: 7 | gamestatus: nonterminal | len(D): 777 | init != update: 0
epsilon: 0.30000000000000715
step: 8 | gamestatus: nonterminal | len(D): 778 | init != update: 0
epsilon: 0.29909909909910626
step: 9 | gamestatus: nonterminal | len(D): 779 | init != update: 0
epsilon: 0.29819819819820537
step: 10 | gamestatus: nonterminal | len(D): 780 | init != update: 0
epsilon: 0.2972972972973045
step: 11 | gamestatus: nonterminal | len(D): 781 | init != update: 0
epsilon: 0.2963963963964036
step: 12 | gamestatus: nonterminal | len(D): 782 | init != update: 0
epsilon: 0.2954954954955027
step: 13 | gamestatus: nonterminal | len(D): 783 | init != update: 0
epsilon: 0.2945945945946018
step: 14 | gamestatus: nonterminal | len(D): 784 | init != update: 0
epsilon: 0.2936936936937009
step: 15 | gamestatus: nonterminal | len(D): 785 | init != update: 0
epsilon: 0.2927927927928
step

epsilon: 0.22432432432433225
step: 92 | gamestatus: nonterminal | len(D): 862 | init != update: 0
epsilon: 0.22342342342343136
step: 93 | gamestatus: nonterminal | len(D): 863 | init != update: 0
epsilon: 0.22252252252253046
step: 94 | gamestatus: nonterminal | len(D): 864 | init != update: 0
epsilon: 0.22162162162162957
step: 95 | gamestatus: nonterminal | len(D): 865 | init != update: 0
epsilon: 0.22072072072072868
step: 96 | gamestatus: nonterminal | len(D): 866 | init != update: 27
epsilon: 0.2198198198198278
step: 97 | gamestatus: nonterminal | len(D): 867 | init != update: 48
epsilon: 0.2189189189189269
step: 98 | gamestatus: nonterminal | len(D): 868 | init != update: 36
epsilon: 0.218018018018026
step: 99 | gamestatus: nonterminal | len(D): 869 | init != update: 54
epsilon: 0.2171171171171251
step: 100 | gamestatus: nonterminal | len(D): 870 | init != update: 27
epsilon: 0.21621621621622422
step: 101 | gamestatus: nonterminal | len(D): 871 | init != update: 6
epsilon: 0.2153153

step: 178 | gamestatus: nonterminal | len(D): 948 | init != update: 59
epsilon: 0.14594594594595467
step: 179 | gamestatus: nonterminal | len(D): 949 | init != update: 56
epsilon: 0.14504504504505378
step: 180 | gamestatus: nonterminal | len(D): 950 | init != update: 31
epsilon: 0.14414414414415289
step: 181 | gamestatus: nonterminal | len(D): 951 | init != update: 14
epsilon: 0.143243243243252
step: 182 | gamestatus: nonterminal | len(D): 952 | init != update: 12
epsilon: 0.1423423423423511
step: 183 | gamestatus: nonterminal | len(D): 953 | init != update: 12
epsilon: 0.1414414414414502
step: 184 | gamestatus: nonterminal | len(D): 954 | init != update: 8
epsilon: 0.14054054054054932
step: 185 | gamestatus: nonterminal | len(D): 955 | init != update: 12
epsilon: 0.13963963963964843
step: 186 | gamestatus: nonterminal | len(D): 956 | init != update: 8
epsilon: 0.13873873873874754
step: 187 | gamestatus: nonterminal | len(D): 957 | init != update: 12
epsilon: 0.13783783783784664
step: 

step: 8 | gamestatus: nonterminal | len(D): 1046 | init != update: 10
epsilon: 0.1
step: 9 | gamestatus: nonterminal | len(D): 1047 | init != update: 31
epsilon: 0.1
step: 10 | gamestatus: nonterminal | len(D): 1048 | init != update: 37
epsilon: 0.1
step: 11 | gamestatus: nonterminal | len(D): 1049 | init != update: 56
epsilon: 0.1
step: 12 | gamestatus: nonterminal | len(D): 1050 | init != update: 45
epsilon: 0.1
step: 13 | gamestatus: nonterminal | len(D): 1051 | init != update: 8
epsilon: 0.1
step: 14 | gamestatus: nonterminal | len(D): 1052 | init != update: 12
epsilon: 0.1
step: 15 | gamestatus: nonterminal | len(D): 1053 | init != update: 8
epsilon: 0.1
step: 16 | gamestatus: nonterminal | len(D): 1054 | init != update: 8
epsilon: 0.1
step: 17 | gamestatus: nonterminal | len(D): 1055 | init != update: 26
epsilon: 0.1
step: 18 | gamestatus: nonterminal | len(D): 1056 | init != update: 59
epsilon: 0.1
step: 19 | gamestatus: nonterminal | len(D): 1057 | init != update: 59
epsilon: 0

epsilon: 0.1
step: 107 | gamestatus: nonterminal | len(D): 1145 | init != update: 48
epsilon: 0.1
step: 108 | gamestatus: nonterminal | len(D): 1146 | init != update: 48
epsilon: 0.1
step: 109 | gamestatus: nonterminal | len(D): 1147 | init != update: 48
epsilon: 0.1
step: 110 | gamestatus: nonterminal | len(D): 1148 | init != update: 30
epsilon: 0.1
step: 111 | gamestatus: nonterminal | len(D): 1149 | init != update: 36
epsilon: 0.1
step: 112 | gamestatus: nonterminal | len(D): 1150 | init != update: 14
epsilon: 0.1
step: 113 | gamestatus: nonterminal | len(D): 1151 | init != update: 4
epsilon: 0.1
step: 114 | gamestatus: nonterminal | len(D): 1152 | init != update: 8
epsilon: 0.1
step: 115 | gamestatus: nonterminal | len(D): 1153 | init != update: 12
epsilon: 0.1
step: 116 | gamestatus: nonterminal | len(D): 1154 | init != update: 12
epsilon: 0.1
step: 117 | gamestatus: nonterminal | len(D): 1155 | init != update: 8
epsilon: 0.1
step: 118 | gamestatus: nonterminal | len(D): 1156 | in

step: 205 | gamestatus: nonterminal | len(D): 1243 | init != update: 17
epsilon: 0.1
step: 206 | gamestatus: nonterminal | len(D): 1244 | init != update: 55
epsilon: 0.1
step: 207 | gamestatus: nonterminal | len(D): 1245 | init != update: 58
epsilon: 0.1
step: 208 | gamestatus: nonterminal | len(D): 1246 | init != update: 38
epsilon: 0.1
step: 209 | gamestatus: nonterminal | len(D): 1247 | init != update: 9
epsilon: 0.1
step: 210 | gamestatus: nonterminal | len(D): 1248 | init != update: 6
epsilon: 0.1
step: 211 | gamestatus: nonterminal | len(D): 1249 | init != update: 4
epsilon: 0.1
step: 212 | gamestatus: nonterminal | len(D): 1250 | init != update: 16
epsilon: 0.1
step: 213 | gamestatus: nonterminal | len(D): 1251 | init != update: 6
epsilon: 0.1
step: 214 | gamestatus: nonterminal | len(D): 1252 | init != update: 6
epsilon: 0.1
step: 215 | gamestatus: nonterminal | len(D): 1253 | init != update: 5
epsilon: 0.1
step: 216 | gamestatus: nonterminal | len(D): 1254 | init != update: 9


step: 304 | gamestatus: nonterminal | len(D): 1342 | init != update: 5
epsilon: 0.1
step: 305 | gamestatus: nonterminal | len(D): 1343 | init != update: 6
epsilon: 0.1
step: 306 | gamestatus: nonterminal | len(D): 1344 | init != update: 6
epsilon: 0.1
step: 307 | gamestatus: nonterminal | len(D): 1345 | init != update: 6
epsilon: 0.1
step: 308 | gamestatus: nonterminal | len(D): 1346 | init != update: 6
epsilon: 0.1
step: 309 | gamestatus: nonterminal | len(D): 1347 | init != update: 5
epsilon: 0.1
step: 310 | gamestatus: nonterminal | len(D): 1348 | init != update: 4
epsilon: 0.1
step: 311 | gamestatus: nonterminal | len(D): 1349 | init != update: 4
epsilon: 0.1
step: 312 | gamestatus: nonterminal | len(D): 1350 | init != update: 4
epsilon: 0.1
step: 313 | gamestatus: nonterminal | len(D): 1351 | init != update: 8
epsilon: 0.1
step: 314 | gamestatus: nonterminal | len(D): 1352 | init != update: 8
epsilon: 0.1
step: 315 | gamestatus: nonterminal | len(D): 1353 | init != update: 8
epsil

epsilon: 0.1
step: 403 | gamestatus: nonterminal | len(D): 1441 | init != update: 6
epsilon: 0.1
step: 404 | gamestatus: nonterminal | len(D): 1442 | init != update: 4
epsilon: 0.1
step: 405 | gamestatus: nonterminal | len(D): 1443 | init != update: 0
epsilon: 0.1
step: 406 | gamestatus: nonterminal | len(D): 1444 | init != update: 0
epsilon: 0.1
step: 407 | gamestatus: nonterminal | len(D): 1445 | init != update: 27
epsilon: 0.1
step: 408 | gamestatus: nonterminal | len(D): 1446 | init != update: 33
epsilon: 0.1
step: 409 | gamestatus: nonterminal | len(D): 1447 | init != update: 0
epsilon: 0.1
step: 410 | gamestatus: nonterminal | len(D): 1448 | init != update: 0
epsilon: 0.1
*step:  411 | gamestatus:  terminal | len(D): 1449 | init != update: 0


# EXAMPLE

### Get Frames

In [None]:
# Play a few random steps and keep every other preprocessed frame for inspection.
frames = []
rewards = []
nb_frames = 10
env = gym.make('Breakout-v0')
env.reset()
# The index is actually read below, so it gets a real name; binding `_` here
# would also shadow IPython's "last output" variable for the rest of the session.
for frame_idx in range(nb_frames):
    env.render()
    action = env.action_space.sample()  # take a random action
    observation, reward, done, info = env.step(action)
    rewards.append(reward)
    if done:
        break
    if frame_idx % 2 == 0:  ## sample every other frame
        frames.append(preprocess(observation))

### Show Preprocessed Data Frames

In [None]:
# Draw each collected frame as its own gray-scale figure.
# The loop variable `frame` is read again by the next cell, so its name is kept.
for frame in frames:
    plt.imshow(frame, cmap='gray')
    plt.show()

### Frame Dimensions

In [None]:
# Shape of the last frame left bound by the plotting loop above
frame.shape

# EXPERIMENTAL

In [None]:
# presumably the upper/lower bounds for epsilon annealing (they equal the
# start/stop defaults of epsilon_generator) -- TODO confirm; not used in the visible cells
Max = 1
Min = 0.1

In [None]:
# a generator that yields items instead of returning a list
def firstn(n):
    '''
    Lazily count upward from 0, yielding each integer that is below n.
    '''
    current = 0
    while True:
        if not current < n:
            return
        yield current
        current += 1

In [None]:
# Demo of the generator: prints the integers 0 through 9, one per line
for i in firstn(10):
    print(i)

In [None]:
# a generator that linearly annealed epsilon
def epsilon_generator(start=1, stop=0.1, num=10):
    '''
    a generator that linearly anneals epsilon from start down to stop

    Inputs:
        start = first epsilon value
        stop = final (smallest) epsilon value
        num = number of annealed values to produce (integer)
    Output:
        yields num evenly spaced values running from start to stop, the last
        of which is exactly stop, followed by one extra stop; the generator is
        then exhausted (num + 1 values in total).
        If num < 1 or start < stop, only the single trailing stop is yielded.
    '''
    if num >= 1 and start >= stop:
        if num == 1:
            # a single annealed value: there is no interval to divide up
            yield start
        else:
            step = (start - stop) / (num - 1)
            # derive each value from its index instead of repeatedly subtracting,
            # so rounding error does not accumulate over long schedules
            for i in range(num - 1):
                yield max(stop, start - i * step)
            # the end point is emitted exactly, independent of rounding
            yield stop
    yield stop

In [None]:
# Fresh generator using the defaults (start=1, stop=0.1, num=10)
generator = epsilon_generator()

In [None]:
# Pull 11 values: with num=10 the generator is expected to give 10 annealed
# epsilons plus the one trailing `stop` it yields after its loop ends
for i in range(11):
    tmp = next(generator)
    print(tmp)