In [16]:
import numpy as np
#import pandas as pd
import tensorflow as tf
import random
import gym
from gym.wrappers import monitoring
from scipy.misc import imresize
import matplotlib.pyplot as plt
import Image
from datetime import datetime

In [2]:
def downsample(observ): 
    obs_resized = observ[30:195,6:154].mean(axis = 2)
    # nearest neighbor interpolation produces a much sharper image
    # than default bilinear
    obs_resized = imresize(obs_resized, size=(IM_SIZE, IM_SIZE)
                         , interp='nearest')
    return obs_resized

In [3]:
class exp_buffer:
    def __init__(self, MIN_EXPERIENCES, MAX_EXPERIENCES):
        self.experience = []
        self.min_length = MIN_EXPERIENCES
        self.max_length = MAX_EXPERIENCES
        self.buffer_length = 0
#        self.batch_size = batch_size
        
    def obs2state(self, previous_state, observ):
        obs_resized = downsample(observ)
        state = np.append(previous_state[1:], np.expand_dims(
                          obs_resized, 0) , axis=0)
        return state
    
    def update(self, (state, action, reward, state_new, done )):
        if self.buffer_length < self.max_length:
            self.experience.append( (state, action, reward, state_new, done) )
            self.buffer_length +=1
        else:     
            self.experience.pop(0)
            self.experience.append( (state, action, reward, state_new, done) )
        
    def get_batch(self, batch_size):
        state_batch=[]
#        sample_idxs = np.random.choice(self.buffer_length, batch_size)
#        for idx in sample_idxs:
#            state_batch.append(self.experience[idx][0])
#            state_batch = np.stack(state_batch, axis = 0)
        samples = random.sample(self.experience, batch_size)
## below zip(*some_list) is to unzip some_list:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(
            np.array, zip(*samples))
        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

In [4]:
class DQN:
    def __init__(self, K, conv_sizes, fc_sizes, session, scope):
        self.eps_greedy = 0.1
        self.K = K
        self.scope = scope
        self.sess = session
        self.gamma = 0.9
        
        with tf.variable_scope(scope):
            self.action = tf.placeholder(tf.int32, shape = (None,), name = 'action' )
            self.G = tf.placeholder(tf.float32, shape = (None,), name = 'G') 
            self.X = tf.placeholder(tf.float32, shape = (None, 4, IM_SIZE, IM_SIZE),
                                    name = 'X' )
            # tensorflow convolution needs the order to be:
            # (num_samples, height, width, "color")
            # so we need to tranpose later
            Z = self.X/255.0    
            Z = tf.transpose(Z, [0,2,3,1])

            for num_output_filters, filtersz, poolsz in conv_sizes:
                Z = tf.contrib.layers.conv2d(
                Z,
                num_output_filters,
                filtersz,
                poolsz,
                activation_fn=tf.nn.relu
                )
            
            Z = tf.contrib.layers.flatten(Z)
        
            for sizes in fc_sizes:
                Z = tf.contrib.layers.fully_connected(Z, sizes)
#            for sizes in fc_sizes:
#                Z = tf.contrib.layers.fully_connected(Z, sizes, activation_fn=tf.nn.relu)
            
            
            self.predict_op = tf.contrib.layers.fully_connected(Z, K)

            values_selected  = tf.reduce_sum(
              self.predict_op * tf.one_hot(self.action, K),
              reduction_indices=[1]      )
#        values_selected = []
#        for idx in xrange(self.X.shape[0]):
#            values_selected.append(self.predict_op[idx][self.action[idx]]) 
            

            
#            cost = np.mean(np.square(self.G - values_selected), axis = 0)
            cost = tf.reduce_mean(tf.square(self.G - values_selected))        
            self.train_op = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6).minimize(cost)
            self.cost = cost
        
        
    def select_action(self, state, epsilon):
        if np.random.uniform(0,1) > epsilon: 
            action = np.argmax( self.predict([state])[0] )  ## why a "[0]" at the end??      
        else:
            action = np.random.choice(self.K)
        return action  
    
    def update(self, state_batch, action_batch, target_batch):       
        self.sess.run(self.train_op, feed_dict={
               self.X: state_batch, self.action: action_batch,
               self.G: target_batch })
#        return self.cost
        
    def copy_from(self, other_model):
        params_target = [parms for parms in tf.trainable_variables() if parms.name.startswith(
                self.scope)]
        params_target = sorted(params_target, key=lambda v: v.name)
        params_source = [parms for parms in tf.trainable_variables() if parms.name.startswith(
                other_model.scope)]
        params_source = sorted(params_source, key=lambda v: v.name)
         
        copy_ops = []
        for p,q in zip(params_target, params_source):
            params_to_move = self.sess.run(q)
            copy_op = p.assign(params_to_move)
            copy_ops.append(copy_op)
            
        self.sess.run(copy_ops)
            
    def predict(self, states):
        predictions = self.sess.run(self.predict_op, feed_dict={
                                                    self.X: states})
        return predictions          

In [5]:
def one_round(env, model, model_supp, buff, SAR_cycles, gamma,
              eps, eps_delta, eps_min, batch_size ):

    t0 = datetime.now()    
    total_time_training = 0
    num_steps_in_episode = 0
#    lost = None    
    
    one_round_return = 0.
    observ = env.reset()
    obs_resized = downsample(observ)
    state = np.stack([obs_resized]*4, axis = 0)
    
#    assert(state.shape == (4, 80, 80))

    done = False
    
    while not done:
        action = model_supp.select_action(state, eps)
        observ_new, reward, done, _ = env.step(action)  
        one_round_return += reward

        state_new = buff.obs2state(state, observ_new)
        buff.update((state, action, reward, state_new, done))
        state = state_new
       
    
        
        t0_2 = datetime.now()
        
        ## This buff.get_batch() will grab different batches, 
        ## each with (batch_size) samples.
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = buff.get_batch(
        batch_size)
        
        Q_next_batch = model.predict(next_state_batch)
        max_Q_next_batch = np.amax(Q_next_batch, axis=1)
        target_batch = reward_batch + np.invert(done_batch).astype(np.float32)*gamma*max_Q_next_batch 
        model_supp.update(state_batch, action_batch, target_batch) 
#        cost = model_supp.update(state_batch, action_batch, target_batch)        
#        loss = learn(model, target_model, experience_replay_buffer, gamma, batch_size)
        dt = datetime.now() - t0_2
        total_time_training += dt.total_seconds()
        
        num_steps_in_episode += 1    
            
    
            
        if SAR_cycles%model_update_period == 0:
            model.copy_from(model_supp)
            print("Copy params from model_supp. SAR_cycles = %s, period = %s" % (SAR_cycles, model_update_period))
        
        SAR_cycles += 1
        eps = max(eps - eps_delta, eps_min)
    
    return one_round_return, eps, SAR_cycles

In [6]:
if __name__ == '__main__':
#def main():
    env = gym.make('Breakout-v0')
    IM_SIZE = 80
    session = tf.Session()
    conv_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
    fc_sizes = [512]
    gamma = 0.99


#    episode_rewards = np.zeros(num_episodes)
    
    K = 4 #env.action_space.n
    MIN_EXPERIENCES = 50000
    MAX_EXPERIENCES = 500000
    num_episodes = 3000  #10000
    SAR_cycles = 0
    model_update_period = 10000
    batch_size = 32
    
    eps = 1.0
    eps_min = 0.1
    eps_delta = (eps - eps_min) / 500000
    
    
#    model = DQN(
#      K=K,
#      conv_layer_sizes=conv_layer_sizes,
#      hidden_layer_sizes=hidden_layer_sizes,
#      gamma=gamma,
#      scope= "model")
    
    model = DQN(K, conv_sizes, fc_sizes, session, scope = 'model')
    model_supp = DQN(K, conv_sizes, fc_sizes, session, scope = 'model_supp')
    buff = exp_buffer(MIN_EXPERIENCES, MAX_EXPERIENCES)
    

    #sess.run(tf.global_variables_initializer())
    session.run(tf.initialize_all_variables())
    
    
#    round_length = []
    observ = env.reset()
    obs_resized = downsample(observ)
    state = np.stack([obs_resized]*4, axis = 0)
    
    for iters in xrange(MIN_EXPERIENCES ):
        action = np.random.choice(K)
        observ_new, reward, done, _ = env.step(action)          
        state_new = buff.obs2state(state, observ_new)
        buff.update((state, action, reward, state_new, done))
        
        if done:      
            observ = env.reset()
            obs_resized = downsample(observ)
            state = np.stack([obs_resized]*4, axis = 0)    
        else:
            state = state_new
        
#        if iters%20 == 0:
#            print(iters, buff.buffer_length)

# total_t, episode_reward, duration, 
# num_steps_in_episode, time_per_step, epsilon 
# = play_one(  env, total_t,  experience_replay_buffer,
#              model, target_model, gamma,  batch_sz,
#              epsilon,  epsilon_change,  epsilon_min,      )

    all_returns = [] 
    for iters in xrange(num_episodes):       
        one_round_return, eps, SAR_cycles = one_round(env, model, 
                              model_supp, buff, SAR_cycles, gamma,
                              eps, eps_delta, eps_min, batch_size )

        all_returns.append( one_round_return )
        
        if iters%100 == 0:
            last_100_avg = np.mean(all_returns[max(0, iters-100):iters+1])
            print(iters, last_100_avg)  
    
#    print(all_returns[:5])


#            last_100_avg = episode_rewards[max(0, i - 100):i + 1].mean()
#            print("Episode:", i,
#              "Duration:", duration,
#              "Num steps:", num_steps_in_episode,
#              "Reward:", episode_reward,
#              "Training time per step:", "%.3f" % time_per_step,
#              "Avg Reward (Last 100):", "%.3f" % last_100_avg,
#              "Epsilon:", "%.3f" % epsilon
#            )
#            sys.stdout.flush()

[2017-07-21 23:55:06,998] Making new env: Breakout-v0


Copy params from model_supp. SAR_cycles = 0, period = 10000
(0, 1.0)
Copy params from model_supp. SAR_cycles = 10000, period = 10000
Copy params from model_supp. SAR_cycles = 20000, period = 10000
(100, 1.0297029702970297)
Copy params from model_supp. SAR_cycles = 30000, period = 10000
Copy params from model_supp. SAR_cycles = 40000, period = 10000
(200, 1.4554455445544554)
Copy params from model_supp. SAR_cycles = 50000, period = 10000
Copy params from model_supp. SAR_cycles = 60000, period = 10000
Copy params from model_supp. SAR_cycles = 70000, period = 10000
(300, 1.3762376237623761)
Copy params from model_supp. SAR_cycles = 80000, period = 10000
Copy params from model_supp. SAR_cycles = 90000, period = 10000
(400, 1.1386138613861385)
Copy params from model_supp. SAR_cycles = 100000, period = 10000
Copy params from model_supp. SAR_cycles = 110000, period = 10000
Copy params from model_supp. SAR_cycles = 120000, period = 10000
(500, 1.1485148514851484)
Copy params from model_supp. S

In [228]:
#if __name__ == '__main__':
#    main()

In [12]:
np.mean(all_returns[2900:])


19.710000000000001

In [9]:
len(all_returns)

3000

In [50]:
env = gym.make('Breakout-v0')
name = 'openai_gym_vedios/breakout/02'
env = monitoring.Monitor(env, name , force=True)

observ = env.reset()
obs_resized = downsample(observ)
state = np.stack([obs_resized]*4, axis = 0)
 
done = False
    
while not done:
    env.render()
    action = model_supp.select_action(state, eps)
    observ_new, reward, done, _ = env.step(action)  
    state_new = buff.obs2state(state, observ_new)
    state = state_new
 

[2017-07-22 05:56:56,656] Making new env: Breakout-v0
[2017-07-22 05:56:56,690] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/ny/openai_gym_vedios/breakout/02')
[2017-07-22 05:56:56,695] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-07-22 05:56:56,705] Starting new video recorder writing to /home/ny/openai_gym_vedios/breakout/02/openaigym.video.33.4286.video000000.mp4


## TEST main()

In [199]:
logic = True
for i in xrange(IM_SIZE):
    for j in xrange(IM_SIZE):
        logic = logic and (buff.experience[buff.buffer_length-10][3][0][i][j]==buff.experience[buff.buffer_length-1][3][0][i][j])
        
print logic
        

False


In [26]:
img = Image.fromarray(obs_resized)
#img = Image.fromarray(observ)
img.show()

In [27]:
print(observ.shape)
print(type(obs_resized))

(210, 160, 3)
<type 'numpy.ndarray'>


In [31]:
np.random.uniform(0,1)
np.random.choice(4)
np.argmax([1,2,4,16,8,1])

3