In [3]:
import numpy as np
import tensorflow as tf
import os
import gym
env = gym.make("Blackjack-v0")


In [4]:
class Agent:
    def __init__(self, num_actions, state_size):
         
        initializer = tf.contrib.layers.xavier_initializer()
         
        self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
         
        # Neural net starts here
         
        hidden_layer = tf.layers.dense(self.input_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_2 = tf.layers.dense(hidden_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
         
        # Output of neural net
        out = tf.layers.dense(hidden_layer_2, num_actions, activation=None)
         
        self.outputs = tf.nn.softmax(out)
        self.choice = tf.argmax(self.outputs, axis=1)
         
        # Training Procedure
        self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)
         
        one_hot_actions = tf.one_hot(self.actions, num_actions)
         
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)
         
        self.loss = tf.reduce_mean(cross_entropy * self.rewards)
         
        self.gradients = tf.gradients(self.loss, tf.trainable_variables())
         
        # Create a placeholder list for gradients
        self.gradients_to_apply = []
        for index, variable in enumerate(tf.trainable_variables()):
            gradient_placeholder = tf.placeholder(tf.float32)
            self.gradients_to_apply.append(gradient_placeholder)
         
        # Create the operation to update gradients with the gradients placeholder.
        optimizer = tf.train.AdamOptimizer()
        self.update_gradients = optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))


In [6]:
# TODO Create the discounted and normalized rewards function
discount_rate = 0.95


def discount_normalize_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards)
    total_rewards = 0

    for i in reversed(range(len(rewards))):
        total_rewards = total_rewards * discount_rate + rewards[i]
        discounted_rewards[i] = total_rewards

    return discounted_rewards


def format_state(state):
    return [state[0] / 21] + list(np.eye(10)[state[1] - 1]) + [int(state[2])]


In [None]:

# TODO Create the training loop
tf.reset_default_graph()
 
# Modify these to match shape of actions and states in your environment
num_actions = 2
state_size = 12
 
path = "./bj-pg/"
 
training_episodes = 10000
max_steps_per_episode = 10
episode_batch_size = 5
 
loading = False

agent = Agent(num_actions, state_size)
 
init = tf.global_variables_initializer()
 
saver = tf.train.Saver(max_to_keep=2)

 
if not os.path.exists(path):
    os.makedirs(path)
 
with tf.Session() as sess:
    if not loading:
        sess.run(init)
    else:
        checkpoint = tf.train.get_checkpoint_state(path)
        saver.restore(sess,checkpoint.model_checkpoint_path)
     
    total_episode_rewards = []
     
    # Create a buffer of 0'd gradients
    gradient_buffer = sess.run(tf.trainable_variables())
    for index, gradient in enumerate(gradient_buffer):
        gradient_buffer[index] = gradient * 0
 
    for episode in range(1, training_episodes):
 
        state = env.reset()
        state = format_state(state)
        
        episode_history = []
        episode_rewards = 0
         
        for step in range(1,max_steps_per_episode):
             
             
            # Get weights for each action
            action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
            action_choice = np.argmax(action_probabilities)
             
            state_next, reward, done, _ = env.step(action_choice)
            episode_history.append([state, action_choice, reward, state_next])
            state = state_next
            state = format_state(state_next)
             
            episode_rewards += reward
             
            if done or step + 1 == max_steps_per_episode:
                total_episode_rewards.append(episode_rewards)
                episode_history = np.array(episode_history)
                episode_history[:,2] = discount_normalize_rewards(episode_history[:,2])
                 
                ep_gradients = sess.run(agent.gradients, feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
                                                                    agent.actions: episode_history[:, 1],
                                                                    agent.rewards: episode_history[:, 2]})
                # add the gradients to the grad buffer:
                for index, gradient in enumerate(ep_gradients):
                    gradient_buffer[index] += gradient
                 
                break
             
        if episode % episode_batch_size == 0:
         
            feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))
             
            sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)
             
            for index, gradient in enumerate(gradient_buffer):
                gradient_buffer[index] = gradient * 0
                 
        if episode % 1000 == 0:
            saver.save(sess, path + "pg-checkpoint", episode)
            print("Average reward / 1000 eps: " + str(np.mean(total_episode_rewards[-1000:])))
            
    print("TOTAL AVERAGE REWARD:" + str(np.mean(total_episode_rewards)))


In [9]:
with tf.Session() as sess:
    checkpoint = tf.train.get_checkpoint_state(path)
    saver.restore(sess,checkpoint.model_checkpoint_path)
        
    for episode in range(10):
 
        state = env.reset()
        
        state = format_state(state)
        
        episode_history = []
        episode_rewards = 0
        print("new Game")
        for step in range(10):
            print(state)
            print("ai sum:" +str(state[0]*21)+ " dealer sum: "+str(bj.sum_hand(env.dealer)))

            # Get weights for each action
            action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
            print("hit prob")
            print(action_probabilities[0][1])
            print("stay prob")
            print(action_probabilities[0][0])
            action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])
            print("action choice")
            choice_interpret = ['stay', 'hit']
            print(choice_interpret[int(action_choice)])
             
            state_next, reward, done, _ = env.step(action_choice)
            episode_history.append([state, action_choice, reward, state_next])
            state = state_next
            #if step == 9:
            print("ai sum:" +str(state[0])+ " dealer sum: "+str(bj.sum_hand(env.dealer)))
            state = format_state(state_next)
            

            result_interpret = ["loss", "tie", "win"]
            episode_rewards += reward
            
            
            print()
            if done or step + 1 == max_steps_per_episode:
                print("reward:")
                print(reward)
                print(result_interpret[int(reward)+1])
                total_episode_rewards.append(episode_rewards)
                episode_history = np.array(episode_history)
                episode_history[:,2] = discount_normalize_rewards(episode_history[:,2])
                 
                break

INFO:tensorflow:Restoring parameters from ./bj-pg/pg-checkpoint-9000


new Game
[0.9523809523809523, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0]
ai sum:20.0 dealer sum: 18
hit prob
1.0349092e-14
stay prob
1.0
action choice
stay
ai sum:20 dealer sum: 18

reward:
1.0
win
new Game
[0.7142857142857143, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0]
ai sum:15.0 dealer sum: 17
hit prob
0.0033404483
stay prob
0.9966595
action choice
stay
ai sum:15 dealer sum: 17

reward:
-1.0
loss
new Game
[0.9523809523809523, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]
ai sum:20.0 dealer sum: 7
hit prob
7.7074544e-20
stay prob
1.0
action choice
stay
ai sum:20 dealer sum: 17

reward:
1.0
win
new Game
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1]
ai sum:21.0 dealer sum: 20
hit prob
1.2035026e-11
stay prob
1.0
action choice
stay
ai sum:21 dealer sum: 20

reward:
1.0
win
new Game
[0.42857142857142855, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]
ai sum:9.0 dealer sum: 13
hit prob
0.9999927
stay prob
7.2665894e-06
action choice
hit
ai sum