In [1]:
import numpy as np
import tensorflow as tf

from tf_agents.environments.py_environment import PyEnvironment
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.trajectories import time_step as ts
from tf_agents.specs.array_spec import BoundedArraySpec,ArraySpec
from tensorflow import TensorSpec
from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
from tf_agents.trajectories.trajectory import Trajectory
from tf_agents.trajectories import trajectory, policy_step
from tf_agents.trajectories.time_step import TimeStep
from tf_agents.environments import utils
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.optimizers import Adam

from ppo_agent import PPOAgent
from yim.helpers.tf_helpers import append_tensor
from yim.helpers.np_help import np_one_hot
from common import whole_paths

# Blackjack Environment

In [2]:
DECK_DIR='deck.npy'
class CardGameEnv(PyEnvironment):
    """Blackjack environment played against a dealer until the deck runs out.

    The deck is loaded from DECK_DIR and used as a length-10 vector of
    remaining card counts: index 0 is scored as 10, index 1 is the ace and
    indices 2-9 are scored at face value.

    Actions: 0 = stand, 1 = hit, 2 = double down.

    Observation tuple (see get_obs):
        0. cards dealt so far, in order, padded with -1 (shape (52,), int32)
        1. player score with aces counted as 1
        2. whether the player holds an ace
        3. dealer score with aces counted as 1
        4. whether the dealer holds an ace
        5. number of cards dealt divided by the initial deck size
    """
    def __init__(self,shuffle_at=51):
        # shuffle_at is accepted for interface compatibility but is unused.
        super(CardGameEnv,self).__init__()
        #self.shuffle_at = np.array(shuffle_at,np.int32)
        # Pristine copy of the deck so every episode can start from a full deck.
        self._initial_deck = np.load(DECK_DIR).astype(np.float32)

        self._action_spec = BoundedArraySpec(shape=(),minimum=0,maximum=2,dtype=np.int32)
        # np.bool_ instead of the np.bool alias, which newer NumPy releases removed.
        self._observation_spec = (ArraySpec(shape=(52,),dtype=np.int32),
                                 ArraySpec(shape=(),dtype=np.int32),
                                 ArraySpec(shape=(),dtype=np.bool_),
                                 ArraySpec(shape=(),dtype=np.int32),
                                 ArraySpec(shape=(),dtype=np.bool_),
                                 BoundedArraySpec(shape=(),minimum=0.0,maximum=1.0,dtype=np.float32))
        self.start_cards = np.sum(self._initial_deck)
        self._restore_deck()

    def _restore_deck(self):
        """Put every card back and forget which cards have been seen."""
        self.deck = self._initial_deck.copy()
        self.card_count = np.array(0,np.int32)
        # NOTE(review): the history holds 52 slots; a deck.npy with more than
        # 52 cards would overflow it in deal_card -- confirm the deck size.
        self.cards_seen = np.zeros(52,np.int32) -1
        self.out_of_cards=False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        """Start a new episode: full deck, empty history, first hand dealt."""
        # Without restoring the deck a second episode would start with
        # out_of_cards still set and terminate on its first step.
        self._restore_deck()
        self.new_hand()
        return ts.restart(self.get_obs())

    def _step(self,action):
        """Apply one action and return the resulting time step.

        Standing and doubling down settle the hand and deal the next one;
        hitting only ends the hand when the player goes over 21. If the deck
        ran out at any point during the step the episode terminates and the
        reward is replaced by 0.
        """
        #0 = stand
        #1 = hit
        #2 = doubledown
        if action==0 or action==2:
            multiplier = 2.0 if action==2 else 1.0
            if action==2:
                # Double down: exactly one more card, then the hand is settled.
                self.deal_card(True)
            if action==2 and self.player_score> 21:
                rew = np.array(-1,np.float32) * multiplier
            else:
                rew = self.stood(multiplier)
            self.new_hand()
        else:
            rew = np.array(0,np.float32)
            self.deal_card(True)
            if self.player_score> 21:
                rew = np.array(-1,np.float32)
                self.new_hand()

        obs = self.get_obs()
        if self.out_of_cards:
            return ts.termination(obs,np.array(0,np.float32))
        return ts.transition(obs,rew)

    def get_obs(self):
        """Return a snapshot of the observation tuple.

        Copies are returned because cards_seen and the score arrays are
        updated in place by deal_card/stood; handing out the live arrays would
        silently change time steps that were already returned.
        """
        return (self.cards_seen.copy(),
                np.array(self.player_score,np.int32),
                np.array(self.player_has_ace,np.bool_),
                np.array(self.dealer_score,np.int32),
                np.array(self.dealer_has_ace,np.bool_),
                np.array((self.card_count).astype(np.float32)/self.start_cards,np.float32))

    def deal_card(self,player=True):
        """Draw one card for the player (player=True) or the dealer.

        The card index is sampled with probability proportional to the
        remaining count of each index. If no cards remain, out_of_cards is set
        and nothing else changes.
        """
        remaining = np.sum(self.deck)
        if remaining<1:
            self.out_of_cards=True
            return
        p = self.deck / remaining
        card = np.squeeze(np.random.choice(10,1,p=p),0)
        self.deck[card]-=1
        self.cards_seen[self.card_count]=card
        self.card_count+=1
        if card==0:
            # Index 0 stands for the ten-valued cards.
            card=10
        if player:
            if card==1:
                self.player_has_ace =True
            self.player_score+=card
        else:
            if card==1:
                self.dealer_has_ace=True
            self.dealer_score+=card

    def new_hand(self):
        """Reset both hands, then deal one card to the dealer and two to the player."""
        self.ended=False
        self.player_score=np.array(0,np.int32)
        self.dealer_score=np.array(0,np.int32)
        self.player_has_ace=False
        self.dealer_has_ace=False
        self.deal_card(False)
        self.deal_card(True)
        self.deal_card(True)

    def stood(self,multiplier):
        """Play out the dealer's hand and return the player's reward.

        Returns +multiplier for a win, -multiplier for a loss and 0 for a tie.
        """
        while not self.ended:
            self.deal_card(False)
            if self.out_of_cards or self.dealer_score>=17:
                self.ended=True
            elif self.dealer_has_ace and 8<self.dealer_score<=11:
                # NOTE(review): with an ace the dealer only stops on raw totals
                # 9-11 (soft 19-21); soft 17 and soft 18 draw again -- confirm
                # this is the intended house rule.
                self.ended=True
        # Count one ace as 11 whenever that does not bust the hand.
        if self.player_has_ace:
            if self.player_score+10<=21:
                self.player_score+=10
        if self.dealer_has_ace:
            if self.dealer_score+10<=21:
                self.dealer_score+=10
        if (self.dealer_score>21) or(self.dealer_score<self.player_score) :
            return np.array(1,np.float32) * multiplier
        if self.dealer_score==self.player_score:
            return np.array(0,np.float32)
        return np.array(-1,np.float32) * multiplier

# Build one environment and run the TF-Agents environment validator on it for
# 10 episodes.
environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=10)

# Visualization of Environment

In [3]:
# Visualization of Environment
def print_info(time_step,startInd=0,endInd=0,action=0):
    """Print the dealer's and player's cards for the hand in progress.

    The cards are recovered from time_step.observation[0], the ordered card
    history in which -1 marks unused slots.

    Args:
        time_step: time step returned by the environment.
        startInd: index in the card history of the current hand's first card
            (the dealer's card).
        endInd: one past the player's last card, measured after the dealer's
            first card has been removed; 0 means "everything dealt so far".
        action: the action that produced time_step; 1 (hit) and 2 (double
            down) mean the player received one more card.

    Returns:
        (startInd, endInd) to pass to the next call, or (None, None) once the
        time step is the last of the episode.
    """
    def to_string(cards):
        # A single card means the dealer's second card is not known yet.
        if len(cards)<2:
            the_str = '?, '+str(cards)
        else:
            the_str = str(cards)
        # Card index 1 is shown as A and index 0 as 10; brackets are stripped.
        return the_str.replace('1','A').replace('[','').replace(']','').replace('0','10')
    newHand=False
    obs = time_step.observation[0]
    # Drop the -1 padding so only cards that were actually dealt remain.
    inds = obs==-1
    obs = obs[~inds]
    
    obs = list(obs)
    dealer_cards = []
    # The first card of a hand is the dealer's; removing it shifts every later
    # index down by one.
    dealer_cards.append(obs.pop(startInd))
    
    if endInd==0:
        endInd=len(obs)
    
    if action==1 or action==2:
        endInd+=1
        
    player_cards = obs[startInd:endInd]
    # Cards beyond the player's mean the hand was settled in this step.
    if len(obs)>endInd:
        

        newHand=True
        if time_step.is_last():
            # Episode over: show every remaining card on the dealer's side.
            dealer_cards.append(obs[endInd:])
        else:
            # The last three cards are treated as the next hand's opening
            # cards; everything before them was drawn by the dealer.
            startInd = endInd
            while startInd<len(obs)-3:
                dealer_cards.append(obs[startInd])
                startInd+=1
            
    
    
        

    print("Dealer:",to_string(dealer_cards))
    print("Player:",to_string(player_cards))
    if newHand:
        print("Reward:",time_step.reward)
        if time_step.is_last():
            print('-'*20)
            print('END OF CARDS')
            print('-'*20)
            return None,None
        print('-'*20)
        print('New Hand')
        print('-'*20)
        # +1 compensates for the dealer card popped above: the recursive call
        # rebuilds the full card list from the time step.
        return print_info(time_step,startInd=startInd+1,endInd=0)
    
    return startInd,endInd



In [4]:
# Start a fresh environment and show the opening hand. startInd/endInd are
# kept so the next cell can continue printing from the same hand.
env = CardGameEnv()
time_step=env._reset()   
startInd,endInd=print_info(time_step,startInd=0,endInd=0)

Dealer: ?, 5
Player: 10, 9


In [5]:
# Play one action by hand; re-run this cell to keep playing the same deck.
action=0 #[0: STAND] [1: HIT] [2:DoubleDown]

time_step=env._step(action)
#print(time_step.observation[0],time_step.observation[5])
# Feed back the indices returned by the previous print_info call.
startInd,endInd=print_info(time_step,startInd=startInd,endInd=endInd,action=action)


Dealer: 5, 6, 7
Player: 10, 9
Reward: 1.0
--------------------
New Hand
--------------------
Dealer: ?, 3
Player: 4, 5


# Results

In [6]:
import os
N_RUNS = 3 #number of evaluation runs; each run plays N_EPISODES episodes
#Directory
ACTOR_CHKP = 'saved_actor'  # directory holding the actor checkpoints
VALUE_CHKP = 'saved_value'  # directory holding the value-net checkpoints
MOD_NAME='model'
#Collection
N_EPISODES = 1
NUM_ACTIONS = 3  # size of the actor's output layer

#ACTOR_CHKP,VALUE_CHKP,DECK_DIR = whole_paths([ACTOR_CHKP,VALUE_CHKP,DECK_DIR])

#==============================================================
# When True, get_action prints scores, value estimate and action probabilities.
PRINTOUT=True
def load_model_chkpoint(dir_name=None,ver_name='None',num_outs=1):#returns the model at given chkpoint
    """Build a BlackJackModel and restore its weights from a checkpoint.

    Args:
        dir_name: directory containing the checkpoints.
        ver_name: checkpoint name inside dir_name (e.g. 'model-56'). None or
            the legacy string 'None' selects the latest checkpoint found in
            dir_name.
        num_outs: size of the model's output layer (1 for the value net,
            number of actions for the actor).

    Returns:
        (model, checkpoint): the model and the tf.train.Checkpoint tracking it.
    """
    # Accept a real None as well as the original 'None' string sentinel;
    # previously ver_name=None fell through to os.path.join and raised.
    if ver_name is None or ver_name =='None':
        chkp_path = tf.train.latest_checkpoint(dir_name)
    else:
        chkp_path = os.path.join(dir_name,ver_name)

    dummy_env = TFPyEnvironment(CardGameEnv())
    time_step = dummy_env.reset()
    model = BlackJackModel(num_outs)
    #initialize model shape by running an observation through
    model(time_step.observation)
    checkpoint = tf.train.Checkpoint(module=model)
    checkpoint.restore(chkp_path)
    return model,checkpoint
def get_env_specs():
    """Return (observation_spec, action_spec) of a TF-wrapped CardGameEnv."""
    wrapped_env = TFPyEnvironment(CardGameEnv())
    obs_spec = wrapped_env.observation_spec()
    ac_spec = wrapped_env.action_spec()
    return obs_spec,ac_spec


class BlackJackModel(tf.Module):
    """Network used for both the actor and the value function.

    The card history goes through a masked LSTM; its output is concatenated
    with one-hot encodings of the player and dealer scores and the dealt-card
    fraction, then passed through dense layers. With num_outs > 1 the output
    is a softmax over actions, otherwise a single linear value.
    """
    def __init__(self,num_outs):
        super(BlackJackModel,self).__init__()
        self.lstm_root= Sequential([
            Masking(mask_value=0.),
            LSTM(64)
            ])

        self.dense1 = Dense(64,activation='relu')
        self._droprate = 0.1
        if num_outs>1:#num_outs 1 for value. num_outs=num_actions for actor
            self.dense2 = Sequential([
                Dense(64,activation='relu'),
                Dense(num_outs,activation='softmax')
                ])
        else:
            self.dense2 = Sequential([
                Dense(64,activation='relu'),
                Dense(num_outs)
                ])
    def preproc_obs(self,observation):#Preprocessing
        """Turn the raw observation tuple into float feature tensors.

        Returns:
            a: one-hot card history (depth 10); -1 padding becomes all-zero
               rows, which the Masking layer skips.
            b, c: 21-wide encodings of the player and dealer scores.
            d: dealt-card fraction with a trailing feature axis.
        """
        def score_one_hot(score,ace):
            # Marks the hard total and, when an ace is held, the total with the
            # ace counted as 11. Totals outside 1..21 encode as zeros. The
            # whole batch is handled at once instead of per element.
            soft = tf.where(tf.cast(ace,tf.bool),score+10,tf.zeros_like(score))
            return tf.maximum(tf.one_hot(score-1,21),tf.one_hot(soft-1,21))

        a = tf.one_hot(observation[0],10,dtype=tf.float32)
        b = score_one_hot(observation[1],observation[2])
        c = score_one_hot(observation[3],observation[4])
        d = tf.expand_dims(tf.cast(observation[5],dtype=tf.float32),-1)
        return a,b,c,d
    def __call__(self,all_obs,is_training=False):
        """Run the network; dropout is applied only when is_training is True."""
        x0,x1,x2,x3 = self.preproc_obs(all_obs)

        x0 = self.lstm_root(x0)
        x_all = tf.concat((x0,x1,x2,x3),axis=-1)
        x_all=self.dense1(x_all)
        if is_training:
            # Pass the rate by keyword: positionally, TF1 reads the second
            # argument as keep_prob, which would drop 90% of the units.
            x_all = tf.nn.dropout(x_all,rate=self._droprate)
        x_all = self.dense2(x_all)

        return x_all

def sample_policy_tf(probac,num_ac=2):
    """Sample one action per batch row from a matrix of action probabilities.

    Args:
        probac: tensor of shape (batch, num_actions) holding probabilities.
        num_ac: unused; kept for interface compatibility.

    Returns:
        int32 tensor of shape (batch,) with the sampled action indices.
    """
    logits = tf.math.log(probac)
    sampled = tf.random.categorical(logits=logits,num_samples=1,dtype=tf.int32)
    # categorical returns (batch, 1); drop the sample axis, not the batch axis.
    # The result is the same as before for batch size 1 and no longer fails
    # for larger batches.
    return tf.squeeze(sampled,-1)
def get_action(the_trainer,the_obs,eval=True):
    """Choose an action for the given observation with the trainer's actor.

    Args:
        the_trainer: agent exposing actor_net and _value_net.
        the_obs: batched observation tuple.
        eval: if True take the most probable action, otherwise sample.

    Returns:
        (action probabilities, int32 action tensor).
    """
    achead = the_trainer.actor_net(the_obs)
    if PRINTOUT:
        # The value estimate is only needed for the printout, so skip the
        # extra forward pass when printing is off.
        val = the_trainer._value_net(the_obs)
        print(the_obs[1].numpy(),the_obs[3].numpy(),val.numpy(),achead.numpy())
    if eval:
        action = tf.argmax(achead,axis=-1)
    else:
        action = sample_policy_tf(achead)
    return achead,tf.cast(action,dtype=tf.int32)
def env_runner(the_trainer):
    """Play one full episode on a fresh environment and return the summed reward."""
    env=TFPyEnvironment(CardGameEnv())
    time_step = env.reset()
    rew=0.0
    while not tf.reduce_all(time_step.is_last()):
        _,action = get_action(the_trainer,time_step.observation)
        time_step = env.step(action)
        # The original multiplied late-deck rewards by 1, which had no effect;
        # that branch and the unused `hands` counter were removed.
        rew += time_step.reward

    return rew
def collection_run(trainer,n_episodes):
    """Play n_episodes episodes and return the mean episode reward."""
    episode_rewards = (env_runner(trainer) for _ in range(n_episodes))
    total = sum(episode_rewards,0.0)
    return total /n_episodes



if __name__=="__main__":
    # Evaluation only: restore both networks, play N_RUNS runs and report the
    # reward of each run and their mean. Everything is pinned to the CPU.
    with tf.device('/CPU:0'):
        #MAIN
        # Both networks are restored from the checkpoint named 'model-56'.
        value_net,checkpoint_val = load_model_chkpoint(dir_name=VALUE_CHKP,num_outs=1,ver_name='model-56')
        actor_net,checkpoint_act = load_model_chkpoint(dir_name=ACTOR_CHKP,num_outs=NUM_ACTIONS,ver_name='model-56')
        optimizer = Adam()
        observation_spec,_ = get_env_specs()
        ppo_trainer = PPOAgent(
                   optimizer=optimizer,
                   actor_net=actor_net,
                   value_net=value_net,
                   observation_spec=observation_spec,
                   num_actions=NUM_ACTIONS)
        all_rews = []
        for n_step in range(N_RUNS):
            #Collect
            rew = collection_run(ppo_trainer,N_EPISODES)
            rew = rew.numpy()
            print("Run",n_step,"Result:",rew)
            all_rews.append(rew)
        print("Avg:",np.mean(all_rews))

W0307 21:38:37.178660  8584 deprecation.py:323] From c:\users\ericy\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\keras\backend.py:3868: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


[2] [10] [[-2.1336157]] [[0.35737604 0.28994608 0.35267782]]
[19] [2] [[-0.22117655]] [[0.8581143  0.09495622 0.04692944]]
[19] [5] [[0.44305655]] [[0.9708848  0.02247752 0.00663763]]
[20] [10] [[-0.3656309]] [[0.93128973 0.05583587 0.01287438]]
[8] [10] [[-1.6748561]] [[0.3675769  0.37231967 0.26010343]]
[15] [10] [[-2.1670158]] [[0.4792729  0.2612255  0.25950167]]
[6] [10] [[-1.6730815]] [[0.4617107  0.23820426 0.3000851 ]]
[20] [4] [[-0.525481]] [[0.9155629  0.0684029  0.01603422]]
[20] [2] [[0.6782062]] [[0.7886682  0.15634415 0.05498765]]
[10] [10] [[0.21357304]] [[0.19042571 0.2826331  0.5269412 ]]
[17] [10] [[0.11500724]] [[0.36174363 0.43504918 0.20320714]]
Run 0 Result: [3.]
[16] [7] [[-1.9615806]] [[0.36137703 0.2822516  0.35637134]]
[19] [10] [[-1.946772]] [[0.82751304 0.11430119 0.05818576]]
[20] [8] [[0.0803768]] [[0.7303263  0.17001581 0.0996579 ]]
[10] [6] [[-0.7833669]] [[0.350575   0.1622716  0.48715338]]
[3] [4] [[-0.32007378]] [[0.26205596 0.3237459  0.41419813]]
[18

# Parameters

In [None]:
N_RUNS = 100 #each run includes collection and model update
#Directory
ACTOR_CHKP = 'model_actor'
VALUE_CHKP = 'model_value'
#Collection
N_EPISODES = 1
# NOTE(review): the environment's action spec allows 0..2 (three actions) but
# the actor is built with two outputs here, so double down can never be
# chosen -- confirm this is intended.
NUM_ACTIONS = 2
#Train
BATCH_SIZE=32  # sequences sampled from the buffer per training batch
N_BATCHES =8   # training batches per run
N_STEPS = 10   # length of each sampled sequence
N_EPOCHS = 15
VALUE_LOSS_COEF= 0.5
ENTROPY_REG_COEF=0.2
LEARNING_RATE = 1e-3
SAVE_INTERVAL=10  # save checkpoints every SAVE_INTERVAL runs

ACTOR_CHKP,VALUE_CHKP = whole_paths([ACTOR_CHKP,VALUE_CHKP])

# Buffer

In [None]:
# Buffer
def get_env_specs():
    """Return the observation and action specs of a TF-wrapped CardGameEnv."""
    tf_env = TFPyEnvironment(CardGameEnv())
    specs = (tf_env.observation_spec(),tf_env.action_spec())
    return specs
def get_tf_buffer(num_outs=2,max_length = 10000):
    """Create an empty uniform replay buffer for CardGameEnv trajectories.

    Args:
        num_outs: length of the policy-info vector stored with each step.
        max_length: maximum number of steps the buffer keeps.

    Returns:
        A TFUniformReplayBuffer with batch size 1.
    """
    obs_spec,ac_spec = get_env_specs()
    step_spec = ts.time_step_spec(obs_spec)

    # Policy info slot: num_outs floats, each bounded to [0, 1].
    lower = np.zeros(num_outs,dtype=np.float32)
    upper = np.ones(num_outs,dtype=np.float32)
    info_spec = BoundedTensorSpec(shape=(num_outs,),
                                  dtype=np.float32,
                                  minimum=lower,
                                  maximum=upper)

    pol_step_spec = policy_step.PolicyStep(ac_spec,info=info_spec)
    traj_spec = trajectory.from_transition(step_spec,pol_step_spec,step_spec)
    return TFUniformReplayBuffer(
        data_spec=traj_spec,
        batch_size=1,
        max_length=max_length)

# Model

In [None]:
def load_model_chkpoint(dir_name=None,num_outs=1):#returns the model at given chkpoint
    """Build a BlackJackModel and restore the latest checkpoint in dir_name.

    Returns:
        (model, checkpoint): the model and the tf.train.Checkpoint tracking it.
    """
    latest_path = tf.train.latest_checkpoint(dir_name)
    first_step = TFPyEnvironment(CardGameEnv()).reset()
    model = BlackJackModel(num_outs)
    # One forward pass creates the variables before they are restored.
    model(first_step.observation)
    chkp = tf.train.Checkpoint(module=model)
    chkp.restore(latest_path)
    return model,chkp

#Preprocessing
def preproc_obs(observation):
    """Turn the raw observation tuple into float feature tensors.

    Returns:
        a: one-hot card history (depth 10); -1 padding becomes all-zero rows.
        b: 21-wide encoding of the player score (observation[1], observation[2]).
        c: 21-wide encoding of the dealer score (observation[3], observation[4]).
    """
    def score_one_hot(score,ace):
        # Marks the hard total and, when an ace is held, the total with the ace
        # counted as 11. Totals outside 1..21 encode as zeros. Vectorised over
        # the batch, so no Python `if` is evaluated on a tensor.
        soft = tf.where(tf.cast(ace,tf.bool),score+10,tf.zeros_like(score))
        return tf.maximum(tf.one_hot(score-1,21),tf.one_hot(soft-1,21))

    a = tf.one_hot(observation[0],10,dtype=tf.float32)
    b = score_one_hot(observation[1],observation[2])
    c = score_one_hot(observation[3],observation[4])
    return a,b,c
class BlackJackModel(tf.Module):
    """Network used for both the actor and the value function.

    The card history goes through a masked LSTM; its output is concatenated
    with the player and dealer score encodings from preproc_obs and passed
    through dense layers. With num_outs > 1 the output is a softmax over
    actions, otherwise a single linear value.
    """
    def __init__(self,num_outs):
        super(BlackJackModel,self).__init__()
        self.lstm_root= Sequential([
            Masking(mask_value=0.),
            LSTM(64)
            ])

        self.dense1 = Dense(64,activation='relu')
        self._droprate = 0.25
        if num_outs>1:#num_outs 1 for value. num_outs=num_actions for actor
            self.dense2 = Sequential([
                Dense(64,activation='relu'),
                Dense(num_outs,activation='softmax')
                ])
        else:
            self.dense2 = Sequential([
                Dense(64,activation='relu'),
                Dense(num_outs)
                ])
    def __call__(self,all_obs,is_training=False):
        """Run the network; dropout is applied only when is_training is True."""
        x0,x1,x2 = preproc_obs(all_obs)
        x0 = self.lstm_root(x0)
        x_all = tf.concat((x0,x1,x2),axis=-1)
        x_all=self.dense1(x_all)
        if is_training:
            # Pass the rate by keyword: positionally, TF1 reads the second
            # argument as keep_prob, which would drop 75% of the units.
            x_all = tf.nn.dropout(x_all,rate=self._droprate)
        x_all = self.dense2(x_all)

        return x_all

# Collection

In [None]:
def sample_policy_tf(probac,num_ac=2):
    """Sample one action per batch row from a matrix of action probabilities.

    Args:
        probac: tensor of shape (batch, num_actions) holding probabilities.
        num_ac: unused; kept for interface compatibility.

    Returns:
        int32 tensor of shape (batch,) with the sampled action indices.
    """
    logits = tf.math.log(probac)
    sampled = tf.random.categorical(logits=logits,num_samples=1,dtype=tf.int32)
    # categorical returns (batch, 1); drop the sample axis, not the batch axis.
    # The result is the same as before for batch size 1 and no longer fails
    # for larger batches.
    return tf.squeeze(sampled,-1)
def get_action(the_model,the_obs,eval=False):
    """Return (action probabilities, int32 action) for the given observation.

    With eval=True the most probable action is taken; otherwise the action is
    sampled from the probabilities.
    """
    achead = the_model(the_obs)
    chosen = tf.argmax(achead,axis=-1) if eval else sample_policy_tf(achead)
    return achead,tf.cast(chosen,dtype=tf.int32)
def env_runner(buffer,model):
    """Play one episode with sampled actions and record it in the buffer.

    Args:
        buffer: replay buffer receiving one trajectory per step.
        model: actor network mapping observations to action probabilities.

    Returns:
        (buffer, summed episode reward).
    """
    env=TFPyEnvironment(CardGameEnv())
    time_step = env.reset()
    rew=0.0
    while not tf.reduce_all(time_step.is_last()):
        # Sample from the policy while collecting. The original passed
        # eval=eval, i.e. the builtin eval function, which is truthy and
        # silently forced greedy argmax actions.
        achead,action = get_action(model,time_step.observation,eval=False)
        next_time_step = env.step(action)
        pol_step = policy_step.PolicyStep(action,info=achead)
        #tfagents ignores reward on terminal timestep...
        #can include it by inserting a dummy step
        if tf.reduce_all(next_time_step.is_last()):
            dummy_time_step=ts.transition(next_time_step.observation,next_time_step.reward)
            traj = trajectory.from_transition(time_step, pol_step, dummy_time_step)
            buffer.add_batch(traj)
            #dummy traj
            traj = trajectory.from_transition(dummy_time_step, pol_step, next_time_step)
            buffer.add_batch(traj)
        else:
            traj = trajectory.from_transition(time_step, pol_step, next_time_step)
            buffer.add_batch(traj)
        time_step = next_time_step
        rew += time_step.reward
    return buffer,rew
def collection_run(model,buffer,n_episodes):
    """Collect n_episodes episodes into the buffer.

    Returns:
        (buffer, mean episode reward).
    """
    episode_rewards = []
    for _ in range(n_episodes):
        buffer,episode_reward = env_runner(buffer,model)
        episode_rewards.append(episode_reward)
    mean_reward = sum(episode_rewards,0.0) /n_episodes
    return buffer,mean_reward

In [None]:
#MAIN
# Restore the latest value and actor checkpoints and wrap them in the PPO
# agent that the training loop below updates.
value_net,checkpoint_val = load_model_chkpoint(dir_name=VALUE_CHKP,num_outs=1)
actor_net,checkpoint_act = load_model_chkpoint(dir_name=ACTOR_CHKP,num_outs=NUM_ACTIONS)
optimizer = Adam(learning_rate = LEARNING_RATE)
observation_spec,_ = get_env_specs()
ppo_trainer = PPOAgent(
               optimizer=optimizer,
               actor_net=actor_net,
               value_net=value_net,
               observation_spec=observation_spec,
               num_actions=NUM_ACTIONS,
               importance_ratio_clipping=0.2,
               lambda_value=0.95,
               discount_factor=0.97,
               entropy_regularization=ENTROPY_REG_COEF,
               value_pred_loss_coef=VALUE_LOSS_COEF,
               num_epochs=N_EPOCHS,
               use_gae=True,
               use_td_lambda_return=True,
               normalize_rewards=True,
               reward_norm_clipping=10.0,
               normalize_observations=False,
               gradient_clipping=None)

In [None]:
def train_ppo_agent(the_experience,the_weights,the_ppo_agent):
    """Run one training call of the agent on a batch and return its loss info."""
    return the_ppo_agent._train(the_experience,the_weights)

import os  # imported here so this cell does not depend on the evaluation cell

# Checkpoint file prefixes. save_dir_val / save_dir_act were used below without
# ever being defined, so the first save raised NameError. The 'model' prefix
# matches the 'model-<n>' checkpoint names loaded elsewhere in this notebook.
save_dir_val = os.path.join(VALUE_CHKP,'model')
save_dir_act = os.path.join(ACTOR_CHKP,'model')

# Uniform per-sample weights; identical for every batch, so built once.
weights = tf.ones((BATCH_SIZE,1),dtype=tf.float32)

for n_step in range(N_RUNS):
    #Collect
    # A fresh buffer per run, so training only sees episodes from the current policy.
    buffer = get_tf_buffer()
    buffer,rew = collection_run(ppo_trainer.actor_net,buffer,N_EPISODES)
    print("Run",n_step,"Result:",rew.numpy())
    #Train
    ds = buffer.as_dataset(num_parallel_calls=2, sample_batch_size=BATCH_SIZE, num_steps=N_STEPS).prefetch(2).repeat(-1)
    iterator = iter(ds)
    for _ in range(N_BATCHES):
        experience,_ = next(iterator)
        loss_info = train_ppo_agent(experience,weights,ppo_trainer)
    if (n_step+1)%SAVE_INTERVAL==0:
        checkpoint_val.save(save_dir_val)
        checkpoint_act.save(save_dir_act)
        print(n_step+1,"VLoss:",loss_info.extra.value_estimation_loss.numpy())