In [None]:
import tensorflow as tf
from tensorflow import keras

from collections import deque
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import random
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [None]:
def Qfunction(input_nodes, hidden_layer_size, output_nodes, opt):
    """Build and compile a fully connected Keras network used as a Q-function.

    Layer layout, in order:
      * two Dense layers of width ``input_nodes`` (the first one declares
        ``input_dim=input_nodes``),
      * one Dense layer per entry of ``hidden_layer_size``,
      * a final Dense layer of width ``output_nodes``.

    Every layer, including the last one, uses the SELU activation with the
    ``lecun_normal`` initializer.
    NOTE(review): SELU on the output layer bounds the predicted values from
    below; confirm this is intended for Q-value regression.

    Args:
        input_nodes: size of the observation vector.
        hidden_layer_size: sequence of hidden layer widths.
        output_nodes: number of outputs (one per action).
        opt: optimizer handed to ``model.compile``.

    Returns:
        The compiled ``keras.models.Sequential`` model (MSE loss, MAE metric).
    """
    dense_options = {'activation': 'selu', 'kernel_initializer': 'lecun_normal'}

    network = keras.models.Sequential()
    # The first layer is the only one that declares the input dimension.
    network.add(keras.layers.Dense(input_nodes, input_dim=input_nodes, **dense_options))

    remaining_widths = [input_nodes] + list(hidden_layer_size) + [output_nodes]
    for width in remaining_widths:
        network.add(keras.layers.Dense(width, **dense_options))

    network.compile(loss='mse', optimizer=opt, metrics=['mae'])
    return network

def Qtest(input_nodes, hidden_layer_size, output_nodes, opt):
    """Build and compile a Q-network with the same architecture as ``Qfunction``.

    This used to be a line-for-line copy of ``Qfunction``; it now delegates to
    it so the two builders cannot silently diverge.

    Args:
        input_nodes: size of the observation vector.
        hidden_layer_size: sequence of hidden layer widths.
        output_nodes: number of outputs (one per action).
        opt: optimizer handed to ``model.compile``.

    Returns:
        The compiled Keras model returned by ``Qfunction``.
    """
    return Qfunction(input_nodes, hidden_layer_size, output_nodes, opt)


In [None]:
# Wrapper class for training Qfunction and updating weights (target network) 

class DQN(object):
    """Thin wrapper around a Keras Q-network.

    Provides Q-value evaluation, a single gradient step on a mean-squared
    error between predicted and target Q-values, and a weight copy from
    another ``DQN`` (used to refresh a target network).
    """

    def __init__(self, obssize, actsize, hidden_dims, optimizer):
        """Create the underlying network via ``Qfunction``.

        Args:
            obssize: size of the observation vector (network input width).
            actsize: number of actions (network output width).
            hidden_dims: sequence of hidden layer widths.
            optimizer: optimizer used both to compile the model and to apply
                gradients in ``train``.
        """
        self.obssize = obssize
        self.actsize = actsize
        self.optimizer = optimizer
        self.qfunction = Qfunction(obssize, hidden_dims, actsize, optimizer)

    def _predict_q(self, states, actions):
        """Return Q(s, a) for each (state, action) pair of the batch."""
        all_q = self.compute_Qvalues(states)
        # One [row, column] index per sample: row = position in the batch,
        # column = the action taken in that sample.
        row_action_pairs = [[row, actions[row]] for row in range(len(actions))]
        return tf.gather_nd(all_q, row_action_pairs)

    def _loss(self, Qpreds, targets):
        """Mean squared difference between predicted and target Q-values."""
        td_error = Qpreds - targets
        return tf.math.reduce_mean(tf.square(td_error))

    def compute_Qvalues(self, states):
        """Run the network on ``states`` and return its outputs.

        ``states`` must expose ``.astype``; it is cast to float32 and promoted
        to at least two dimensions, so a single observation is treated as a
        batch of one.
        """
        as_float = states.astype('float32')
        batch = tf.experimental.numpy.atleast_2d(as_float)
        return self.qfunction(batch)

    @tf.function(jit_compile=True)
    def train(self, states, actions, targets):
        """Apply one optimizer step towards ``targets`` and return the loss.

        Args:
            states: batch of observations.
            actions: action index taken for each observation.
            targets: target Q-value for each (state, action) pair.
        """
        with tf.GradientTape() as tape:
            batch_loss = self._loss(self._predict_q(states, actions), targets)
        weights = self.qfunction.trainable_variables
        grads = tape.gradient(batch_loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))
        return batch_loss

    def update_weights(self, from_network):
        """Copy the trainable variables of ``from_network`` into this network.

        Variables are paired positionally, so both networks are expected to
        have the same architecture.
        """
        source_vars = from_network.qfunction.trainable_variables
        own_vars = self.qfunction.trainable_variables

        for src, dst in zip(source_vars, own_vars):
            dst.assign(src)

In [None]:
class ReplayBuffer(object):
    """FIFO store of experiences with a soft capacity of ``maxlength``.

    ``number`` mirrors the count of stored experiences; once it exceeds
    ``maxlength`` the oldest entries are dropped.
    """

    def __init__(self, maxlength):
        """Create an empty buffer that keeps at most ``maxlength`` items."""
        self.maxlength = maxlength
        self.number = 0
        self.buffer = deque()

    def append(self, experience):
        """Add ``experience`` as the newest entry, evicting old ones if full."""
        self.buffer.append(experience)
        self.number += 1
        if self.number > self.maxlength:
            self.pop()

    def pop(self):
        """Drop the oldest entries until the buffer fits within ``maxlength``."""
        over_capacity = self.number > self.maxlength
        while over_capacity:
            self.buffer.popleft()
            self.number -= 1
            over_capacity = self.number > self.maxlength

    def sample(self, batchsize):
        """Return ``batchsize`` distinct experiences chosen uniformly at random.

        Raises whatever ``np.random.choice`` raises when ``batchsize`` exceeds
        the number of stored experiences (sampling is without replacement).
        """
        positions = np.random.choice(len(self.buffer), batchsize, replace=False)
        batch = []
        for position in positions:
            batch.append(self.buffer[position])
        return batch
        

In [None]:
import importlib
from Agent import Agent
from tensorflow.python.ops.numpy_ops import np_config
import Game2
importlib.reload(Game2)
from Game2 import Game
np_config.enable_numpy_behavior()

# --- Hyperparameters ---------------------------------------------------------
lr = 1e-5            # Adam learning rate
batchsize = 64       # transitions sampled from the replay buffer per gradient step
maxlength = 100000   # replay buffer capacity
tau = 1000           # copy Qprincipal -> Qtarget every `tau` environment steps
initialize = 5000    # environment steps collected before gradient updates begin
epsilon = .1         # lower bound of the exploration probability
gamma = .95          # discount factor
hidden_dims=[128, 256, 512, 512, 265, 64]  # NOTE(review): 265 may be a typo for 256 - confirm before changing
nstep = 6            # length of the n-step return window
episodes = 200000    # maximum number of training episodes

optimizer = keras.optimizers.Adam(learning_rate=lr)

# --- Environment, networks and bookkeeping -----------------------------------
game = Game(9)
obssize = 120
actsize = 54
Qprincipal = DQN(obssize, actsize, hidden_dims, optimizer)
Qtarget = DQN(obssize, actsize, hidden_dims, optimizer)
buffer = ReplayBuffer(maxlength)
Train_start = True   # True until the first gradient step has been announced

rrecord = []         # per-episode sum of rewards
totalstep = 0        # agent moves taken across all episodes

wins = []            # 1 per episode won by player 0, else 0
unif_agent = Agent()
maxWinRate = 0
baseline = 1
testWinSet = []      # evaluation win rates (%), one entry per evaluation run


def greedy_valid_action(q_values, valid_cards, valid_stones, n_actions):
    """Return the index of the best-valued action among the valid ones.

    Action ``a`` is treated as valid when ``a % 6`` is in ``valid_cards`` and
    ``a // 6`` is in ``valid_stones``. Invalid actions are excluded by setting
    their value to -inf. Multiplying the Q-values by a 0/1 mask instead would
    make an invalid action (value 0) win whenever every valid Q-value is
    negative.

    Args:
        q_values: Q-values for one observation; any shape holding exactly
            ``n_actions`` entries (it is flattened).
        valid_cards: container of currently playable card indices.
        valid_stones: container of currently playable stone indices.
        n_actions: size of the action space.

    Returns:
        The arg-max action index (index 0 if no action is valid).
    """
    mask = np.array([(a % 6 in valid_cards) and (a // 6 in valid_stones) for a in range(n_actions)])
    flat_q = np.asarray(q_values).astype('float64').reshape(-1)
    return np.argmax(np.where(mask, flat_q, -np.inf))


# Discount applied to each position of the n-step reward window.
discounts = gamma ** np.arange(nstep)

for ite in range(episodes):

    obs = game.reset(display=0)
    done = False
    rsum = 0
    gameStep = 0
    # Sliding windows over the last `nstep` agent moves of this episode.
    stateDeque = []
    actionDeque = []
    rewardDeque = []
    # Exploration decays linearly over the first 30% of the episodes down to `epsilon`.
    explore_prob = max(epsilon,1-(1-epsilon)*ite/episodes/0.3)
    while not done:
        totalstep += 1
        gameStep += 1
        if np.random.rand() < explore_prob:
            action = random.choice(range(actsize))
        else:
            Q = Qprincipal.compute_Qvalues(obs)
            action = greedy_valid_action(Q, game.validCard[game.player], game.validStone[game.player], actsize)

        obs_tmp, reward, done = game.step(action)
        if game.done:
            # The game ended on the agent's own move.
            # NOTE(review): the +50 bonus below is only granted when the game
            # ends on the opponent's move; confirm a win on the agent's own
            # move is already rewarded by game.step.
            wins.append(1 if game.winner == 0 else 0)
            obs_post = obs_tmp
        else:
            # The uniform-random opponent replies before the next agent move.
            op_action = unif_agent.action(validCard=game.validCard[game.player], validStone=game.validStone[game.player])
            obs_post, _, done = game.step(op_action)
            if game.done:
                if game.winner == 0:
                    wins.append(1)
                    reward += 50
                else:
                    wins.append(0)

        # Slide the n-step windows: drop the oldest entry once they are full.
        if gameStep > nstep:
            stateDeque.pop(0)
            actionDeque.pop(0)
            rewardDeque.pop(0)
        stateDeque.append(obs)
        actionDeque.append(action)
        rewardDeque.append(reward)

        if gameStep >= nstep:
            # Store a *copy* of the reward window. The window list is mutated on
            # every step, so storing the list itself would make every transition
            # of the episode share (and later see) the same rewards.
            if done:
                # Flush the tail of the episode: each remaining state gets the
                # rewards that follow it, padded with zeros.
                for ii in range(nstep):
                    buffer.append((stateDeque[ii], actionDeque[ii], list(rewardDeque), done, obs_post))
                    rewardDeque.pop(0)
                    rewardDeque.append(0)
            else:
                buffer.append((stateDeque[0], actionDeque[0], list(rewardDeque), done, obs_post))

        rsum += reward

        obs = obs_post

        if totalstep > initialize:
            if Train_start:
                print('start training')
                Train_start = False
            samples = buffer.sample(batchsize)
            states = np.array([sample[0] for sample in samples])
            actions = np.array([sample[1] for sample in samples])
            rewards = np.array([sample[2] for sample in samples])
            dones = np.array([sample[3] for sample in samples])
            states_post = np.array([sample[4] for sample in samples])
            # Discounted sum of the n-step reward window of every sample.
            G = rewards @ discounts

            # Double DQN: the online network picks the next action, the target
            # network evaluates it.
            q_values = Qprincipal.compute_Qvalues(states_post)
            actions_post = tf.argmax(q_values, axis=1)
            q_values_next_state = Qtarget.compute_Qvalues(states_post)
            double_q = tf.gather_nd(q_values_next_state, [[i, actions_post[i]] for i in range(len(actions_post))])
            target = G + gamma**nstep * double_q * (1-dones)
            Qprincipal.train(states, actions, target)

            if totalstep % tau == 0:
                Qtarget.update_weights(Qprincipal)
    rrecord.append(rsum)
    disp_number = 50
    if ite % disp_number == 0:
        print('iteration {},  win rate {}, reward {}'.format(ite, int(np.mean(wins[-disp_number:])*100), np.mean(rrecord[-disp_number:])))
    if ite > 100:
        ave100 = np.mean(wins[-100:])
        if ave100 > 0.9:
            Qprincipal.qfunction.save('models/model_solved')
            print("Solved after %d episodes."%ite)
            break
        elif ite % 1000 == 0:
            # Periodic checkpoint plus a greedy evaluation against the random opponent.
            Qprincipal.qfunction.save('models/model_'+str(ite))
            test_model = DQN(obssize, actsize, hidden_dims, optimizer)
            test_model.qfunction.set_weights(Qprincipal.qfunction.get_weights())
            n_test_games = 1000
            test_won = 0
            for jj in range(n_test_games):
                # Start every evaluation game from its own initial observation
                # rather than a stale one left over from training.
                obs = game.reset(display=0)
                done = False
                while not done:
                    if game.player == 0:
                        Q = test_model.compute_Qvalues(obs)
                        action = greedy_valid_action(Q, game.validCard[game.player], game.validStone[game.player], actsize)
                    else:
                        action = unif_agent.action(validCard=game.validCard[game.player], validStone=game.validStone[game.player])
                    obs, _, done = game.step(action)
                if game.winner == 0:
                    test_won += 1
            testWins = 100.0 * test_won / n_test_games
            print('test win rate (%): ', testWins)
            testWinSet.append(testWins)
Qprincipal.qfunction.save('models/model_'+str(ite+1))

In [None]:
# Rolling win rate over the most recent `window` training episodes.
# The slice start is clamped at 0: a negative start such as wins[-50:50] would
# wrap around and yield an empty or misaligned window (NaN) for the first
# `window` points.
window = 100
x = [i+1 for i in range(len(wins))]
wr = [np.mean(wins[max(0, i+1-window):i+1]) for i in range(len(wins))]
plt.plot(x, wr)
plt.xlabel('episodes')
plt.ylabel('win rate')
plt.show()