In [1]:
import gym, pickle, numpy as np
import backgammonenv

# Game constants shared by the cells below.
# NOTE(review): the exact meaning of the board-related values is defined by
# backgammonenv, which is not visible here -- confirm against that package.
TOT_BOARD_PTS = 24                        # getHash flattens the board to TOT_BOARD_PTS + 3 entries
WHITE,BLACK = 0,1                         # values assigned to env.playerTurn / each agent's `turn`
ALL_CHECKERS = 15
WHITE_HOME,BLACK_HOME,BAR_IND = 25,26,0   # presumably indices into the env's board array -- TODO confirm

def roll_dice():
    '''
    Roll two six-sided dice.

    Returns:
        tuple: Two independent integers, each drawn uniformly from 1..6.
    '''
    # np.random.randint's upper bound is exclusive, hence 7.
    return (np.random.randint(1,7), np.random.randint(1,7))

In [2]:
import sys,os

# sys.warnoptions holds the -W options given to the interpreter; when there are
# none, hide every warning so it does not clutter the notebook output.
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter("ignore")
# Absolute path of the working directory; QAgentPlayer.savePolicy writes below it.
currentdir = os.path.abspath(os.getcwd())

## Player Class

Default superclass that lays out the default functions shared by both human and agent players.

In [3]:
class Player:
    '''
    Base player: declares the hooks a player exposes to the game loop.

    Every hook here is a no-op that returns None; subclasses override
    the ones they need.

    Attributes:
        name (str): Name of the player.
    '''
    def __init__(self, name):
        '''Remember the player's name.'''
        self.name = name

    def chooseAction(self, openPositions, currentBoard, dice, env):
        '''
        Pick the turn to play. The base implementation picks nothing.

        NOTE(review): startGame calls chooseAction(openPositions, board, env, roll),
        i.e. with env before the dice -- this stub's parameter order differs.
        '''
        return None

    def addState(self, state):
        '''Record a visited state. The base implementation ignores it.'''
        return None

    def feedReward(self, reward):
        '''Receive the end-of-game reward. The base implementation ignores it.'''
        return None

    def reset(self):
        '''Prepare for a new game. The base implementation has nothing to clear.'''
        return None

## QLearning

Agents of this class use a QLearning table to keep track of rewards.  Two of these agents will play against each other during training.

In [22]:
class QAgentPlayer:
    '''
    Agent player that learns from a Q-learning table.

    Attributes:
        name (str) : Name of Player.
        turn (int) : Number that designates their turn.
        explRate (float) : Fraction of the time the agent takes a random action instead of the greedy one (default 30%).
        states (Python list) : Hashes of all positions recorded in the current game.
        lr (float) : Learning rate, used when feeding reward.
        decayGamma (float) : Discount factor, used when feeding reward.
        statesValue (Python dict) : Q-learning table. Maps a board hash to its learned value.
    '''
    def __init__(self, name, turn, explRate=.3):
        '''
        Constructor for QAgentPlayer class.

        Parameters:
            name (str) : Name of Player.
            turn (int) : Number that designates their turn.
            explRate (float) : Fraction of the time the agent takes a random action instead of the greedy one (default 30%).
        '''
        self.name = name
        self.turn = turn
        self.explRate = explRate
        self.states = []
        self.lr = .2
        self.decayGamma = .9
        self.statesValue = {}

    def getHash(self, board):
        '''
        Get a hash value that corresponds with the given board state.

        Parameters:
            board (np.ndarray) : Board to hash. It must hold exactly
                TOT_BOARD_PTS + 3 entries, otherwise reshape raises ValueError.

        Returns:
            str: String form of the flattened board, used as the key of statesValue.
        '''
        # Hash the board that was passed in; the agent keeps no board of its own.
        return str(np.asarray(board).reshape(TOT_BOARD_PTS + 3))

    def addState(self, state):
        '''
        Adds a state to the player's states attribute.

        Parameters:
            state (str) : Hash value of the state reached after taking an action.
        '''
        self.states.append(state)

    def chooseAction(self, openPositions, currentBoard, env, roll=None):
        '''
        Choose a turn to play using the epsilon-greedy method.

        Parameters:
            openPositions (Python list) : Candidate turns; each candidate is a sequence of moves.
            currentBoard (np.ndarray) : Game board.
            env : Environment; its updateBoard(move, board) is used to simulate each candidate.
            roll : Dice roll for this turn. Accepted for interface parity; not used here.

        Returns:
            The chosen element of openPositions, or [] when openPositions is empty.
        '''
        if not openPositions:
            # No moves can be made
            return []
        if np.random.uniform(0,1) <= self.explRate:
            # Explore: take a random action
            index = np.random.choice(len(openPositions))
            return openPositions[index]

        # Exploit: simulate every candidate turn and keep the one whose resulting
        # board has the highest learned value (unseen boards count as 0).
        # Starting from -inf guarantees some candidate is always selected,
        # even when every known value is extremely negative.
        action = openPositions[0]
        maxValue = float('-inf')
        for p in openPositions:
            nextBoard = currentBoard.copy()
            for move in p:
                nextBoard = env.updateBoard(move,nextBoard)
            value = self.statesValue.get(self.getHash(nextBoard))
            if value is None:
                value = 0
            if value > maxValue:
                maxValue = value
                action = p
        return action

    def feedReward(self, reward):
        '''
        At the end of the game, backpropagate and update state values.
        Walking the recorded states from last to first, each state's value is
        moved towards decayGamma times the value of the state after it (the
        final reward for the last state), scaled by the learning rate.

        Parameters:
            reward (float) : The reward determined by the environment.
        '''
        for state in reversed(self.states):
            if self.statesValue.get(state) is None:
                self.statesValue[state] = 0
            self.statesValue[state] += self.lr * (self.decayGamma * reward - self.statesValue[state])
            # The freshly updated value becomes the target for the previous state.
            reward = self.statesValue[state]

    def reset(self):
        '''
        Forget the states recorded during the previous game.
        '''
        self.states = []

    def savePolicy(self):
        '''
        After training, an agent has its policy stored in self.statesValue.
        This function pickles that attribute to
        <currentdir>/policies/c4_policy_<name> so it can be loaded later.
        '''
        policyDir = os.path.join(currentdir, 'policies')
        # Create the folder on first use so a finished training run is not
        # lost to a FileNotFoundError.
        os.makedirs(policyDir, exist_ok=True)
        with open(os.path.join(policyDir, 'c4_policy_' + str(self.name)), 'wb') as fw:
            pickle.dump(self.statesValue, fw)

    # Loading the policy when playing a human
    def loadPolicy(self, file):
        '''
        Reload a previously saved self.statesValue.

        Parameters:
            file (str) : Path of the file that has the policy.
        '''
        # SECURITY: pickle.load can execute arbitrary code; only load policy
        # files that you produced yourself with savePolicy.
        with open(file, 'rb') as fr:
            self.statesValue = pickle.load(fr)

## Human Player

Adds functionality for human player using raw input on the cmdline.

In [23]:
class HumanPlayer(Player):
    '''
    Allows the user to input moves through the cmdline.
    '''
    def chooseAction(self, positions, currentBoard, env, roll=None):
        '''
        Ask the user which of the available turns to play.

        The candidates are printed as a numbered list and the user is
        prompted until a valid number is entered.

        Parameters:
            positions (Python list) : Candidate turns; each candidate is a sequence of moves.
            currentBoard (np.ndarray) : Game board (not used here).
            env : Game environment (not used here).
            roll : Dice roll for this turn; shown to the user when given.

        Returns:
            The chosen element of positions, or [] when positions is empty.
        '''
        if not positions:
            # No moves can be made; same convention as QAgentPlayer.chooseAction.
            return []
        if roll is not None:
            print(f"Roll: {roll}")
        for index, turn in enumerate(positions):
            print(f"{index}: {turn}")
        while True:
            try:
                choice = int(input("Choose a turn by number: "))
            except ValueError:
                # Not a number -- ask again.
                continue
            if 0 <= choice < len(positions):
                return positions[choice]

## Driver Code

The code below shows how a game is started, using two player objects and an instance of the Backgammon environment.

In [20]:
def startGame(pwhite, pblack, env):
    '''
    Plays one game of Backgammon to completion, then resets env and both players.

    Parameters:
        pwhite (Player) : Player who uses white checkers.
        pblack (Player) : Player who uses black checkers.
        env (BackgammonEnv) : Environment for the game.
    '''
    # When there's a human player, print out stuff
    human = pwhite.name == "human" or pblack.name == "human"
    if human:
        env.render()

    # Determine who goes first
    roll = roll_dice()
    while roll[0] == roll[1]:
        # Reroll if they're equal
        roll = roll_dice()

    if roll[0] > roll[1]:
        if human:
            print('White goes first!')
        env.playerTurn = WHITE
        first, second = pwhite, pblack
    else:
        if human:
            print('Black goes first!')
        env.playerTurn = BLACK
        first, second = pblack, pwhite

    # The opening roll is also the roll the first player moves with.
    while True:
        # Player who goes first
        reward, gameOver = _playTurn(first, roll, env, human)
        if gameOver:
            if human and reward[0] == 1:
                print(f"{first.name} wins!")
            break
        roll = roll_dice()

        # Player who goes second
        reward, gameOver = _playTurn(second, roll, env, human)
        if gameOver:
            if human:
                print(f"{second.name} wins!")
            break
        roll = roll_dice()

    # NOTE(review): reward[0] goes to whoever moved first, which is not
    # necessarily white -- confirm this matches how the env orders the pair.
    first.feedReward(reward[0])
    second.feedReward(reward[1])

    env.reset()
    first.reset()
    second.reset()


def _playTurn(player, roll, env, human):
    '''
    Plays one complete turn for `player`.

    Parameters:
        player (Player) : Player whose turn it is.
        roll (tuple) : Dice roll for this turn.
        env (BackgammonEnv) : Environment for the game.
        human (bool) : Whether to render/print for a human player.

    Returns:
        tuple: (reward, gameOver) from the last env.step call, or (None, False)
        when no move could be made.
    '''
    reward, gameOver = None, False
    # availablePositions returns a list of tuples, where each element in the tuple
    # is an action, and the tuple represents an entire turn.
    # Example: [((4,6),(3,7)),((4,8),(2,4))]
    #   In this example the dice roll came up 2 and 4. The first choice is to move a
    #   checker from spot 4 on the board to spot 6, and then move a checker from 3 to 7.
    #   The second choice is to move a checker from 4 to 8 and then 2 to 4.
    openPositions = env.availablePositions(roll)
    # chooseAction returns a tuple with all the actions for the player's turn
    all_moves_in_turn = player.chooseAction(openPositions, env.board, env, roll)
    for move in all_moves_in_turn:
        # Invoke each action of the turn on the board
        reward, gameOver, actionHash = env.step(move)
        if gameOver:
            # The state that ends the game is not recorded (same as before).
            break
        player.addState(actionHash)
    if human:
        if all_moves_in_turn:
            env.render()
        else:
            print('No moves can be made')
    return reward, gameOver

In [25]:
# Train agents: two Q-learning agents play each other, then each saves its table.
NUM_TRAINING_GAMES = 1  # number of self-play games to run
p1 = QAgentPlayer("p1", WHITE)
p2 = QAgentPlayer("p2", BLACK)
env = gym.make('backgammonenv-v0')
print("training...")
for _ in range(NUM_TRAINING_GAMES):
    startGame(p1, p2, env)
# Save Results
p1.savePolicy()
print("saved p1 policy")
p2.savePolicy()
print("saved p2 policy")

training...
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]]
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]]
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.  0.  0.]]
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [