In [2]:
import gym, pickle, numpy as np,random,copy,matplotlib.pyplot as plt
import backgammonenv

TOT_BOARD_PTS = 24
WHITE,BLACK = 0,1
ALL_CHECKERS = 15
WHITE_HOME,BLACK_HOME,BAR_IND = 25,26,0
roll_dice = lambda: (np.random.randint(1,7), np.random.randint(1,7))

ModuleNotFoundError: No module named 'backgammonenv'

In [None]:
import sys,os

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
currentdir = os.path.abspath(os.getcwd())

## Player Class

Default super class that lays out defualt functions for both human and agent players.

In [4]:
class Player:
    '''
    Default player class.

    Attributes:
        name (str): Name of the player.
    '''
    def __init__(self, name):
        self.name = name
        
    def chooseAction(self, openPositions, currentBoard, dice, env):
        pass
        
    def addState(self, state):
        pass
    
    def feedReward(self, reward):
        pass
    
    def reset(self):
        pass

## QLearning

Agents of this class use a QLearning table to keep track of rewards.  Two of these agents will play against each other during training.

In [24]:
class QAgentPlayer:
    '''
    Agent player that learns from QLearning table.

    Attributes:
        name (str) : Name of Player.
        turn (int) : Number that designates their turn.
        explRate (float) : Percentage of time agent takes a random action vs greedy action when playing (default 30%).
        states (Python list) : All positions taken in current game
        lr (float) : Learning Rate, used when feeding reward.
        decayGamma (float) : Used when feeding reward.
        statesValue (Python dict) : QLearning table. Stoes total reward for each possible game state.
    '''
    def __init__(self, name, turn, explRate=.3):
        '''
        Constructor for QAgentPlayer class.

        Parameters:
            name (str) : Name of Player.
            turn (int) : Number that designates their turn.
            explRate (float) : Percentage of time agent takes a random action vs greedy action when playing (default 30%).
        '''
        self.name = name
        self.turn = turn
        self.explRate = explRate
        self.states = []
        self.lr = .2
        self.decayGamma = .9
        self.statesValue = {}
        
    def getHash(self, board):
        '''
        Get a unique hash value that corresponds with the given board state.
        
        Parameters:
            board (np.ndarray) : Current board (from environment).
        
        Returns:
            str: Hash value of board.
        '''
        return str(board.flatten)    

    def addState(self, state):
        '''
        Adds a state to the Players state attribute
        
        Parameters:
            state (str) : Put state's hash value into self.states after choosing action.
        '''
        self.states.append(state)
        
    def chooseAction(self, openPositions, currentBoard, env, roll):
        '''
        Choose an action to take using epsilon-greedy method.
        
        Parameters:
            openPositions (Python list) : List of available places to make a move on board.
            currentBoard (np.ndarray) : Game board.
        
        Returns:
            int: Position of board that agent wants to make a move.
        '''
        if not openPositions:
            # No moves can be made
            return []
        
        # Possible random action
        index = np.random.choice(len(openPositions))
        action = openPositions[index]
        secondAction = action
        maxValue = -999
                    
        if np.random.uniform(0,1) > self.explRate:
            secondMax = -999
            for p in openPositions:
                nextBoard = currentBoard.copy()
                for move in p:
                    nextBoard = env.updateBoard(move,nextBoard)
                nextBoardHash = self.getHash(nextBoard)
                value = 0 if self.statesValue.get(nextBoardHash) is None else self.statesValue.get(nextBoardHash)
                if value > maxValue:
                    secondMax = maxValue
                    secondAction = action
                    maxValue = value
                    action = p
        if maxValue <= 0:
            # If agent can't find a best move, pick a random one anyways
            index = np.random.choice(len(openPositions))
            action = openPositions[index]
            secondAction = action
        # 20% of the time actually choose the second best action, for variance in decision making
        return action if np.random.uniform(0,1) > 0.2 else secondAction
    
    def feedReward(self, reward):
        '''
        At the end of the game, backpropogate and update state values.
        The updated value of state t equals the current value of state t
            adding the difference between the value of next state and the value of current state,
            which is multiplied by a learning rate α (Given the reward of intermediate state is 0).
        
        Parameters:
            reward (float) : The reward determined by the environment.
        '''
        for state in (reversed(self.states)):
            if self.statesValue.get(state) is None:
                self.statesValue[state] = 0
            self.statesValue[state] += self.lr * (self.decayGamma * reward - self.statesValue[state])
            reward = self.statesValue[state]
    
    def reset(self):
        '''
        Reset board when agent starts a new game.
        '''
        self.states = []

    def savePolicy(self):
        '''
        After training, an agent has its policy stored in self.stateValues.
        This function saves that attribute in a file to play later.
        '''
        with open(currentdir + '/policies/c4_policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.statesValue, fw)

    # Loading the policy when playing a human
    def loadPolicy(self, file):
        '''
        Reload previous self.stateValues.
        
        Parameters:
            file (str) : Name of file that has policy.
        '''
        with open(file, 'rb') as fr:
            self.statesValue = pickle.load(fr)

## Human Player

Adds functionality for human player using raw input on the cmdline.

In [9]:
class HumanPlayer(Player):
    '''
    Allows user to input moves through the cmdline.
    
    Parameters:
        openPositions (Python list) : List of available places to make a move on board.
        currentBoard (np.ndarray) : Game board.
    
    Returns:
        int: Position of board that user wants to make a move.
    '''
    def chooseAction(self, positions, currentBoard, env, roll):
        message = f"You rolled a {roll[0]} and {roll[1]}, select where you want to move checkers to and from\n"
        while True:
            try:
                i = input(message+'First move (0 for bar, 25 for white home, 26 for black home: ')
                i = tuple([int(x) for x in i.split(',')])
                j = input(message+'Second move (0 for bar, 25 for white home, 26 for black home: ')
                j = tuple([int(x) for x in j.split(',')])
            except ValueError:
                continue
            if len(i) != 2 or len(j) != 2:
                continue
            elif (i,j) in positions:
                return (i,j)

## Driver Code

The code below is how a game is started, Using 2 player objects and an instance of the tic tac toe environment.

In [19]:
def startGame(pwhite, pblack, env):
    '''
    Initiates a game of Backgammon.

    Parameters:
        white (Player) : Player who uses white checkers.
        black (Player) : Player who uses black checkers.
        env (BackgammonEnv) : Environment for the game.        
    ''' 
    gameOver = False
    human = pwhite.name == "human" or pblack.name == "human"
    # When there's a human player, print out stuff
    if human:
        env.render()

    # Determine who goes first
    roll = roll_dice()
    while roll[0] == roll[1]:
        # Reroll if they're equal
        roll = roll_dice()

    if roll[0] > roll[1]:
        if human:
            print('White goes first!')
        env.playerTurn = WHITE
        p1 = pwhite
        p2 = pblack
    else:
        if human:
            print('Black goes first!')
        env.playerTurn = BLACK
        p1 = pblack
        p2 = pwhite

    while not gameOver:
        # Player who goes first
        openPositions = env.availablePositions(roll) # openPositions returns a list of tuples, where each element in the tuple is an action, and the tuple represents an entire turn
        # Example: [((4,6),(3,7)),((4,8),(2,4))]
        #   In this example the dice roll came up 2 and 4. The first choice is to move a checker from spot 4 on the board to spot 6, and then move a checker from 3 to 7
        #   The second choice is to move a checker from 4 to 8 and then 2 to 4
        all_moves_in_turn = p1.chooseAction(openPositions, env.board, env, roll) # This will return a tuple with all the actions for the player's turn
        for move in all_moves_in_turn:
            # The for loop then loops through the tuple and invokes each action on the board
            reward,gameOver,actionHash = env.step(move)
            if gameOver:
                break
            p1.addState(actionHash)
        if human:
            if all_moves_in_turn:
                env.render()
            else:
                print('No moves can be made')
        if gameOver:
            if human and reward[0] == 1:
                print(f"{p1.name} wins!")
            p1.feedReward(reward[0])
            p2.feedReward(reward[1])
            break
        roll = roll_dice()

        # Player who goes second
        openPositions = env.availablePositions(roll)
        all_moves_in_turn = p2.chooseAction(openPositions, env.board, env, roll)
        for move in all_moves_in_turn:
            reward,gameOver,actionHash = env.step(move)
            if gameOver:
                break
            p2.addState(actionHash)
        if human:
            if all_moves_in_turn:
                env.render()
            else:
                print('No moves can be made')
        if gameOver:
            if human:
                print(f"{p2.name} wins!")
            p1.feedReward(reward[0])
            p2.feedReward(reward[1])
            break
        roll = roll_dice()

    env.reset()
    p1.reset()
    p2.reset()
    return 1 if reward[0] == 1 else (2 if reward[1] == 1 else 0)

In [25]:
env = gym.make('backgammonenv-v0') #make env
//startGame(HumanPlayer('human'),QAgentPlayer('p1',1),env)

13|14|15|16|17|18|   |19|20|21|22|23|24
--|--|--|--|--|--|---|--|--|--|--|--|--
B |  |  |  |W |  |   |W |  |  |  |  |B 
B |  |  |  |W |  |   |W |  |  |  |  |B 
B |  |  |  |W |  |   |W |  |  |  |  |  
B |  |  |  |  |  |   |W |  |  |  |  |  
B |  |  |  |  |  |   |W |  |  |  |  |  
--|--|--|--|--|--|   |--|--|--|--|--|--
W |  |  |  |  |  |   |B |  |  |  |  |  
W |  |  |  |  |  |   |B |  |  |  |  |  
W |  |  |  |B |  |   |B |  |  |  |  |  
W |  |  |  |B |  |   |B |  |  |  |  |W 
W |  |  |  |B |  |   |B |  |  |  |  |W 
--|--|--|--|--|--|---|--|--|--|--|--|--
12|11|10| 9| 8| 7|   | 6| 5| 4| 3| 2| 1
Black goes first!
13|14|15|16|17|18|   |19|20|21|22|23|24
--|--|--|--|--|--|---|--|--|--|--|--|--
B |  |  |  |W |  |   |W |  |  |  |  |B 
B |  |  |  |W |  |   |W |  |  |  |  |B 
B |  |  |  |W |  |   |W |  |  |  |  |  
B |  |  |  |  |  |   |W |  |  |  |  |  
  |  |  |  |  |  |   |W |  |  |  |  |  
  |  |  |  |  |  |   |  |  |  |  |  |  
--|--|--|--|--|--|   |--|--|--|--|--|--
  |  |  |  |  |  |   |

KeyboardInterrupt: Interrupted by user

In [None]:
def train(agents,numGames=2000,explRate=.3):
    for agent in agents:
        agent.lr = .2
        agent.explRate = explRate
    for i in range(numGames):
        random.shuffle(agents)
        agents[0].turn,agents[1].turn = 1,-1
        startGame(agents[0],agents[1],env)
        agents[2].turn,agents[3].turn = 1,-1
        startGame(agents[2],agents[3],env)
        agents[4].turn,agents[5].turn = 1,-1
        startGame(agents[4],agents[5],env)

def test(agents,numGames=2000):
    for agent in agents:
        agent.lr = 0
        agent.explRate = .05

    graphStats = {name:[0,0,0] for name in ['p1','p2','p3','p4','p5','p6']}

    for i in range(numGames):
        random.shuffle(agents)
        agents[0].turn,agents[1].turn = 1,-1
        result = startGame(agents[0],agents[1],env)
        if result == 0:
            graphStats[agents[0].name][2] += 1
            graphStats[agents[1].name][2] += 1
        elif result == 1:
            graphStats[agents[0].name][0] += 1
            graphStats[agents[1].name][1] += 1
        elif result == 2:
            graphStats[agents[0].name][1] += 1
            graphStats[agents[1].name][0] += 1
        agents[2].turn,agents[3].turn = 1,-1
        result = startGame(agents[2],agents[3],env)
        if result == 0:
            graphStats[agents[2].name][2] += 1
            graphStats[agents[3].name][2] += 1
        elif result == 1:
            graphStats[agents[2].name][0] += 1
            graphStats[agents[3].name][1] += 1
        elif result == 2:
            graphStats[agents[2].name][1] += 1
            graphStats[agents[3].name][0] += 1
        agents[4].turn,agents[5].turn = 1,-1
        result = startGame(agents[4],agents[5],env)
        if result == 0:
            graphStats[agents[4].name][2] += 1
            graphStats[agents[5].name][2] += 1
        elif result == 1:
            graphStats[agents[4].name][0] += 1
            graphStats[agents[5].name][1] += 1
        elif result == 2:
            graphStats[agents[4].name][1] += 1
            graphStats[agents[5].name][0] += 1
    return graphStats

def graphResults(graphStats):
    fig,axes = plt.subplots(2,3, figsize=(15,15))
    xname = ['wins','losses','ties']
    count = 0

    for i in range(2):
        for j in range(3):
            agent = agents[count]
            ax = axes[i,j]
            ax.bar(xname,graphStats[agent.name])
            ax.set_xticklabels(xname)
            ax.set_title(f'{agent.name}\'s Results')
            count += 1
    plt.show()

def createNewGeneration(agents,graphStats):
    maxScore = 0
    for agent in agents:
        agentScore = graphStats[agent.name][0] + (graphStats[agent.name][2] * .5)
        if agentScore > maxScore:
            # Agent with best score (accounting for wins and ties) will be the basis for the next generation
            survivor = agent
            maxScore = agentScore

    np1 = copy.deepcopy(survivor)
    np1.name = 'p1'
    np2 = copy.deepcopy(survivor)
    np2.name = 'p2'
    np3 = copy.deepcopy(survivor)
    np3.name = 'p3'
    np4 = copy.deepcopy(survivor)
    np4.name = 'p4'
    np5 = copy.deepcopy(survivor)
    np5.name = 'p5'
    np6 = copy.deepcopy(survivor)
    np6.name ='p6'
    return [np1,np2,np3,np4,np5,np6]

def savePolicy(agents):
    for agent in agents:
        agent.savePolicy()
        print('saved an agent:',currentdir + '/policies/ttt_policy_' + str(agent.name))

In [None]:
p1 = QAgentPlayer("p1", 1)
p2 = QAgentPlayer("p2", -1)
p3 = QAgentPlayer("p3", 1)
p4 = QAgentPlayer("p4", -1)
p5 = QAgentPlayer("p5", 1)
p6 = QAgentPlayer("p6", -1)
agents = [p1,p2,p3,p4,p5,p6]

epochs = 10
explore = .8
for i in range(epochs):
    train(agents,numGames=2000,explRate=explore)
    explore -= .05
    graphStats = test(agents)
    agents = createNewGeneration(agents,graphStats)