In [1]:
import gym, pickle, numpy as np
import connect4env

# Board geometry: 6 rows by 7 columns.
BOARD_ROWS, BOARD_COLS = 6, 7
TOTAL_BOARD_SPACES = BOARD_ROWS * BOARD_COLS


def COORD_TO_INDEX(x):
    '''
    Flatten a (row, col) board coordinate into a row-major index.

    Parameters:
        x (tuple) : (row, col) coordinate on the board.

    Returns:
        int: Index in the range 0 .. TOTAL_BOARD_SPACES - 1.
    '''
    # The row stride is the number of COLUMNS; striding by BOARD_ROWS would make
    # (0, 6) and (1, 0) collide and never reach the last index.
    return (x[0] * BOARD_COLS) + x[1]

In [2]:
import sys,os

# Only silence warnings when no explicit warning options were passed to the
# interpreter, so a user-supplied -W setting still wins.
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter("ignore")

# Absolute path of the working directory; savePolicy builds its output path from it.
currentdir = os.path.abspath(os.getcwd())

## Player Class

Default superclass that lays out the default (no-op) functions shared by both human and agent players.

In [3]:
class Player:
    '''
    Base player that spells out the interface used by the game loop.

    Every hook below is a no-op that returns None; subclasses override
    the ones they need.

    Attributes:
        name (str): Name of the player.
    '''
    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, curRows=None, currentBoard=None):
        '''Pick a move. The base implementation picks nothing and returns None.'''
        return None

    def addState(self, state):
        '''Record a visited state. The base implementation records nothing.'''
        return None

    def feedReward(self, reward):
        '''Receive the end-of-game reward. The base implementation ignores it.'''
        return None

    def reset(self):
        '''Prepare for a new game. The base implementation has nothing to clear.'''
        return None

## QLearning

Agents of this class use a Q-learning table to keep track of the learned value of each board state. Two of these agents play against each other during training.

In [4]:
class QAgentPlayer:
    '''
    Agent player that learns from a Q-learning table.

    Attributes:
        name (str) : Name of Player.
        turn (int) : Number that designates their turn (written onto the board when simulating a move).
        explRate (float) : Percentage of time agent takes a random action vs greedy action when playing (default 30%).
        states (Python list) : Hashes of all states recorded in the current game.
        lr (float) : Learning rate, used when feeding reward.
        decayGamma (float) : Discount factor, used when feeding reward.
        statesValue (Python dict) : Q-learning table. Maps a state hash to its learned value.
    '''
    def __init__(self, name, turn, explRate=.3):
        '''
        Constructor for QAgentPlayer class.

        Parameters:
            name (str) : Name of Player.
            turn (int) : Number that designates their turn.
            explRate (float) : Percentage of time agent takes a random action vs greedy action when playing (default 30%).
        '''
        self.name = name
        self.turn = turn
        self.explRate = explRate
        self.states = []
        self.lr = .2
        self.decayGamma = .9
        self.statesValue = {}

    def getHash(self, board):
        '''
        Get a hash value that corresponds with the given board state.

        Parameters:
            board (np.ndarray) : Current board (from environment).

        Returns:
            str: String form of the flattened board.
        '''
        # reshape(-1) flattens a board of any size, so the hash no longer
        # depends on module-level board constants; for the 6x7 board the
        # resulting string is identical to the fixed-size reshape.
        return str(board.reshape(-1))

    def addState(self, state):
        '''
        Adds a state to the Player's states attribute.

        Parameters:
            state (str) : State hash to append to self.states after choosing an action.
        '''
        self.states.append(state)

    def chooseAction(self, openPositions, curRows=None, currentBoard=None):
        '''
        Choose an action to take using epsilon-greedy method.

        Parameters:
            openPositions (Python list) : List of available places to make a move on board.
            curRows : Unused by the agent; accepted so all players share one call signature.
            currentBoard (np.ndarray) : Game board. Required for the greedy branch.

        Returns:
            The chosen entry of openPositions.

        Raises:
            ValueError: If openPositions is empty.
        '''
        if len(openPositions) == 0:
            # Fail the same way on both branches instead of an UnboundLocalError
            # on the greedy path.
            raise ValueError("chooseAction called with no open positions")

        if np.random.uniform(0,1) <= self.explRate:
            # Take random action
            index = np.random.choice(len(openPositions))
            return openPositions[index]

        # Greedy action: simulate each move and keep the first one with the
        # highest known value (unseen states count as 0).
        action = None
        maxValue = float('-inf')
        for p in openPositions:
            nextBoard = currentBoard.copy()
            nextBoard[p] = self.turn
            value = self.statesValue.get(self.getHash(nextBoard))
            if value is None:
                value = 0
            # `action is None` guarantees a move is always selected, however
            # low (or non-comparable, e.g. NaN) the stored values are.
            if action is None or value > maxValue:
                maxValue = value
                action = p
        return action

    def feedReward(self, reward):
        '''
        At the end of the game, backpropagate and update state values.
        Walking the recorded states from last to first, each state's value moves
            toward decayGamma times the value of the state that followed it
            (the final reward for the last state), scaled by the learning rate.

        Parameters:
            reward (float) : The reward determined by the environment.
        '''
        for state in reversed(self.states):
            current = self.statesValue.get(state)
            if current is None:
                current = 0
            current += self.lr * (self.decayGamma * reward - current)
            self.statesValue[state] = current
            # The updated value becomes the target for the preceding state.
            reward = current

    def reset(self):
        '''
        Clear the recorded states when agent starts a new game.
        '''
        self.states = []

    def savePolicy(self):
        '''
        After training, an agent has its policy stored in self.statesValue.
        This function saves that attribute in a file to play later.
        The file is written under the module-level `currentdir`, in a
        `policies` sub-directory that is created if it does not exist.

        Returns:
            string : Name of file where policy is saved
        '''
        filename = currentdir + '/policies/c4_policy_' + str(self.name)
        # Without this, a fresh checkout with no policies/ directory fails on open().
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as fw:
            pickle.dump(self.statesValue, fw)
        return filename

    # Loading the policy when playing a human
    def loadPolicy(self, file):
        '''
        Reload a previously saved self.statesValue.

        Parameters:
            file (str) : Name of file that has policy.
        '''
        # SECURITY: pickle.load can execute arbitrary code; only load policy
        # files that this notebook (or another trusted source) produced.
        with open(file, 'rb') as fr:
            self.statesValue = pickle.load(fr)

## Human Player

Adds functionality for human player using raw input on the cmdline.

In [5]:
class HumanPlayer(Player):
    '''
    Player whose moves are typed in by a user on the cmdline.
    '''
    def chooseAction(self, positions, curRows=None, currentBoard=None):
        '''
        Prompt until the user enters a column that has an available move.

        Parameters:
            positions (Python list) : List of available (row, col) places to make a move on board.
            curRows : Unused; the move is taken from positions. Accepted so all players share one call signature.
            currentBoard (np.ndarray) : Game board (unused).

        Returns:
            tuple: The (row, col) entry of positions for the chosen column.

        Raises:
            ValueError: If positions is empty (no input could ever be accepted).
        '''
        if len(positions) == 0:
            raise ValueError("chooseAction called with no open positions")

        # Built once: maps each playable column to its full (row, col) move.
        # Returning the entry from positions also means the default
        # curRows=None no longer crashes.
        moveByColumn = {pos[1]: pos for pos in positions}
        while True:
            try:
                column = int(input("Input action column-> "))
            except ValueError:
                print("Please enter a whole number.")
                continue
            if column in moveByColumn:
                return moveByColumn[column]
            print(f"Column {column} is not available; choose from {sorted(moveByColumn)}.")

## Getting Available Positions

How agents find possible moves

In [6]:
def availablePositions(curRows):
    '''
    List the vacant positions that can be played this turn.

    Parameters:
        curRows : Indexable by column (0 .. BOARD_COLS - 1); presumably the number
            of free cells left in each column - confirm against the environment.

    Returns:
        Python list: One (curRows[col] - 1, col) tuple for every column whose
            curRows entry is positive, in ascending column order.
    '''
    # Coordinates need to be in tuple form
    return [
        (curRows[col] - 1, col)
        for col in range(BOARD_COLS)
        if curRows[col] > 0
    ]

## Driver Code

The code below is how a game is started, using two player objects and an instance of the Connect 4 environment.

In [7]:
def startGame(p1, p2, env):
    '''
    Plays one full game of Connect 4, then resets the environment and both players.

    When either player is named "human", the board is rendered after every
    move and the result is printed at the end.

    Parameters:
        p1 (Player) : Player who takes the first move.
        p2 (Player) : Player who takes the second move.
        env : Connect 4 environment. Must provide curRows, board, step(), render() and reset().

    Returns:
        None
    '''
    human = p1.name == "human" or p2.name == "human"
    observation = env.curRows
    gameOver = False
    while not gameOver:
        # Both players follow the same turn sequence; rewardIndex is the
        # player's slot in the reward pair returned by env.step.
        for rewardIndex, player in enumerate((p1, p2)):
            openPositions = availablePositions(observation)
            action = player.chooseAction(openPositions, curRows=observation, currentBoard=env.board)
            observation, reward, gameOver, actionHash = env.step(action)
            player.addState(actionHash)
            # When there's a human player, print out stuff
            if human:
                env.render()
            if gameOver:
                if human:
                    # A reward of 1 in the mover's slot is treated as a win,
                    # mirroring the existing player-1 check; anything else is
                    # reported as a tie. NOTE(review): confirm the environment
                    # uses 1 as the win reward for the second slot too.
                    if reward[rewardIndex] == 1:
                        print(f"{player.name} wins!")
                    else:
                        print("tie!")
                p1.feedReward(reward[0])
                p2.feedReward(reward[1])
                break
    env.reset()
    p1.reset()
    p2.reset()

In [8]:
# Human-controlled player; created here so the play cells further down can reuse it.
human = HumanPlayer("human")
# Train agents
# p1 writes 1 and p2 writes -1 onto the board when simulating moves (their `turn` value).
p1 = QAgentPlayer("p1", 1)
p2 = QAgentPlayer("p2", -1)
env = gym.make('connect4env-v0')
print("training...")
# Self-play: each startGame call plays one complete game and feeds the final rewards to both agents.
# NOTE(review): only 3 games are played here - presumably a smoke-test value; real training needs far more.
for _ in range(3):
    startGame(p1, p2, env)
# Save Results
# savePolicy pickles the agent's statesValue table and returns the path it wrote to.
f1 = p1.savePolicy()
print("saved p1 policy")
f2 = p2.savePolicy()
print("saved p2 policy")

training...
saved p1 policy
saved p2 policy


## Play Against Agent

In [9]:
# Trained agent p1 takes the first move; the human plays second.
startGame(p1,human,env)

41
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   | x |   |   |   |   | 
----------------------------
(6, 0)
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
| o |   | x |   |   |   |   | 
----------------------------
human wins!


In [9]:
# Roles swapped: the human takes the first move and trained agent p2 plays second.
startGame(human,p2,env)

----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
| x |   |   |   |   |   |   | 
----------------------------
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
| o |   |   |   |   |   |   | 
----------------------------
| x |   |   |   |   |   |   | 
----------------------------
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |   |   |   |   | 
----------------------------
|   |   |   |