In [12]:
import gym, pickle, numpy as np
import tttenv

BOARD_ROWS,BOARD_COLS = 3,3
TOTAL_BOARD_SPACES = BOARD_ROWS*BOARD_COLS
COORD_TO_INDEX = lambda x : (x[0] * BOARD_ROWS) + x[1]

In [24]:
import sys,os

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
currentdir = os.path.abspath(os.getcwd())

/Users/Ethan/Desktop/senior_project/senior-project-MARL/agent_code


In [14]:
# Super class for players (both human and agent)
class Player:
    def __init__(self, name):
        self.name = name
        
    def chooseAction(self, positions, currentBoard=None):
        pass
        
    def addState(self, state):
        pass
    
    def feedReward(self, reward):
        pass
    
    def reset():
        pass

In [27]:
# Class that defines agent players
# Two of these players will be learning and playing with each other
class AgentPlayer:
    def __init__(self, name, turn, explRate=.3):
        self.name = name
        self.turn = turn
        # Implement epsilon-greedy method of selecting actions
        # Default .3 value means 30% of time agent takes random action, 70% of time agent takes greedy action
        self.explRate = explRate
        # Record all positions taken
        self.states = []
        # Learning rate 
        self.lr = .2
        self.decayGamma = .9
        # State -> Value
        self.statesValue = {}
        
    # Get a unique hash value that corresponds with the given board state
    def getHash(self, board):
        return str(board.reshape(BOARD_ROWS * BOARD_COLS))
    
    # Using this abstraction because HumanPlayer class will have this as well
    def addState(self, state):
        self.states.append(state)
        
    def chooseAction(self, openPositions, currentBoard):
        if np.random.uniform(0,1) <= self.explRate:
            # Take random action
            index = np.random.choice(len(openPositions))
            action = openPositions[index]
        else:
            maxValue = -999
            for p in openPositions:
                nextBoard = currentBoard.copy()
                nextBoard[p] = self.turn
                nextBoardHash = self.getHash(nextBoard)
                value = 0 if self.statesValue.get(nextBoardHash) is None else self.statesValue.get(nextBoardHash)
                if value > maxValue:
                    maxValue = value
                    action = p
        return action
    
    # At the end of the game, backpropogate and update state values
    # The updated value of state t equals the current value of state t
    #   adding the difference between the value of next state and the value of current state,
    #   which is multiplied by a learning rate α (Given the reward of intermediate state is 0)
    def feedReward(self, reward):
        for state in (reversed(self.states)):
            if self.statesValue.get(state) is None:
                self.statesValue[state] = 0
            self.statesValue[state] += self.lr * (self.decayGamma * reward - self.statesValue[state])
            reward = self.statesValue[state]
    
    # For when there's a new round
    def reset(self):
        self.states = []

    # After training, an agent has its policy stored in self.stateValues
    # This can be saved to play against a human player
    def savePolicy(self):
        fw = open(currentdir + '/policies/ttt_policy_' + str(self.name), 'wb')
        pickle.dump(self.statesValue, fw)
        fw.close()

    # Loading the policy when playing a human
    def loadPolicy(self, file):
        fr = open(file, 'rb')
        self.statesValue = pickle.load(fr)
        fr.close()

In [16]:
# Find vacant positions after a turn is made
def availablePositions(board):
    positions = []
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            if board[i,j] == 0:
                # Coordinates need to be in tuple form
                positions.append((i,j))
    return positions

def startGame(p1, p2, env):
    gameOver = False
    human = p1.name == "human" or p2.name == "human"
    observation = np.zeros((BOARD_ROWS, BOARD_COLS))
    while not gameOver:
        # Player 1
        openPositions = availablePositions(observation)
        observation,reward,gameOver,actionHash = env.step(p1.chooseAction(openPositions, observation))
        p1.addState(actionHash)
        # When there's a human player, print out stuff
        if human:
            env.render()
        if gameOver:
            if human and reward[0] == 1:
                print(f"{self.p1.name} wins!")
            else:
                print("tie!")
            p1.feedReward(reward[0])
            p2.feedReward(reward[1])
            break
        # Player 2
        openPositions = availablePositions(observation)
        observation,reward,gameOver,actionHash = env.step(p2.chooseAction(openPositions, observation))
        p2.addState(actionHash)
        if human:
            env.render()
        if gameOver:
            if human:
                print(f"{self.p2.name} wins!")
            p1.feedReward(reward[0])
            p2.feedReward(reward[1])
            break
    env.reset()
    p1.reset()
    p2.reset()
    

In [17]:
# Class for human player
# Mostly inherited from super class Player
class HumanPlayer(Player):    
    def chooseAction(self, positions, currentBoard=None):
        while True:
            try:
                i = int(input("Input action row-> "))
                j = int(input("Input action column-> "))
            except ValueError:
                continue
            if (i, j) in positions:
                return (i, j)

In [18]:
# Train agents
p1 = AgentPlayer("p1", 0)
p2 = AgentPlayer("p2", 1)
env = gym.make('tttenv-v0')
print("training...")
for _ in range(100):
    startGame(p1, p2, env)
# Save Results
p1.savePolicy()
print("saved p1 policy")
p2.savePolicy()
print("saved p2 policy")

training...
saved p1 policy
saved p2 policy


In [28]:
# Human play with trained p1
# Make sure Agent isn't training anymore
p1 = AgentPlayer("computer", 0, explRate=0)
p1.loadPolicy(currentdir + "/policies/ttt_policy_p1")
p2 = HumanPlayer("human")
startGame(p1, p2, env)

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
Input action row-> 2
Input action column-> 3
Input action row-> 2
Input action column-> 0
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
Input action row-> 0
Input action column-> 0
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
Input action row-> 0
Input action column-> 1
-------------
| x | x |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
-------------
| x | x |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
Input action row-> 1
Input action column-> 0
-------------
| x | x |   | 
-------------
| x |   |   | 
----------

NameError: name 'self' is not defined

In [26]:
# Human play with trained p2
# Make sure Agent isn't training anymore
p1 = HumanPlayer("human")
p2 = AgentPlayer("computer", 1, explRate=0)
p2.loadPolicy(currentdir + "/policies/ttt_policy_p2")
startGame(p1, p2, env)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Ethan/Desktop/senior_project/senior-project-MARL/agent_code/policies/ttt_policy_2'