In [69]:
from random import random
import numpy as np
from numpy.random import normal, choice
import pickle

class TaskController:
    def __init__(self, env, learning_algorithm='SARSA', exploration=True, exploration_decay='SIMULATED_ANNEALING', 
                 exploration_strategy='E_GREEDY', exploration_epsilon=0.8, 
                 learning_rate=0.01, learning_rate_decay='EXPONENTIAL', gamma=0.9, online_learning=False,
                 *args, **kwargs):
        self.env = env
        policy = Policy(self.env, states=self.env.all_states(), actions=self.env.all_actions(),
                        state_action_validity_checker=self.env.is_state_action_pair_valid, 
                        algorithm=learning_algorithm, exploration=exploration, exploration_decay=exploration_decay, 
                        exploration_strategy=exploration_strategy, exploration_epsilon=exploration_epsilon, 
                        learning_rate=learning_rate, learning_rate_decay=learning_rate_decay, gamma=gamma, 
                        *args, **kwargs)
        self.agent = Agent(policy, initial_state=self.env.get_initial_state())
            
        self.online_learning = online_learning
        
    def run_episode(self, learning_phase=True):
        
        action = self.agent.choose_action(self.agent.current_state, always_greedy=not learning_phase)
        
        self.agent.next_state, reward = self.env.get_next_state_reward(self.agent.current_state, action)
        if learning_phase:
            if self.online_learning:
                next_action = None
                if self.agent.policy.algorithm_type == 'SARSA':
                    next_action = self.agent.choose_action(self.agent.next_state, always_greedy=True)
                elif self.agent.policy.algorithm_type == 'QLEARNING':
                    next_action = None
                else:
                    # EXPECTED_SARSA all actions are averaged
                    next_action = None
                self.agent.feed_reward(reward, self.agent.current_state, self.agent.next_state, 
                                              action, next_action)
            else:
                self.agent.policy.history.append({'state': self.agent.current_state, 'action': action, 'reward': reward})
                #print(self.agent.policy.history)
        else:
            self.agent.policy.history.append({'state': self.agent.current_state, 'action': action, 'reward': reward})
                
        if self.env.is_terminal(self.agent.next_state):
            self.env.episode_terminated = True
            if not self.online_learning and learning_phase:
                self.agent.feed_reward(reward, self.agent.current_state, self.agent.next_state,
                                                  None, None, online=False)
                self.agent.reset_history()
            
        current_state = self.agent.current_state
        self.agent.current_state = self.agent.next_state
        
        return current_state, action, self.agent.next_state, reward
    
    def reset(self):
        self.env.episode_terminated = False
        self.agent.current_state=self.env.get_initial_state()
        
        
import numpy as np
from numpy.random import random, choice

class Policy:
    def __init__(self, env, states={}, actions={}, state_action_validity_checker=None, 
                 hash_states=False, hash_actions=False,
                 algorithm='QLEARNING', exploration=True, exploration_decay='SIMULATED_ANNEALING', 
                 exploration_strategy='E_GREEDY', exploration_epsilon=0.8, 
                 learning_rate=0.01, learning_rate_decay='EXPONENTIAL', gamma=0.9, *args, **kwargs):
        # Set the update rule
        self.env = env
        self.algorithm_type = algorithm
        
        if algorithm=='QLEARNING':
            self.algorithm = QLearning(learning_rate=learning_rate,
                                       gamma=gamma, *args, **kwargs)
        elif algorithm=='SARSA':
            self.algorithm = Sarsa(learning_rate=learning_rate, 
                                   gamma=gamma,*args, **kwargs)
        elif algorithm=='EXPECTED_SARSA':
            self.algorithm = ExpectedSarsa(learning_rate=learning_rate, 
                                           gamma=gamma,*args, **kwargs)
        elif algorithm=='EVERY_VISIT_MC':
            self.algorithm = EveryVisitMC(*args, **kwargs)
        elif algorithm=='FIRST_VISIT_MC':
            self.algorithm = FirstVisitMC(*args, **kwargs)
        else:
            self.algorithm = QLearning(learning_rate=learning_rate,
                                       gamma=gamma, *args, **kwargs)
           
        # Set exploration-exploitation strategy
        self.exploration = exploration
        if self.exploration:
            self.exploration_decay = exploration_decay
            self.exploration_strategy = exploration_strategy
            self.exploration_epsilon = exploration_epsilon
            
        # Set LR and gamma
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        
        self.gamma = gamma
        
        # Initialize policy
        self.states = [hash(state) for state in states] if hash_states else states
        
        self.hash_states = hash_states
        self.hash_actions = hash_actions
        
        if 'load_policy' not in kwargs:
            self.policy = dict()
            actions = list(actions)
            for state in self.states:
                temp = dict()
                for action in actions:
                    if state_action_validity_checker(state, action):
                        temp[action] = 0.1
                if len(temp.keys())>0:
                    self.policy[state] = temp
        else:
            self.policy = self.load_policy(kwargs['load_policy'])
        
        self.history = []
    
    def feed_reward(self, reward, current_state, next_state, current_action, next_action, online=True):
        if online:
            self.algorithm.feed_reward(self, reward=reward, 
                                       current_state=current_state, 
                                       next_state=next_state, 
                                       current_action=current_action, 
                                       next_action=next_action)
        else:
            # Offline episodic updates
            # History is an array of dicts [{'state': state, 'action': action},{...},...]
            last_element = True
            
            for elm in reversed(self.history):
                reward = elm['reward']
                if last_element:
                    last_element = False
                    next_state = elm['state']
                    next_action = elm['action']
                current_state = elm['state']
                current_action = elm['action']
                self.algorithm.feed_reward(self, reward=reward, 
                                           current_state=current_state, 
                                           next_state=next_state, 
                                           current_action=current_action, 
                                           next_action=next_action)
                next_state = current_state
                next_action = current_action
                
            if self.algorithm_type in ['EVERY_VISIT_MC', 'FIRST_VISIT_MC']:
                for state, temp in self.algorithm.visit_count.items():
                    for action, value in temp.items():
                        self.policy[state][action] = (self.policy[state][action]*(self.algorithm.num_iter) 
                                                      + np.mean(self.algorithm.visit_count[state][action]))/float(self.algorithm.num_iter+1)
                self.algorithm.reset_episode()
            
        return reward
    
    def choose_action(self, state, always_greedy=False):
        action = None
        if always_greedy:
            return max(self.policy[state], key=self.policy[state].get)
        if self.exploration:
            # Learning Phase
            if random() > self.exploration_epsilon:
                # Exploitation
                action = max(self.policy[state], key=self.policy[state].get)
            else:
                # Exploration
                if self.exploration_strategy == 'E_GREEDY':
                    min_v = min(self.policy[state].values())
                    non_neg_v = [v - min_v + 0.1 for v in self.policy[state].values()]
                    total = sum(non_neg_v)
                    ordered_actions = [(k,float(v - min_v + 0.1)/total) for k,v in self.policy[state].items()]
                    a = [a[0] for a in ordered_actions]
                    p = [a[1] for a in ordered_actions]
                    action = choice(a, p=p)
                else:
                    action = choice(list(self.policy[state].keys()))
                
                # Decay of exploration rate
                if self.exploration_decay=='SIMULATED_ANNEALING':
                    self.exploration_epsilon = self.exploration_epsilon*0.99
                elif self.exploration_decay=='CONSTANT':
                    self.exploration_epsilon = self.exploration_epsilon
        
        else:
            # Non Learning Phase
            action = max(self.policy[state], key=self.policy[state].get)
        
        return action
    
    def save_policy(self, filename='policy.pkl'):
        with open(filename, 'wb') as output:  # Overwrites any existing file.
            pickle.dump(self.policy, output, pickle.HIGHEST_PROTOCOL)
            
    def load_policy(self, filename='policy.pkl'):
        with open(filename, 'rb') as input:
            policy = pickle.load(input)
        return policy
    
    
class Sarsa:
    # Use >300 episodes
    def __init__(self, learning_rate=0.01, gamma=0.9, eligibility_trace=False, *args, **kwargs):
        self.LR = learning_rate
        self.GAMMA = gamma
        self.num_iter = 0
    
    def feed_reward(self, policy, reward=None, current_state=None, next_state=None, current_action=None, next_action=None):
        # Python is pass by name and mutability of object decides whether it acts as pass by value or reference
        # Dicts are mutable and therefore this modifies the self.policy on Policy instance
        policy.policy[current_state][current_action] = policy.policy[current_state][current_action] + self.LR*(
            reward + self.GAMMA*policy.policy[next_state][next_action] - policy.policy[current_state][current_action])
        
        # Learning rate decay (logarithmic)
        self.num_iter += 1
        self.LR = min(1.0/np.log(self.num_iter), self.LR)
        return self.num_iter
    
class QLearning:
    # Use >300 episodes
    def __init__(self, learning_rate=0.01, gamma=0.9, eligibility_trace=False, *args, **kwargs):
        self.LR = learning_rate
        self.GAMMA = gamma
        self.num_iter = 0
    
    def feed_reward(self, policy, reward=None, current_state=None, next_state=None, current_action=None, next_action=None):
        # Python is pass by name and mutability of object decides whether it acts as pass by value or reference
        # Dicts are mutable and therefore this modifies the self.policy on Policy instance
        policy.policy[current_state][current_action] = policy.policy[current_state][current_action] + self.LR*(
            reward + self.GAMMA*max(policy.policy[next_state].values()) - policy.policy[current_state][current_action])
        
        # Learning rate decay (logarithmic)
        self.num_iter += 1
        self.LR = min(1.0/np.log(self.num_iter), self.LR)
        return self.num_iter
    
class ExpectedSarsa:
    # Use >1000 episodes
    def __init__(self, learning_rate=0.01, gamma=0.9, eligibility_trace=False, *args, **kwargs):
        self.LR = learning_rate
        self.GAMMA = gamma
        self.num_iter = 0
    
    def feed_reward(self, policy, reward=None, current_state=None, next_state=None, current_action=None, next_action=None):
        # Python is pass by name and mutability of object decides whether it acts as pass by value or reference
        # Dicts are mutable and therefore this modifies the self.policy on Policy instance
        policy.policy[current_state][current_action] = policy.policy[current_state][current_action] + self.LR*(
            reward + self.GAMMA*np.mean(list(policy.policy[next_state].values())) - policy.policy[current_state][current_action])
        
        # Learning rate decay (logarithmic)
        self.num_iter += 1
        self.LR = min(1.0/np.log(self.num_iter), self.LR)
        return self.num_iter
    
    
class EveryVisitMC:
    # Use >1000 episodes
    def __init__(self, gamma=0.9, *args, **kwargs):
        self.num_iter = 1
        self.visit_count = dict()
        self.episode_return = 0
        self.GAMMA = gamma
    
    def feed_reward(self, policy, reward=None, current_state=None, next_state=None, current_action=None, next_action=None):
        # Python is pass by name and mutability of object decides whether it acts as pass by value or reference
        # Dicts are mutable and therefore this modifies the self.policy on Policy instance
        self.episode_return += self.GAMMA*reward

        if current_state in self.visit_count:
            if current_action in self.visit_count[current_state]:
                self.visit_count[current_state][current_action].append(self.episode_return)
            else:
                self.visit_count[current_state][current_action] = [self.episode_return]
        else:
            self.visit_count[current_state] = dict()
            self.visit_count[current_state][current_action] = [self.episode_return]
            
        return self.num_iter
    
    def reset_episode(self):
        self.visit_count = dict()
        self.episode_return = 0
        self.num_iter += 1

    
class FirstVisitMC:
    # Use >1000 episodes
    def __init__(self, gamma=0.9, *args, **kwargs):
        self.num_iter = 1
        self.visit_count = dict()
        self.episode_return = 0
        self.GAMMA = gamma
    
    def feed_reward(self, policy, reward=None, current_state=None, next_state=None, current_action=None, next_action=None):
        # Python is pass by name and mutability of object decides whether it acts as pass by value or reference
        # Dicts are mutable and therefore this modifies the self.policy on Policy instance
        self.episode_return += self.GAMMA*reward

        if current_state in self.visit_count:
            self.visit_count[current_state][current_action] = [self.episode_return]
        else:
            self.visit_count[current_state] = dict()
            self.visit_count[current_state][current_action] = [self.episode_return]
            
        return self.num_iter
    
    def reset_episode(self):
        self.visit_count = dict()
        self.episode_return = 0
        self.num_iter += 1
        

class Agent():
    def __init__(self, policy, initial_state=None):
        self.current_state = initial_state
        self.next_state = None
        # Pass the generator object for both states and actions
        self.policy = policy
    
    def reset_history(self):
        self.policy.history = []
        
    def choose_action(self, *args, **kwargs):
        return self.policy.choose_action(*args, **kwargs)
    
    def feed_reward(self, *args, **kwargs):
        return self.policy.feed_reward(*args, **kwargs)

In [79]:
from itertools import product
from operator import itemgetter
class TicTacToe:
    def __init__(self, sym='X'):
        self.sym = sym
        self.episode_terminated = False
        
    def all_states(self):
        return product("_XO",repeat=9)
    
    def all_actions(self):
        return range(9)
    
    def get_next_state_reward(self, current_state, action):
        state = list(current_state)
        state[action] = self.sym
        possible_actions = [i for i,sym in enumerate(state) if sym=='_']
        state[choice(possible_actions)] = 'O' if self.sym=='X' else 'X'
        return tuple(state), 1 if self.has_won(state, self.sym) else 0
    
    def is_state_action_pair_valid(self, state, action):
        x_minus_o = 0
        count_ = 0
        for i in state:
            if i=='X':
                x_minus_o+=1 
            elif i=='O':
                x_minus_o-=1
            else:
                count_ +=1
        if ((x_minus_o == 0 and self.sym=='X' and state[action]=='_') 
            or (x_minus_o == 1 and self.sym=='O' and state[action]=='_')):
            return True
        return False
    
    def is_terminal(self, state):
        count_ = 0
        for i in state:
            if i=='_':
                count_ = count_ + 1
        return True if (self.has_won(state, 'X') or self.has_won(state, 'O') or count_==1) else False
    
    def get_initial_state(self):
        return ("_","_","_","_","_","_","_","_","_")
    
    def __has_won(self, state, sym):
        if state[0]==state[1]==state[2]==self.sym:
            return True
        elif state[3]==state[4]==state[5]==self.sym:
            return True
        elif state[6]==state[7]==state[8]==self.sym:
            return True
        elif state[0]==state[4]==state[8]==self.sym:
            return True
        elif state[2]==state[4]==state[6]==self.sym:
            return True
        elif state[0]==state[3]==state[6]==self.sym:
            return True
        elif state[1]==state[4]==state[7]==self.sym:
            return True
        elif state[2]==state[5]==state[8]==self.sym:
            return True
        return False
    
    def has_won(self, state, sym):
        count_ = 0
        for i in state:
            if i=='_':
                count_ = count_ + 1
        if count_==1:
            last_state = [i if i != '_' else 'X' for i in state]
            return self.__has_won(state, sym) or self.has_won(last_state, sym)
        else:
            return self.__has_won(state, sym)

In [103]:
p1 = TicTacToe(sym='X')

tc = TaskController(p1, 
                    learning_algorithm='QLEARNING', # QLEARNING, SARSA, EXPECTED_SARSA, FIRST_VISIT_MC, EVERY_VISIT_MC
                    exploration=True, 
                    exploration_decay='CONSTANT', # CONSTANT, SIMULATED_ANNEALING
                    exploration_strategy='E_GREEDY', # E_GREEDY, SOFT_E_GREEDY
                    exploration_epsilon=0.99, 
                    learning_rate=0.1, 
                    learning_rate_decay='EXPONENTIAL', # CONSTANT, EXPONENTIAL
                    gamma=0.9,
                    online_learning=True,
                    load_policy='tttpolicy.pkl'
                   )


'''
OPTIONAL PARAMETERS - 
load_policy='FILE_NAME'  eg: load_policy='policy.pkl'
eligibility_trace = True (NOT IMPLEMENTED HERE)

YOU CAN SAVE A POLICY BY CALLING - 
tc.agent.policy.save_policy('FILE_NAME')  eg: tc.agent.policy.save_policy('policy.pkl')

CASE STUDY 1 - YOU WANT TO INCREMENTALLY ADD STATES, FOR EXAMPLE
FIRST TRAIN USING JUST RED AND GREEN SIGNALS, SAVE THE RESULTING POLICY
RECREATE tc WITH load_policy OPTION AND SAVED POLICY AND TRAIN
BUT NOW WITH ADDITIONAL STATE OF YELLOW SIGNAL

IN THIS CASE, THE INITIAL POLICY *SHOULD* ALSO HAVE RED, GREEN AND YELLOW SIGNALS, JUST THAT
YELLOW IS NEVER ENCOUNTERED (SWITCHING TIME IS 0). IN THE SECOND ROUND, YOU CAN INTRODUCE YELLOW SIGNAL. FOR EXAMPLE BY
MAKING SWITCHING TIME OF YELLOW SIGNAL NON 0.

CASE STUDY 2 - TRAIN USING SARSA FIRST, SAVE POLICY, CREATE NEW TC WITH SAVED POLICY AND QLEARNING, TRAIN FROM THERE. 
MAKE SURE TO CREATE NEW TC WITH MANUALLY DECAYED LEARNING/EXPLORATION RATE 

ELIGIBILITY TRACE IS IMPORTANT IN THIS TASK TO LEARN EFFICIENTLY. BUT IT IS NOT IMPLEMENTED IN THIS VERSION
IT CAN BE TURNED ON/OFF USING OPTIONAL eligibility_trace PARAMETER
'''

"\nOPTIONAL PARAMETERS - \nload_policy='FILE_NAME'  eg: load_policy='policy.pkl'\neligibility_trace = True (NOT IMPLEMENTED HERE)\n\nYOU CAN SAVE A POLICY BY CALLING - \ntc.agent.policy.save_policy('FILE_NAME')  eg: tc.agent.policy.save_policy('policy.pkl')\n\nCASE STUDY 1 - YOU WANT TO INCREMENTALLY ADD STATES, FOR EXAMPLE\nFIRST TRAIN USING JUST RED AND GREEN SIGNALS, SAVE THE RESULTING POLICY\nRECREATE tc WITH load_policy OPTION AND SAVED POLICY AND TRAIN\nBUT NOW WITH ADDITIONAL STATE OF YELLOW SIGNAL\n\nIN THIS CASE, THE INITIAL POLICY *SHOULD* ALSO HAVE RED, GREEN AND YELLOW SIGNALS, JUST THAT\nYELLOW IS NEVER ENCOUNTERED (SWITCHING TIME IS 0). IN THE SECOND ROUND, YOU CAN INTRODUCE YELLOW SIGNAL. FOR EXAMPLE BY\nMAKING SWITCHING TIME OF YELLOW SIGNAL NON 0.\n\nCASE STUDY 2 - TRAIN USING SARSA FIRST, SAVE POLICY, CREATE NEW TC WITH SAVED POLICY AND QLEARNING, TRAIN FROM THERE. \nMAKE SURE TO CREATE NEW TC WITH MANUALLY DECAYED LEARNING/EXPLORATION RATE \n\nELIGIBILITY TRACE IS IM

In [104]:
NUM_EP=0
MAX_EP=100000

while NUM_EP < MAX_EP:
    cum_reward=0
    while not (tc.env.episode_terminated):
        current_state, action, next_state, reward = tc.run_episode(learning_phase=True)
        cum_reward += reward
    if NUM_EP % (int(MAX_EP/1000) if MAX_EP>1000 else 1) == 0:
        print("{} earned after {} episodes P1".format(cum_reward,NUM_EP))
    NUM_EP += 1
    tc.reset()



0 earned after 0 episodes P1
1 earned after 100 episodes P1
1 earned after 200 episodes P1
1 earned after 300 episodes P1
0 earned after 400 episodes P1
1 earned after 500 episodes P1
1 earned after 600 episodes P1
1 earned after 700 episodes P1
1 earned after 800 episodes P1
1 earned after 900 episodes P1
0 earned after 1000 episodes P1
1 earned after 1100 episodes P1
1 earned after 1200 episodes P1
1 earned after 1300 episodes P1
1 earned after 1400 episodes P1
1 earned after 1500 episodes P1
1 earned after 1600 episodes P1
1 earned after 1700 episodes P1
1 earned after 1800 episodes P1
0 earned after 1900 episodes P1
1 earned after 2000 episodes P1
1 earned after 2100 episodes P1
1 earned after 2200 episodes P1
1 earned after 2300 episodes P1
1 earned after 2400 episodes P1
0 earned after 2500 episodes P1
1 earned after 2600 episodes P1
1 earned after 2700 episodes P1
1 earned after 2800 episodes P1
1 earned after 2900 episodes P1
1 earned after 3000 episodes P1
0 earned after 3100 

0 earned after 25200 episodes P1
1 earned after 25300 episodes P1
1 earned after 25400 episodes P1
1 earned after 25500 episodes P1
1 earned after 25600 episodes P1
1 earned after 25700 episodes P1
1 earned after 25800 episodes P1
1 earned after 25900 episodes P1
1 earned after 26000 episodes P1
1 earned after 26100 episodes P1
0 earned after 26200 episodes P1
1 earned after 26300 episodes P1
1 earned after 26400 episodes P1
1 earned after 26500 episodes P1
1 earned after 26600 episodes P1
1 earned after 26700 episodes P1
1 earned after 26800 episodes P1
1 earned after 26900 episodes P1
1 earned after 27000 episodes P1
1 earned after 27100 episodes P1
1 earned after 27200 episodes P1
1 earned after 27300 episodes P1
1 earned after 27400 episodes P1
1 earned after 27500 episodes P1
1 earned after 27600 episodes P1
1 earned after 27700 episodes P1
1 earned after 27800 episodes P1
1 earned after 27900 episodes P1
1 earned after 28000 episodes P1
1 earned after 28100 episodes P1
1 earned a

1 earned after 50500 episodes P1
0 earned after 50600 episodes P1
0 earned after 50700 episodes P1
1 earned after 50800 episodes P1
1 earned after 50900 episodes P1
1 earned after 51000 episodes P1
1 earned after 51100 episodes P1
1 earned after 51200 episodes P1
1 earned after 51300 episodes P1
1 earned after 51400 episodes P1
1 earned after 51500 episodes P1
1 earned after 51600 episodes P1
1 earned after 51700 episodes P1
1 earned after 51800 episodes P1
1 earned after 51900 episodes P1
1 earned after 52000 episodes P1
1 earned after 52100 episodes P1
0 earned after 52200 episodes P1
0 earned after 52300 episodes P1
1 earned after 52400 episodes P1
0 earned after 52500 episodes P1
1 earned after 52600 episodes P1
1 earned after 52700 episodes P1
1 earned after 52800 episodes P1
1 earned after 52900 episodes P1
1 earned after 53000 episodes P1
1 earned after 53100 episodes P1
1 earned after 53200 episodes P1
1 earned after 53300 episodes P1
1 earned after 53400 episodes P1
1 earned a

0 earned after 75600 episodes P1
0 earned after 75700 episodes P1
1 earned after 75800 episodes P1
1 earned after 75900 episodes P1
1 earned after 76000 episodes P1
1 earned after 76100 episodes P1
1 earned after 76200 episodes P1
1 earned after 76300 episodes P1
0 earned after 76400 episodes P1
1 earned after 76500 episodes P1
1 earned after 76600 episodes P1
1 earned after 76700 episodes P1
1 earned after 76800 episodes P1
1 earned after 76900 episodes P1
1 earned after 77000 episodes P1
1 earned after 77100 episodes P1
1 earned after 77200 episodes P1
1 earned after 77300 episodes P1
1 earned after 77400 episodes P1
0 earned after 77500 episodes P1
1 earned after 77600 episodes P1
1 earned after 77700 episodes P1
1 earned after 77800 episodes P1
1 earned after 77900 episodes P1
1 earned after 78000 episodes P1
1 earned after 78100 episodes P1
1 earned after 78200 episodes P1
1 earned after 78300 episodes P1
1 earned after 78400 episodes P1
1 earned after 78500 episodes P1
1 earned a

In [116]:
tc.agent.policy.save_policy('tttpolicy_silver.pkl')

In [115]:
tc.reset()
cum_reward = 0
while not tc.env.episode_terminated:
    current_state, action, next_state, reward = tc.run_episode(learning_phase=False)
    print(np.reshape(next_state, (3,3)))
    cum_reward += reward

[['O' '_' '_']
 ['_' '_' '_']
 ['X' '_' '_']]
[['O' 'O' '_']
 ['_' '_' '_']
 ['X' 'X' '_']]
[['O' 'O' 'O']
 ['_' '_' '_']
 ['X' 'X' 'X']]


[('_', '_'),
 ('_', 'X'),
 ('_', 'O'),
 ('X', '_'),
 ('X', 'X'),
 ('X', 'O'),
 ('O', '_'),
 ('O', 'X'),
 ('O', 'O')]