In [1]:
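# Okay, it's not really "AI": this notebook runs counterfactual regret minimization (CFR) on Kuhn poker.
# Reference: Neller & Lanctot, "An Introduction to Counterfactual Regret Minimization" (2013),
# http://modelai.gettysburg.edu/2013/cfr/cfr.pdf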

In [2]:
import copy
import random
import numpy as np
# Kuhn poker: a 3-card deck (1, 2, 3); each player is dealt one card.
# Dealing without replacement gives 3 * 2 = 6 ordered deals,
# each stored as (player 1 card, player 2 card).
HANDS = [(first, second) for first in (1, 2, 3) for second in (1, 2, 3) if first != second]

# There are 12 information sets. Each is named by the betting history so far
# followed by the acting player's own card; the opponent's card is hidden,
# so every set groups the two deals listed in braces.
#   round 1 (player 1 opens):               "1" {2, 3}   "2" {1, 3}   "3" {1, 2}
#   round 2 (player 2, after p1 pass):      "P1" {2, 3}  "P2" {1, 3}  "P3" {1, 2}
#   round 2 (player 2, after p1 bet):       "B1" {2, 3}  "B2" {1, 3}  "B3" {1, 2}
#   round 3 (player 1, after pass then bet): "PB1" {2, 3} "PB2" {1, 3} "PB3" {1, 2}
ISETS = (
    [str(card) for card in (1, 2, 3)]                                  # round 1
    + [action + str(card) for action in "PB" for card in (1, 2, 3)]    # round 2
    + ["PB" + str(card) for card in (1, 2, 3)]                         # round 3
)

# Betting histories at which the hand is over.
TERMINAL = "PP PBP PBB BP BB".split()
# Available moves at every decision point: "P" = pass, "B" = bet.
ACTIONS = list("PB")
N_ACTIONS = len(ACTIONS)


def payout(player_1_hand, player_2_hand, history):
    """Return player 1's payoff for a finished hand.

    Args:
        player_1_hand: card held by player 1 (higher card wins a showdown).
        player_2_hand: card held by player 2.
        history: terminal betting history, one of "PP", "PBP", "PBB", "BP", "BB"
            ("P" = pass, "B" = bet).

    Returns:
        +1/-1 when a player folds or both pass, +2/-2 when a bet is called;
        positive values mean player 1 wins.

    Raises:
        ValueError: if ``history`` is not one of the terminal histories above.
            (An explicit exception is used instead of ``assert`` so the check
            still fires when Python runs with ``-O``.)
    """
    # Folds end the hand without a showdown, so the cards do not matter.
    if history == "PBP":   # player 1 passes, player 2 bets, player 1 folds
        return -1
    if history == "BP":    # player 1 bets, player 2 folds
        return 1
    # Showdown: sign is decided by the higher card.
    m = 1 if player_1_hand > player_2_hand else -1
    if history == "PP":
        return m
    if history in ("BB", "PBB"):
        # A called bet doubles the stake.
        return m * 2
    raise ValueError("payout() requires a terminal history, got %r" % (history,))

    
def get_information_set(player_1_hand, player_2_hand, history):
    """Return the information-set label for the player about to act.

    The label is the betting history so far followed by the acting player's
    own card, e.g. "2", "B3" or "PB1".

    Args:
        player_1_hand: card held by player 1.
        player_2_hand: card held by player 2.
        history: non-terminal betting history; one of "", "P", "B", "PB".

    Returns:
        The information-set label as a string.

    Raises:
        ValueError: if ``history`` is terminal or otherwise not a decision
            point. An explicit exception is raised (rather than ``assert``)
            so the check is not stripped under ``python -O``.
    """
    if history == "":
        # Player 1 opens and sees only their own card.
        return str(player_1_hand)
    if history in ("P", "B"):
        # Player 2 responds knowing player 1's opening action.
        return history + str(player_2_hand)
    if history == "PB":
        # Player 1 acts again after passing and facing a bet.
        return "PB" + str(player_1_hand)
    raise ValueError("no decision is pending after history %r" % (history,))

    
def cfr(sigma, regret, strategy, player_1_hand, player_2_hand, history, player_i, pi1, pi2):
    """Run one recursive CFR pass from ``history`` and return the node's value.

    Args:
        sigma: dict mapping information set -> current strategy (array of
            action probabilities); updated in place for ``player_i``'s sets.
        regret: dict mapping information set -> cumulative regret per action;
            updated in place.
        strategy: dict mapping information set -> reach-weighted sum of past
            strategies; updated in place.
        player_1_hand: card held by player 1.
        player_2_hand: card held by player 2.
        history: betting history so far ("P"/"B" characters).
        player_i: the player (1 or 2) whose tables are updated on this pass;
            returned values are from this player's point of view.
        pi1: product of player 1's action probabilities along ``history``.
        pi2: product of player 2's action probabilities along ``history``.

    Returns:
        The sigma-weighted value of this node for ``player_i``.
    """
    if history in TERMINAL:
        # payout() is expressed for player 1; flip the sign for player 2.
        sign = 1 if player_i == 1 else -1
        return payout(player_1_hand, player_2_hand, history) * sign

    info_set = get_information_set(player_1_hand, player_2_hand, history)
    # Player 2 acts only after exactly one action; player 1 acts otherwise.
    acting_player = 2 if len(history) == 1 else 1

    # Both actions are always available at a decision point.
    action_values = np.zeros(N_ACTIONS)
    node_value = 0.0
    for idx, action in enumerate(ACTIONS):
        # Only the acting player's reach probability absorbs this action's weight.
        if acting_player == 1:
            next_pi1 = sigma[info_set][idx] * pi1
            next_pi2 = pi2
        else:
            next_pi1 = pi1
            next_pi2 = sigma[info_set][idx] * pi2
        action_values[idx] = cfr(
            sigma=sigma,
            regret=regret,
            strategy=strategy,
            player_1_hand=player_1_hand,
            player_2_hand=player_2_hand,
            history=history + action,
            player_i=player_i,
            pi1=next_pi1,
            pi2=next_pi2)
        node_value += sigma[info_set][idx] * action_values[idx]

    # Tables are only touched at nodes owned by the player being updated.
    if acting_player != player_i:
        return node_value

    if player_i == 1:
        own_reach, opponent_reach = pi1, pi2
    else:
        own_reach, opponent_reach = pi2, pi1

    regret[info_set] += opponent_reach * (action_values - node_value)
    strategy[info_set] += own_reach * sigma[info_set]

    # Regret matching: play in proportion to positive cumulative regret,
    # or uniformly when no action has positive regret.
    positive_regret = np.maximum(regret[info_set], 0)
    total = np.sum(positive_regret)
    if total > 0:
        sigma[info_set] = positive_regret / total
    else:
        sigma[info_set] = np.full(N_ACTIONS, 0.5)
    return node_value


def train(n_iterations: int = 20000, seed=None):
    """Train with chance-sampled CFR and print the average strategy.

    Each iteration runs one cfr() pass per player, each on a freshly sampled
    deal. Afterwards the accumulated strategy of every information set is
    normalised and printed as pass/bet probabilities. Compare the output with
    https://en.wikipedia.org/wiki/Kuhn_poker#Optimal_strategy

    Args:
        n_iterations: number of training iterations.
        seed: optional seed for the deal sampling. When given, a private
            ``random.Random(seed)`` is used so the run is reproducible; when
            ``None`` (default) the global ``random`` module is used as before.

    Returns:
        None. Results are printed.
    """
    # Private generator only when a seed is requested; otherwise keep drawing
    # from the module-level generator so existing callers see no change.
    rng = random if seed is None else random.Random(seed)

    # init tables
    regret = {info_set: np.zeros(N_ACTIONS) for info_set in ISETS}
    strategy = {info_set: np.zeros(N_ACTIONS) for info_set in ISETS}
    # every information set starts with the uniform strategy
    sigma = {info_set: np.full(N_ACTIONS, 0.5) for info_set in ISETS}

    # learn strategy
    for _ in range(n_iterations):
        for player_i in [1, 2]:
            player_1_hand, player_2_hand = rng.choice(HANDS)
            cfr(sigma=sigma,
                regret=regret,
                strategy=strategy,
                player_1_hand=player_1_hand,
                player_2_hand=player_2_hand,
                history="",
                player_i=player_i,
                pi1=1,
                pi2=1)

    # print "average" strategy
    for k, v in strategy.items():
        norm = v[0] + v[1]
        if norm > 0:
            p_pass, p_bet = v[0] / norm, v[1] / norm
        else:
            # Nothing was accumulated for this information set (e.g. very few
            # iterations): report the uniform strategy instead of 0/0 = nan.
            p_pass = p_bet = 1.0 / N_ACTIONS
        print("%3s: P:%.4f B:%.4f" % (k, p_pass, p_bet))
    
    
# Run training with the default iteration count; train() prints the
# normalised average strategy for every information set.
train()

  1: P:0.7987 B:0.2013
  2: P:0.9995 B:0.0005
  3: P:0.2930 B:0.7070
 P1: P:0.6620 B:0.3380
 P2: P:0.9983 B:0.0017
 P3: P:0.0005 B:0.9995
 B1: P:0.9999 B:0.0001
 B2: P:0.6262 B:0.3738
 B3: P:0.0001 B:0.9999
PB1: P:1.0000 B:0.0000
PB2: P:0.4296 B:0.5704
PB3: P:0.0001 B:0.9999
