In [1]:
import numpy as np
import pandas as pd
import random
import itertools

In [5]:
class Action:
    """Common base class for the moves that can be applied to a player."""

    def __init__(self) -> None:
        """Create an action; the base class keeps no state of its own."""
        return None

class Hit(Action):
    """Action that makes the acting player draw one more card."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def apply(s: "Env", p: "Player"):
        """Apply the action: player ``p`` takes a card.

        Declared static because ``Env.step`` calls ``apply(env, player)`` on
        whatever the policy returns, and the policy initializers return the
        class itself rather than an instance. As a static method it works
        identically on the class and on instances.

        The annotations are strings so that they are not evaluated when the
        class is defined (``Env`` and ``Player`` are declared further down).

        Args:
            s: The environment the action is applied in (unused here).
            p: The player that draws a card via ``p.hit()``.
        """
        p.hit()

class Stick(Action):
    """Action that ends the acting player's turn: no more cards are drawn."""

    def __init__(self):
        # The original omitted ``self`` and passed ``Hit`` to ``super``, so
        # instantiating ``Stick`` could never succeed.
        super().__init__()

    @staticmethod
    def apply(s: "Env", p: "Player"):
        """Apply the action: mark player ``p`` as no longer able to hit.

        Declared static because ``Env.step`` calls ``apply(env, player)`` on
        whatever the policy returns, and the policy initializers return the
        class itself rather than an instance.

        The annotations are strings so that they are not evaluated when the
        class is defined (``Env`` and ``Player`` are declared further down).

        Args:
            s: The environment the action is applied in (unused here).
            p: The player whose ``can_hit`` flag is cleared; ``Env.step``
                reads that flag to decide who acts next.
        """
        p.can_hit = False

In [35]:
def croupier_policy_initialization(state):
    """Return the fixed-threshold action for ``state``.

    Only the first component of ``state`` is inspected (``Policy`` labels that
    column ``player_sum``): below 17 the result is ``Hit``, otherwise ``Stick``.
    The action classes themselves are returned, not instances.
    """
    own_total = state[0]
    return Hit if own_total < 17 else Stick


def random_policy_initialization(state):
    """Pick ``Hit`` or ``Stick`` uniformly at random; ``state`` is ignored.

    The action class itself is returned, not an instance.
    """
    candidates = (Hit, Stick)
    return random.choice(candidates)


class Policy:
    """Tabular policy: exactly one action per state, stored in a DataFrame.

    Attributes:
        columns: Column labels of the table; the first four name the state
            components and the last one holds the action.
        p: DataFrame with one row per state of the state space.
    """

    def __init__(self, state_space, policy_initialization):
        """Build the table by enumerating every state of ``state_space``.

        Args:
            state_space: Sequence of four iterables, one per state component,
                in the order given by ``columns``; their Cartesian product is
                the set of states.
            policy_initialization: Callable mapping a state tuple to the
                action initially assigned to it.
        """
        self.columns = ['player_sum', 'player_ace', 'croupier_sum', 'croupier_ace', 'Action']
        # Enumerate the state space that was passed in. The previous version
        # ignored the argument and read a global ``s`` instead.
        state_action_pairs = [
            [*state, policy_initialization(state)]
            for state in itertools.product(*state_space)
        ]
        self.p = pd.DataFrame(state_action_pairs, columns=self.columns)

    def get_action(self, state):
        """Return the action stored for ``state``.

        Args:
            state: Indexable with four components in ``columns`` order. A
                component may be ``None`` (an unobservable value).

        Returns:
            The content of the ``Action`` column for the matching row.

        Raises:
            AssertionError: If the state does not match exactly one row.
            IndexError: If ``state`` has fewer than four components.
        """
        matches = pd.Series(True, index=self.p.index)
        for position, column in enumerate(self.columns[:-1]):
            wanted = state[position]
            values = self.p[column]
            # ``values == None`` is False for every row in pandas, so hidden
            # components have to be matched with ``isna`` instead.
            matches &= values.isna() if wanted is None else (values == wanted)
        current_state_policy = self.p[matches]
        assert current_state_policy.shape[0] == 1, current_state_policy
        return current_state_policy["Action"].iloc[0]


class Env():
    """Card-game environment with two seats: a player and a croupier.

    The constructor only creates the two seats. It deals no cards and installs
    no policies; call ``create_policies`` before ``step``.
    """

    def __init__(self):
        # The player shows none of its cards to the croupier; the croupier
        # shows one card to the player.
        self.player = Player(self, num_observable_cards=0)
        self.croupier = Player(self, num_observable_cards=1)
        self.players = [self.player, self.croupier]

    def create_policies(self):
        """Give the player a random policy and the croupier its fixed rule.

        Each policy is built over the state space as observed from the seat
        that will use it.
        """
        player_observed_state_space = self.get_state_space(self.player)
        random_policy = Policy(player_observed_state_space, random_policy_initialization)
        self.player.set_policy(random_policy)

        # The croupier's table must be keyed by what the croupier observes,
        # not by the player's view.
        croupier_observed_state_space = self.get_state_space(self.croupier)
        croupier_policy = Policy(croupier_observed_state_space, croupier_policy_initialization)
        self.croupier.set_policy(croupier_policy)

    def get_state(self, observing_player):
        """Return the state as seen from ``observing_player``'s seat.

        Args:
            observing_player: One of the seats in ``self.players``.

        Returns:
            A new list: the observer's own state followed by what every other
            seat exposes to the observer.
        """
        # Copy so that extending the result never mutates the seat's own data.
        observed_state = list(observing_player.get_player_state())
        for p in self.players:
            if p is not observing_player:
                observed_state.extend(p.get_player_state(other_player=observing_player))
        return observed_state

    def get_state_space(self, observing_player):
        """Return the per-component state space seen by ``observing_player``.

        Args:
            observing_player: One of the seats in ``self.players``.

        Returns:
            A new list with one iterable of possible values per state
            component, ordered like the result of ``get_state``.
        """
        state_space = list(observing_player.get_state_space())
        for p in self.players:
            if p is not observing_player:
                state_space.extend(p.get_state_space(other_player=observing_player))
        return state_space

    def step(self):
        """Let the next seat act once.

        The player acts while it may still hit and its sum is below the
        limit; after that the croupier acts while it may still hit.

        Returns:
            True if a seat acted, False if nobody can act any more.
        """
        if self.player.can_hit and self.player.cards.sum < Cards.limit:
            current_player = self.player
        elif self.croupier.can_hit:
            current_player = self.croupier
        else:
            current_player = None

        if current_player is None:
            return False
        action = current_player.get_action(self.get_state(current_player))
        action.apply(self, current_player)
        return True

    def get_reward(self):
        """Return the outcome from the player's point of view.

        Returns:
            1 if the player wins, -1 if the croupier wins, 0 on a draw
            (equal sums, or both sums above the limit).
        """
        player_sum = self.player.cards.sum
        croupier_sum = self.croupier.cards.sum
        player_bust = player_sum > Cards.limit
        croupier_bust = croupier_sum > Cards.limit

        if player_bust:
            return 0 if croupier_bust else -1
        if croupier_bust:
            return 1
        if croupier_sum == player_sum:
            return 0
        return -1 if croupier_sum > player_sum else 1
        
class Player():
    """One seat at the table: a hand of cards plus the policy that plays it.

    Attributes:
        cards: The seat's ``Cards`` hand.
        policy: Policy used by ``get_action``; ``None`` until ``set_policy``.
        can_hit: Whether the seat may still draw cards.
        num_observable_cards: How many of this seat's cards other seats are
            allowed to observe.
    """

    def __init__(self, s: "Env", num_observable_cards):
        """Create a seat with an empty hand and no policy.

        Args:
            s: The owning environment. It is accepted for interface
                compatibility and not stored. The annotation is a string so
                that it is not evaluated at definition time.
            num_observable_cards: Number of this seat's cards that other
                seats can observe.
        """
        self.cards = Cards()
        self.policy = None
        self.can_hit = True
        self.num_observable_cards = num_observable_cards

    def set_policy(self, p: "Policy"):
        """Install the policy that ``get_action`` consults."""
        self.policy = p

    def get_player_state(self, other_player=None):
        """Return this seat's card state.

        Args:
            other_player: The observing seat, or ``None`` when the seat looks
                at its own hand.

        Returns:
            The full card state when ``other_player`` is ``None``; otherwise
            the state restricted to ``num_observable_cards`` cards.
        """
        if other_player is None:
            return self.cards.get_state()
        return self.cards.get_state(self.num_observable_cards)

    def get_state(self, other_player=None):
        """Alias of ``get_player_state``.

        ``Env.get_state`` has called the method by this name, so both
        spellings are supported.
        """
        return self.get_player_state(other_player)

    def get_action(self, state):
        """Return the action the installed policy stores for ``state``."""
        return self.policy.get_action(state)

    def hit(self):
        """Draw one more card into the hand."""
        self.cards.add_card()

    def get_state_space(self, other_player=None):
        """Return the state space of this seat's hand.

        Args:
            other_player: The observing seat, or ``None`` for the seat's own
                view.

        Returns:
            The hand's state space for an unlimited number of cards when
            ``other_player`` is ``None``; otherwise the state space for
            ``num_observable_cards`` cards.
        """
        if other_player is None:
            return self.cards.get_state_space(np.inf)
        return self.cards.get_state_space(self.num_observable_cards)
        

class Cards:
    """A hand of cards with a running sum and usable-ace tracking.

    Attributes:
        cards: Names of the cards drawn so far, in order.
        states: Current ``[sum, usable_ace]`` pair (empty before any draw).
        history: One ``[sum, usable_ace]`` snapshot per drawn card, so the
            state after the first ``k`` cards can be reported to observers.
        sum: Current hand value; any value above ``limit`` is stored as
            ``limit + 1``.
        usable_ace: True while an Ace in the hand is counted with its high
            value.
    """

    card_values = {"Ace": [1, 11], "2": [2], "3": [3], "4": [4], "5": [5],
                   "6": [6], "7": [7], "8": [8], "9": [9], "10": [10],
                   "J": [10], "Q": [10], "K": [10]}
    possible_cards = list(card_values.keys())
    limit = 21

    def __init__(self):
        self.cards = []
        self.states = []
        self.history = []
        self.sum = 0
        self.usable_ace = False

    def add_card(self):
        """Draw a uniformly random card and update sum, ace flag and history."""
        new_card = random.choice(self.possible_cards)
        self.cards.append(new_card)
        ace_low, ace_high = self.card_values["Ace"]

        if new_card == "Ace":
            # Count the Ace high only if that does not exceed the limit.
            if self.sum + ace_high <= self.limit:
                self.sum += ace_high
                self.usable_ace = True
            else:
                self.sum += ace_low
        else:
            self.sum += self.card_values[new_card][0]

        # Over the limit with a high Ace in hand: fall back to its low value.
        if self.sum > self.limit and self.usable_ace:
            self.usable_ace = False
            self.sum += ace_low - ace_high

        # Every bust is collapsed into the single value ``limit + 1``.
        if self.sum > self.limit:
            self.sum = self.limit + 1

        self.states = [self.sum, int(self.usable_ace)]
        self.history.append(list(self.states))

    def get_state(self, num_observable_cards=-1):
        """Return the hand state visible through ``num_observable_cards`` cards.

        Args:
            num_observable_cards: ``0`` hides the hand completely; a negative
                value (the default) exposes the whole hand; a positive ``k``
                exposes the state reached after the first ``k`` cards, or the
                current state if fewer than ``k`` cards have been drawn.

        Returns:
            ``(None, None)`` for a hidden hand, otherwise a new list
            ``[sum, usable_ace]`` with ``usable_ace`` as 0 or 1.
        """
        if num_observable_cards == 0:
            return (None, None)
        if num_observable_cards < 0 or num_observable_cards >= len(self.history):
            return [self.sum, int(self.usable_ace)]
        # Copy so callers cannot alter the recorded snapshot.
        return list(self.history[num_observable_cards - 1])

    def get_state_space(self, num_observable_cards):
        """Return the possible values of each state component.

        Args:
            num_observable_cards: Number of observable cards; ``0`` for a
                hidden hand, ``numpy.inf`` / ``float('inf')`` for no limit.

        Returns:
            ``[[None], [None]]`` for a hidden hand, otherwise
            ``[range_of_sums, [0, 1]]``. The sums start at 2 and end at the
            largest value reachable with that many cards, capped at
            ``limit + 1`` (the bust value).
        """
        if num_observable_cards == 0:
            return [[None], [None]]
        ace_high = self.card_values["Ace"][1]
        highest_plain = max(values[0] for card, values in self.card_values.items()
                            if card != "Ace")
        # Best case is one high Ace plus the highest plain card for every
        # further card. The old bound ``k * 11`` left out 11 for k == 1.
        reachable = ace_high + (num_observable_cards - 1) * highest_plain
        largest_sum = int(min(self.limit + 1, reachable))
        return [range(2, largest_sum + 1), [0, 1]]

In [36]:
# Scratch cell: build a fresh environment. The constructor only creates the
# two seats; no cards are dealt and no policies are installed yet.
e = Env()

In [39]:
# State space of the player's hand as observed by the croupier. The player is
# created with num_observable_cards=0, hence the all-None result recorded below.
e.player.get_state_space(e.croupier)

[[None], [None]]

In [223]:
# NOTE(review): Policy.__init__ as defined above takes
# (state_space, policy_initialization), but this call passes three arguments,
# and `s` is only assigned in a later cell. Presumably left over from an
# earlier version of the notebook; confirm before re-running.
p = Policy(s, [Hit], random_policy_initialization)

In [311]:
# Play one episode: print the state, step until nobody can act, then show the
# reward.
# NOTE(review): `State` is not defined in the visible source, and get_state()
# is called without the observer argument that Env.get_state requires.
# Presumably this cell ran against an earlier version of the environment
# class; confirm before re-running.
s = State()
print(s.get_state())
while s.step():
    print(s.get_state())
s.get_reward()

(20, 0, 8, 0)
(20, 0, 8, 0)
(20, 0, 12, 0)
(20, 0, 14, 0)
(20, 0, 22, 0)
(20, 0, 22, 0)


1

In [286]:
# Inspect the final state and reward of the episode held in `s`.
# NOTE(review): the execution counter (286) is lower than that of the cell
# that assigns `s` above (311), so the recorded output presumably belongs to a
# different episode; confirm.
print(s.get_state())
s.get_reward()

(21, 1, 17, 0)


1