In [123]:
import numpy as np
import pandas as pd
import random
import itertools

In [249]:
class Action:
    """Common base class for the moves a player can make."""

    def __init__(self):
        """Nothing to set up: an action carries no per-instance state."""
        return None

class Hit(Action):
    """Action that makes the acting player draw one more card."""

    def __init__(self):
        super().__init__()

    # The game loop calls ``apply`` on the class itself (policies store the
    # class, not an instance), so it is declared static.  The annotations are
    # strings because State and Player are defined further down the file;
    # bare names would be evaluated at definition time and raise NameError.
    @staticmethod
    def apply(s: "State", p: "Player"):
        """Apply the action for player ``p`` in game ``s``.

        Args:
            s: the game the action is taken in (not used by this action).
            p: the player taking the action; its ``hit()`` method is called.
        """
        p.hit()

class Stick(Action):
    """Action that ends the acting player's turn: no further cards are drawn."""

    def __init__(self):
        # The original omitted ``self`` and called ``super(Hit, self)``, so
        # instantiating Stick could never succeed.
        super().__init__()

    # Called on the class itself by the game loop, hence static.  String
    # annotations avoid evaluating State / Player before they are defined.
    @staticmethod
    def apply(s: "State", p: "Player"):
        """Apply the action for player ``p`` in game ``s``.

        Args:
            s: the game the action is taken in (not used by this action).
            p: the player taking the action; its ``can_hit`` flag is cleared.
        """
        p.can_hit = False

In [293]:
def croupier_policy_initialization(state):
    """Return the croupier's fixed rule for ``state``.

    ``state`` is indexed as (player_sum, player_ace, croupier_sum,
    croupier_ace); only the croupier's sum at index 2 is consulted.
    The result is the Hit class while that sum is below 17, otherwise Stick.
    """
    croupier_sum = state[2]
    return Hit if croupier_sum < 17 else Stick


def random_policy_initialization(state):
    """Return Hit or Stick chosen uniformly at random; ``state`` is ignored."""
    available_actions = [Hit, Stick]
    return random.choice(available_actions)


class Policy:
    """Tabular policy: one action for every point of a game's state space.

    Attributes:
        columns: the four state column names followed by ``'Action'``.
        p: DataFrame with one row per state; the ``'Action'`` column holds
            whatever ``policy_initialization`` returned for that state.
    """

    def __init__(self, s: "State", policy_initialization):
        """Enumerate the state space of ``s`` and assign an action to each state.

        Args:
            s: object whose ``get_state_space()`` returns one iterable of
                values per state dimension, in the order player_sum,
                player_ace, croupier_sum, croupier_ace.
            policy_initialization: callable taking a state tuple and
                returning the action to store for it.
        """
        self.columns = ['player_sum', 'player_ace', 'croupier_sum', 'croupier_ace', 'Action']
        # Cartesian product of the per-dimension value ranges = every state.
        state_action_pairs = [
            [*state, policy_initialization(state)]
            for state in itertools.product(*s.get_state_space())
        ]
        self.p = pd.DataFrame(state_action_pairs, columns=self.columns)

    def get_action(self, state):
        """Return the action stored for ``state``.

        Args:
            state: sequence indexed as (player_sum, player_ace,
                croupier_sum, croupier_ace).

        Raises:
            LookupError: if the table does not contain exactly one row for
                ``state``.  This is an explicit check rather than an
                ``assert`` so it still runs under ``python -O``.
        """
        table = self.p
        mask = (
            (table['player_sum'] == state[0])
            & (table['player_ace'] == state[1])
            & (table['croupier_sum'] == state[2])
            & (table['croupier_ace'] == state[3])
        )
        matching_actions = table.loc[mask, 'Action']
        if len(matching_actions) != 1:
            raise LookupError(
                f"expected exactly one policy row for state {tuple(state)}, "
                f"found {len(matching_actions)}"
            )
        return matching_actions.values[0]


class State:
    """One game of a simplified blackjack-style duel between a player and a croupier.

    Attributes:
        player: Player initialised with ``random_policy_initialization``.
        croupier: Player initialised with ``croupier_policy_initialization``.
    """

    def __init__(self):
        """Create both participants and deal the opening cards."""
        self.player = Player(self, random_policy_initialization)
        self.croupier = Player(self, croupier_policy_initialization)
        # Opening deal: two cards for the player, one for the croupier.
        self.player.hit()
        self.player.hit()
        self.croupier.hit()

    def get_state(self):
        """Return (player_sum, player_ace, croupier_sum, croupier_ace)."""
        return (*self.player.get_state(), *self.croupier.get_state())

    def get_state_space(self):
        """Return the value range of every state dimension, player first.

        The ranges are taken from a fresh ``Cards()`` instead of
        ``self.player`` / ``self.croupier``: ``Policy`` calls this method
        while ``__init__`` is still constructing those players, so the
        attributes do not exist yet at that point.
        """
        state_space = []
        for _ in range(2):  # player dimensions, then croupier dimensions
            state_space.extend(Cards().get_state_space())
        return state_space

    def step(self):
        """Let whoever is due to act take one action.

        The player acts while it may still hit and its sum is below
        ``Cards.limit``; after that the croupier acts while it may still hit.

        Returns:
            True if an action was applied, False if nobody can act any more.
        """
        if self.player.can_hit and self.player.cards.sum < Cards.limit:
            current_player = self.player
        elif self.croupier.can_hit:
            current_player = self.croupier
        else:
            current_player = None

        if current_player is None:
            return False
        # The policy stores action classes; ``apply`` is called on the class.
        current_player.get_action(self.get_state()).apply(self, current_player)
        return True

    def get_reward(self):
        """Return the outcome from the player's point of view.

        Returns:
            1 if the player wins, -1 if the croupier wins, 0 on a draw
            (equal sums, or both sums above ``Cards.limit``).
        """
        player_sum = self.player.cards.sum
        croupier_sum = self.croupier.cards.sum
        player_bust = player_sum > Cards.limit
        croupier_bust = croupier_sum > Cards.limit

        if player_bust:
            return 0 if croupier_bust else -1
        if croupier_bust:
            return 1
        if croupier_sum == player_sum:
            return 0
        return -1 if croupier_sum > player_sum else 1
        
class Player:
    """A participant in the game: a hand of cards plus the policy that plays it.

    Attributes:
        cards: the participant's ``Cards`` hand.
        policy: ``Policy`` table consulted by ``get_action``.
        can_hit: True until the participant has chosen to stop drawing.
    """

    # The original signature was ``(self, State: s, ...)``: parameter name and
    # annotation were swapped, so the body's ``s`` silently resolved to a
    # global.  The annotation is a string because State is defined elsewhere.
    def __init__(self, s: "State", policy_initialization):
        """Create an empty hand and build the policy over the state space of ``s``.

        Args:
            s: the game this participant belongs to; passed to ``Policy``.
            policy_initialization: callable mapping a state tuple to the
                initial action for that state.
        """
        self.cards = Cards()
        self.policy = Policy(s, policy_initialization)
        self.can_hit = True

    def get_state(self):
        """Return this participant's part of the state, as reported by the hand."""
        return self.cards.get_state()

    def get_action(self, state):
        """Return the action this participant's policy stores for ``state``."""
        return self.policy.get_action(state)

    def hit(self):
        """Draw one more card into the hand."""
        self.cards.add_card()

    def get_state_space(self):
        """Return the value ranges of this participant's state dimensions."""
        return self.cards.get_state_space()


class Cards:
    """A hand of cards with a running sum and a usable-ace flag.

    Class attributes:
        card_values: card name -> list of values the card can count as.
        possible_cards: card names a draw is chosen from.
        limit: highest sum that is not a bust.
    """

    card_values = {
        "Ace": [1, 11],
        **{str(pips): [pips] for pips in range(2, 11)},
        **{face: [10] for face in ("J", "Q", "K")},
    }
    possible_cards = list(card_values)
    limit = 21

    def __init__(self):
        """Start with an empty hand."""
        self.usable_ace = False
        self.sum = 0
        self.cards = []

    def add_card(self):
        """Draw one random card and update the sum and usable-ace flag."""
        drawn = random.choice(self.possible_cards)
        self.cards.append(drawn)

        values = self.card_values
        if drawn != "Ace":
            self.sum += values[drawn][0]
        elif self.sum + values["Ace"][1] <= self.limit:
            # The ace fits at its high value, so count it that way for now.
            self.sum += values["Ace"][1]
            self.usable_ace = True
        else:
            self.sum += values["Ace"][0]

        # Over the limit while holding a high ace: re-count that ace as low.
        if self.usable_ace and self.sum > self.limit:
            self.usable_ace = False
            self.sum += values["Ace"][0] - values["Ace"][1]

        # Every bust is collapsed onto the single value limit + 1.
        self.sum = min(self.sum, self.limit + 1)

    def get_state(self):
        """Return (sum, usable_ace as 0 or 1)."""
        ace_flag = int(self.usable_ace)
        return self.sum, ace_flag

    def get_state_space(self):
        """Return the value ranges of the two state dimensions: sums, then ace flags."""
        sums = range(self.limit + 2)
        ace_flags = [0, 1]
        return [sums, ace_flags]

In [284]:
# Start a new game; State.__init__ deals two cards to the player and one to
# the croupier.
s = State()

In [223]:
# Policy.__init__ takes only the game (for its state space) and an
# initialisation callable; the stale extra ``[Hit]`` argument raised TypeError.
p = Policy(s, random_policy_initialization)

In [311]:
# Play one full game, printing the state
# (player_sum, player_ace, croupier_sum, croupier_ace) after the opening deal
# and after every step.
s = State()
print(s.get_state())
# step() returns False once neither the player nor the croupier can act.
while s.step():
    print(s.get_state())
# Bare expression so the notebook displays the final reward (-1, 0 or 1).
s.get_reward()

(20, 0, 8, 0)
(20, 0, 8, 0)
(20, 0, 12, 0)
(20, 0, 14, 0)
(20, 0, 22, 0)
(20, 0, 22, 0)


1

In [286]:
# Show the current state of the game held in ``s`` and, via the trailing bare
# expression, its reward from the player's point of view.
print(s.get_state())
s.get_reward()

(21, 1, 17, 0)


1