In [95]:
import numpy as np
import pandas as pd
import random
import itertools
from copy import copy

In [185]:
class Action:
    
    def __init__(self):
        pass
    
class Hit(Action):
    
    def __init__(self):
        Action.__init__(self)
    
    @staticmethod
    def apply(e: Env, p: Player):
        p.hit()


class Stick(Action):
    def __init__():
        super(Hit, self).__init__()

    @staticmethod
    def apply(e: Env, p: Player):
        p.can_hit = False


In [294]:
def croupier_policy_initialization(state):
    if state[0] < 17:
        return Hit
    else:
        return Stick


def random_policy_initialization(state):
    return random.choice([Hit, Stick])

def qvalue_initialization(state):
    return 0


class Policy:
    
    def __init__(self, state_space, policy_initialization):
        self.columns = ['player_sum', 'player_ace', 'croupier_sum', 'croupier_ace', 'Action']
        state_action_pairs = []
        for state in itertools.product(*state_space):
            action = policy_initialization(state)
            state_with_action = [*state, action]
            state_action_pairs.append(state_with_action)
        self.p = pd.DataFrame(state_action_pairs, columns=self.columns)
        
    def get_action(self, state):
        current_state_policy = self.p[(self.p.player_sum == state[0]) & (self.p.player_ace == state[1]) &
                                      (self.p.croupier_sum == state[2]) & (self.p.croupier_ace == state[3])]
        assert current_state_policy.shape[0] == 1, current_state_policy
        return current_state_policy.Action.values[0] 


class Env():
    
    def __init__(self):
        self.player = Player(self, num_observable_cards=0)
        self.croupier = Player(self, num_observable_cards=1)
        self.players = [self.player, self.croupier]
        
    def create_policies(self):

        player_observed_state_space = self.get_state_space(self.player)
        random_policy = Policy(player_observed_state_space, random_policy_initialization)
        self.player.set_policy(random_policy)

        croupier_observed_state_space = self.get_state_space(self.player)
        croupier_policy = Policy(croupier_observed_state_space, croupier_policy_initialization)
        self.croupier.set_policy(croupier_policy)

    def create_value_func(self):

        player_observed_state_space = self.get_state_space(self.player)
        q = qValue(player_observed_state_space,qvalue_initialization)
        self.player.set_q(q)
        
    def get_state(self, observing_player=None):
        if observing_player:
            player_state = observing_player.get_state()
            observed_state = player_state
        else:
            observed_state = []
        for p in self.players:
            if not p == observing_player:
                observed_state.extend(p.get_state(other_player=observing_player))
        return observed_state

    def get_state_space(self, observing_player):
        player_state_space = observing_player.get_state_space()
        state_space = player_state_space
        for p in self.players:
            if not p == observing_player:
                state_space.extend(p.get_state_space(other_player=observing_player))
        return state_space

    def init_round(self):
        Hit.apply(e=self, p=self.player)
        Hit.apply(e=self, p=self.player)
        Hit.apply(e=self, p=self.croupier)

    def step(self):
        if self.player.can_hit:
            current_player = self.player
        elif self.croupier.can_hit:
            current_player = self.croupier
        else:
            current_player = None
        
        if current_player:
            observed_state = self.get_state(current_player)
            action = current_player.get_action(observed_state)
            action.apply(e=self, p=current_player)
            current_player.memorize(observed_state, action)

        r = self.get_reward()
        if r:
            self.player.give_reward(r)
        
        return not self.is_round_over()

    def is_round_over(self):
        return all([not p.can_hit for p in self.players])
        
    def get_reward(self):
        if not self.is_round_over():
            return 0
        
        if self.player.cards.sum > Cards.limit:
            if self.croupier.cards.sum > Cards.limit:
                return 0
            else:
                return -1
        elif self.croupier.cards.sum > Cards.limit:
            return 1
        else:
            if self.croupier.cards.sum == self.player.cards.sum:
                return 0
            elif self.croupier.cards.sum > self.player.cards.sum:
                return -1
            else:
                return 1
        
class Player():

    rho = 1
    
    def __init__(self, s: Env, num_observable_cards):
        self.cards = Cards()
        self.policy = None
        self.q = None
        self.memory = []
        self.can_hit = True
        self.num_observable_cards = num_observable_cards

    def set_policy(self, p: Policy):
        self.policy = p

    def set_q(self, q: qValue):
        self.q = q
    
    def get_state(self, other_player=None):
        if not other_player:
            return self.cards.get_state()
        else:
            return self.cards.get_state(self.num_observable_cards)
        
    def get_action(self, state):
        return self.policy.get_action(state)
    
    def hit(self):
        self.cards.add_card()
        if self.cards.sum > self.cards.limit:
            self.can_hit = False

    def memorize(self, state, action):
        self.memory.append([state, action, 0])
        
    def get_state_space(self, other_player=None):
        if not other_player:
            return self.cards.get_state_space(np.inf)
        else:
            return self.cards.get_state_space(self.num_observable_cards)

    def give_reward(self, r):
        for ind, m in enumerate(self.memory):
            m[2] += np.power(self.rho, (len(self.memory) - 1 - ind)) * r

    
class Cards:
    card_values = {"Ace": [1, 11], "2": [2], "3": [3], "4": [4], "5": [5],
                   "6": [6], "7": [7], "8": [8], "9": [9], "10": [10],
                   "J": [10], "Q": [10], "K": [10]}
    possible_cards = list(card_values.keys())
    limit = 21
    
    def __init__(self):
        self.cards = []
        self.states = [[0, 0]]
        self.sum = 0
        self.usable_ace = False

    def add_card(self):
        new_card = random.choice(self.possible_cards)
        self.cards.append(new_card)
        if new_card == "Ace":
            if self.sum + self.card_values["Ace"][1] <= self.limit:
                self.sum += self.card_values["Ace"][1]
                self.usable_ace = True
            else:
                self.sum += self.card_values["Ace"][0]
        else:
            self.sum += self.card_values[new_card][0]
        
        if self.sum > self.limit and self.usable_ace:
            self.usable_ace = False
            self.sum -= self.card_values["Ace"][1]
            self.sum += self.card_values["Ace"][0]
            
        if self.sum > self.limit:
            self.sum = self.limit + 1
        self.states.append([self.sum, int(self.usable_ace)])
    
    def get_state(self, num_observable_cards=-1):
        return copy(self.states[min(num_observable_cards, len(self.states) - 1)])
    
    def get_state_space(self, num_observable_cards):
        if num_observable_cards == 0:
            return [[None], [None]]
        else:
            return [range(min(self.limit + 2, num_observable_cards * 11 + 1)), [0, 1]]
        


In [293]:
class qValue:
    def __init__(self, state_space, value_initialization):
        self.columns = ['player_sum', 'player_ace', 'croupier_sum', 'croupier_ace', 'action', 'num_exp', 'value']
        state_action_value_pairs = []
        for state_action in itertools.product(*(state_space + [[Hit, Stick]])):
            value = value_initialization(state_action)
            state_action_value = [*(state_action +(0, value))]
            state_action_value_pairs.append(state_action_value)
        self.q = pd.DataFrame(state_action_value_pairs, columns=self.columns)

    def update(self, memory):
        for state, action, reward in memory:
            memorized_state_action = self.q[(self.q.player_sum == state[0]) & (self.q.player_ace == state[1]) &
                                            (self.q.croupier_sum == state[2]) & (self.q.croupier_ace == state[3]) &
                                            (self.q.action == action)]
            assert memorized_state_action.shape[0] == 1, memorized_state_action
            num_exp = memorized_state_action.num_exp.values[0]
            value = memorized_state_action.value.values[0]
            new_num_exp = num_exp + 1
            new_value = (value * num_exp + reward) / float(new_num_exp)
            
            

In [70]:
e = Env()
e.create_policies()

In [68]:
Hit().apply(e, e.player)

In [69]:
e.get_state(e.player)

[22, 0, 0, 0]

In [295]:
e = Env()
e.create_policies()
e.create_value_func()
e.init_round()
print(e.get_state())
while e.step():
    print(e.get_state())
print(e.get_state())
e.player.memory

[11, 0, 5, 0]
[18, 0, 5, 0]
[18, 0, 5, 0]
[18, 0, 12, 0]
[18, 0, 13, 0]
[18, 0, 19, 0]
[18, 0, 19, 0]


[[[11, 0, 5, 0], __main__.Hit, -1], [[18, 0, 5, 0], __main__.Stick, -1]]

In [300]:
e.croupier.cards.get_state(1)
e.get_state(e.player)
e.player.get_state()
e.player.cards.cards

e.croupier.cards.cards
# e.player.get_action(e.get_state(e.player)).apply(e, e.player)
q = e.player.q.p

In [308]:
a = q[(q.player_ace == 1) & (q.player_sum == 8) & (q.Action == Hit) & (q.croupier_sum == 10) & (q.croupier_ace == 0)]

In [322]:
a.shape[0] == 1
a.iloc[0].value = 10

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
