In [156]:
import os
import sys
import pickle as pkl

sys.path.append(os.path.abspath(".."))

from optimal_policy_training.compute_transition_probabilities import *
from utility_functions.utility_functions import *

# Stake per hand in bankroll units: ValueIteration.get_reward multiplies every
# payoff by this amount before applying the utility function.
MIN_BET = 20

In [7]:
# Load the player and dealer transition tables from pickle files in the
# current working directory.
with open('player_transitions.pkl', 'rb') as player_file:
    player_transitions = pkl.load(player_file)

with open('dealer_transitions.pkl', 'rb') as dealer_file:
    dealer_transitions = pkl.load(dealer_file)

In [173]:
class ValueIteration:
    """Choose blackjack hit/stand actions by expected change in utility.

    Transition tables are supplied by the caller:

    * ``player_transitions[state][action]`` is a list of ``(next_state, prob)``
      pairs; an empty list marks the action as unavailable in that state.
      Action ``1`` is hit and action ``0`` is stand. The stand entries lead to
      the states from which the dealer starts playing.
    * ``dealer_transitions[state]`` is a list of ``(next_state, prob)`` pairs;
      an empty list means the hand is settled at that state.

    ``utility_function(initial_bankroll, bankroll)`` maps a bankroll to a
    utility; rewards are the difference in utility before and after a hand.
    """

    def __init__(self, player_transitions, dealer_transitions, utility_function, gamma = 1.0):
        """Store the transition tables, the utility function and the discount ``gamma``."""
        self.player_transitions = player_transitions
        self.dealer_transitions = dealer_transitions
        self.gamma = gamma
        self.utility_function = utility_function

    def get_state_actions(self, state):
        """Return the actions available in ``state``.

        A bust player has no actions. Otherwise an action is available when its
        transition list is non-empty.
        """
        if state.is_player_bust():
            return []

        return [
            action
            for action, transitions in self.player_transitions[state].items()
            if transitions != []
        ]

    def is_terminal_state(self, state):
        """Return True when ``state`` has no available actions or is the sentinel state."""
        return self.get_state_actions(state) == [] or state == State(((-1,),(0,)))

    def get_reward(self, payoff, initial_bankroll, current_bankroll):
        """Return the utility gained by winning ``payoff`` bets of size ``MIN_BET``.

        ``payoff`` is expressed in bets (e.g. ``1``, ``0``, ``-1``); the reward is
        the utility of the bankroll after the hand minus the utility before it.
        """
        original_utility = self.utility_function(initial_bankroll, current_bankroll)
        new_utility = self.utility_function(initial_bankroll, current_bankroll + (payoff*MIN_BET))

        return new_utility - original_utility

    @staticmethod
    def _hand_payoff(state):
        """Return the payoff in bets (1 win, 0 push, -1 loss) of a settled ``state``."""
        # Busts are resolved first, with the player's bust taking precedence.
        if state.is_player_bust():
            return -1
        if state.is_dealer_bust():
            return 1

        player_blackjack = state.player_has_blackjack()
        dealer_blackjack = state.dealer_has_blackjack()
        if player_blackjack and dealer_blackjack:
            return 0
        if player_blackjack:
            return 1
        if dealer_blackjack:
            return -1

        player_score = state.get_player_value()
        dealer_score = state.get_dealer_value()
        if player_score > dealer_score:
            return 1
        if player_score < dealer_score:
            return -1
        return 0

    def process_reward(self, state, initial_bankroll, current_bankroll):
        """Return the reward (utility change) for the settled hand in ``state``."""
        return self.get_reward(self._hand_payoff(state), initial_bankroll, current_bankroll)

    def decide_best_action(self, card_state, initial_bankroll, current_banrkoll):
        """Return the available action with the highest expected reward.

        Returns ``0`` (stand) without evaluating anything when the state is
        terminal or when standing is the only available action. The instance's
        ``gamma`` is used as the discount for the evaluation.
        """
        actions = self.get_state_actions(card_state)

        if self.is_terminal_state(card_state) or actions == [0]:
            print('stuck in terminal')
            return 0

        # Pass self.gamma explicitly: evaluate_action's own default is 1.0, so
        # omitting it would silently ignore the discount given to __init__.
        action_values = {
            action: self.evaluate_action(
                card_state, action, initial_bankroll, current_banrkoll, self.gamma
            )
            for action in actions
        }

        return max(action_values, key=action_values.get)

    def evaluate_dealer_state(self, state, initial_bankroll, current_bankroll):
        """Return the expected reward once the dealer plays out from ``state``.

        States missing from ``dealer_transitions`` contribute ``0.0``.
        """
        if state not in self.dealer_transitions:
            return 0.0

        if state.is_dealer_bust():
            return self.process_reward(state, initial_bankroll, current_bankroll)

        dealer_transitions = self.dealer_transitions[state]

        # No further dealer draws: the hand is settled here.
        if dealer_transitions == []:
            return self.process_reward(state, initial_bankroll, current_bankroll)

        dealer_value = 0.0
        for next_state, prob in dealer_transitions:
            if next_state.is_dealer_bust():
                # Settle busts directly so they count even when the bust state
                # has no entry of its own in dealer_transitions.
                reward = self.process_reward(next_state, initial_bankroll, current_bankroll)
                dealer_value += prob * reward
            else:
                dealer_value += prob * self.evaluate_dealer_state(next_state, initial_bankroll, current_bankroll)
        return dealer_value

    def evaluate_action(self, state, action, initial_bankroll, current_bankroll, gamma = 1.0):
        """Return the discounted expected reward of taking ``action`` in ``state``.

        ``action == 1`` is a hit; any other value is treated as a stand. After a
        hit that does not bust, the player is assumed to continue with the best
        *available* action in the resulting state. ``gamma`` is applied once per
        player decision and defaults to ``1.0`` regardless of ``self.gamma``.
        """
        action_value = 0.0

        if action == 1:
            for next_state, prob in self.player_transitions[state][1]:
                if next_state.is_player_bust():
                    reward = self.process_reward(next_state, initial_bankroll, current_bankroll)
                    action_value += prob * reward
                else:
                    # Only consider actions that are actually available: an
                    # unavailable action has an empty transition list and would
                    # evaluate to 0.0, beating any negative alternative.
                    action_value += prob * max(
                        (
                            self.evaluate_action(
                                next_state, next_action, initial_bankroll, current_bankroll, gamma
                            )
                            for next_action in self.get_state_actions(next_state)
                        ),
                        default=0.0,
                    )

            return gamma * action_value

        for dealer_start_state, prob in self.player_transitions[state][0]:
            dealer_value = self.evaluate_dealer_state(dealer_start_state, initial_bankroll, current_bankroll)
            action_value += prob * dealer_value

        return gamma * action_value

In [174]:
# Scratch State value for interactive experiments.
state = State(((10,3,4), (10,10,10)))
# Solver over the loaded transition tables, scoring outcomes with linear_utility
# (gamma is left at its default of 1.0).
value_iteration = ValueIteration(player_transitions, dealer_transitions, linear_utility)

In [175]:
# Card values for the shoe: 2-9 once each, 10 four times and 11 once per suit
# (presumably 10/J/Q/K and the ace), repeated for 4 suits per deck.
NUM_DECKS = 1
deck_cards = (list(range(2, 10)) + [10] * 4 + [11]) * (4 * NUM_DECKS)

starting_states = get_starting_states(deck_cards)

In [176]:
# Spot check: list the available actions for player cards (2, 2) against a dealer 2.
state = State(((2,2), (2,)))

value_iteration.get_state_actions(state)

[1]

In [179]:
# Despite its name, `values` maps each starting state to the action returned by
# decide_best_action, not to a numeric value.
values = {}

# Evaluate a 40-state slice with initial_bankroll=200 and a current bankroll of 80.
for state in starting_states[100:140]:
    print(state)
    values[state] = value_iteration.decide_best_action(state, 200, 80)

Player: [2, 11], Dealer: [-1]
stuck in terminal
Player: [2, 11], Dealer: [2]
Player: [2, 11], Dealer: [3]
Player: [2, 11], Dealer: [4]
Player: [2, 11], Dealer: [5]
Player: [2, 11], Dealer: [6]
Player: [2, 11], Dealer: [7]
Player: [2, 11], Dealer: [8]
Player: [2, 11], Dealer: [9]
Player: [2, 11], Dealer: [10]
Player: [2, 11], Dealer: [11]
Player: [3, 3], Dealer: [-1]
stuck in terminal
Player: [3, 3], Dealer: [2]
Player: [3, 3], Dealer: [3]
Player: [3, 3], Dealer: [4]
Player: [3, 3], Dealer: [5]
Player: [3, 3], Dealer: [6]
Player: [3, 3], Dealer: [7]
Player: [3, 3], Dealer: [8]
Player: [3, 3], Dealer: [9]
Player: [3, 3], Dealer: [10]
Player: [3, 3], Dealer: [11]
Player: [3, 4], Dealer: [-1]
stuck in terminal
Player: [3, 4], Dealer: [2]
Player: [3, 4], Dealer: [3]
Player: [3, 4], Dealer: [4]
Player: [3, 4], Dealer: [5]
Player: [3, 4], Dealer: [6]
Player: [3, 4], Dealer: [7]
Player: [3, 4], Dealer: [8]
Player: [3, 4], Dealer: [9]
Player: [3, 4], Dealer: [10]
Player: [3, 4], Dealer: [11]
Pl

In [182]:
# Display the chosen action for each evaluated starting state.
values

{Player: [2, 11], Dealer: [-1]: 0,
 Player: [2, 11], Dealer: [2]: 1,
 Player: [2, 11], Dealer: [3]: 1,
 Player: [2, 11], Dealer: [4]: 1,
 Player: [2, 11], Dealer: [5]: 1,
 Player: [2, 11], Dealer: [6]: 1,
 Player: [2, 11], Dealer: [7]: 1,
 Player: [2, 11], Dealer: [8]: 1,
 Player: [2, 11], Dealer: [9]: 1,
 Player: [2, 11], Dealer: [10]: 1,
 Player: [2, 11], Dealer: [11]: 1,
 Player: [3, 3], Dealer: [-1]: 0,
 Player: [3, 3], Dealer: [2]: 1,
 Player: [3, 3], Dealer: [3]: 1,
 Player: [3, 3], Dealer: [4]: 1,
 Player: [3, 3], Dealer: [5]: 1,
 Player: [3, 3], Dealer: [6]: 1,
 Player: [3, 3], Dealer: [7]: 1,
 Player: [3, 3], Dealer: [8]: 1,
 Player: [3, 3], Dealer: [9]: 1,
 Player: [3, 3], Dealer: [10]: 1,
 Player: [3, 3], Dealer: [11]: 1,
 Player: [3, 4], Dealer: [-1]: 0,
 Player: [3, 4], Dealer: [2]: 1,
 Player: [3, 4], Dealer: [3]: 1,
 Player: [3, 4], Dealer: [4]: 1,
 Player: [3, 4], Dealer: [5]: 1,
 Player: [3, 4], Dealer: [6]: 1,
 Player: [3, 4], Dealer: [7]: 1,
 Player: [3, 4], Dealer: 

In [89]:
# Display the full list of enumerated starting states (the output is long).
starting_states

[Player: [-1], Dealer: [0],
 Player: [2, 2], Dealer: [-1],
 Player: [2, 2], Dealer: [2],
 Player: [2, 2], Dealer: [3],
 Player: [2, 2], Dealer: [4],
 Player: [2, 2], Dealer: [5],
 Player: [2, 2], Dealer: [6],
 Player: [2, 2], Dealer: [7],
 Player: [2, 2], Dealer: [8],
 Player: [2, 2], Dealer: [9],
 Player: [2, 2], Dealer: [10],
 Player: [2, 2], Dealer: [11],
 Player: [2, 3], Dealer: [-1],
 Player: [2, 3], Dealer: [2],
 Player: [2, 3], Dealer: [3],
 Player: [2, 3], Dealer: [4],
 Player: [2, 3], Dealer: [5],
 Player: [2, 3], Dealer: [6],
 Player: [2, 3], Dealer: [7],
 Player: [2, 3], Dealer: [8],
 Player: [2, 3], Dealer: [9],
 Player: [2, 3], Dealer: [10],
 Player: [2, 3], Dealer: [11],
 Player: [2, 4], Dealer: [-1],
 Player: [2, 4], Dealer: [2],
 Player: [2, 4], Dealer: [3],
 Player: [2, 4], Dealer: [4],
 Player: [2, 4], Dealer: [5],
 Player: [2, 4], Dealer: [6],
 Player: [2, 4], Dealer: [7],
 Player: [2, 4], Dealer: [8],
 Player: [2, 4], Dealer: [9],
 Player: [2, 4], Dealer: [10],
 Pla

In [105]:
# Raw action keys stored for this state, before get_state_actions filters out
# actions whose transition list is empty.
state = State(((2,2), (2,)))
player_transitions[state].keys()

dict_keys([1, 0])