In [1]:
import numpy as np
from pandas import DataFrame
import matplotlib
import matplotlib.pyplot as plt
import gym
import blackjack as bj  #no joke

In [2]:
env = bj.BlackjackEnv()
boost = True
Verbose = False

In [70]:
def q_learning_action(player_hand, usable_ace, dealer_first_card, Q, epsilon):
    #epsilon-greedy exploration
    if np.random.random() < epsilon:
        return env.action_space.sample()
    
    #Q-Learning optimization
    return np.argmax(Q[player_hand - 4, usable_ace, dealer_first_card - 1]) 
    
    
def decrease_rate(x):
    if x < 0.2:
        return 1 - 0.1 * x / 0.2
    elif x < 0.6:
        return 0.9 - 0.8 * (x - 0.2) / 0.4
    elif 0.6 < x < 0.8:
        return 0.1 - 0.1 * (x - 0.6) / 0.2
    return 0

# x = np.linspace(0,1,100)
# y = [decrease_rate(el) for el in x]
# plt.plot(x, y)
# plt.show()
    


def main(lr=0.1, gamma=0.8, epsilon=0.2, train=False, Q=None, Verbose=False):
    avg_win = 0
    avg_tie = 0
    nb_games = 100000


    if train:
        #states: 4 to 21 (all the player hands possible); whether the player has a soft ace or not; and dealer's first card out (1 to 10)
        #actions: 0 (stand) or 1 (hit)
        Q = np.zeros((18, 2, 10, 2))
        
        if boost:
            ###########################################
            #IS IT CHEATING TO TELL THIS TO THE AGENT?#
            ###########################################
            #we want to hit if we have 11 or less
            Q[:8, :, :] = [0, 1]
            #we want to stand if we have 21
            Q[17, :, :] = [1, 0]


    #for loop to run nb_games blackjack games
    for i_game in range(nb_games):
        player_hand, dealer_first_card, usable_ace = env.reset()
        player_hand = bj.sum_hand(player_hand)
        usable_ace = (usable_ace) * 1

        #theoretically ,there cannot be more than 11 passes (4*aces, 4*two, 3*three)
        for t in range(11):
            if i_game == 0 and t == 0:
                action = env.action_space.sample()
            ###########################################
            #IS IT CHEATING TO TELL THIS TO THE AGENT?#
            ###########################################
            elif player_hand <= 11:
                action = 1
            else:
                #epsilon = decrease_explo_rate(i_game / nb_games)
                action = q_learning_action(player_hand, usable_ace, dealer_first_card, Q, epsilon)
            observation, reward, done, info = env.step(action)
            new_player_hand, dealer_first_card, usable_ace = observation
            new_player_hand = bj.sum_hand(new_player_hand)
            usable_ace = (usable_ace) * 1

            if train:
                lr = 0.1 * decrease_explo_rate(i_game / nb_games)
                #q_learning update
                Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] *= 1 - lr
                #Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] += lr * reward   
                
                ############
                #PROBLEM???#
                ############
                if new_player_hand > 21:
                    Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] += lr * reward
                    #Q[player_hand - 4, usable_ace, dealer_first_card - 1, 1 - action] += lr * (-reward + gamma)
                else:
                    Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] += lr * (reward + gamma * np.argmax(Q[new_player_hand - 4, usable_ace, dealer_first_card - 1]))
                        

            player_hand = new_player_hand

            
            if Verbose:
                print("Pass {} - Player's score:".format(t), player_hand)
                if player_hand > 21:
                    print("Player has been busted.")


            if done:
                if Verbose:
                    dealer_hand = bj.score(env.dealer)
                    print("Dealer's score:", dealer_hand)
                    if dealer_hand > 21:
                        print("Dealer has been busted.")
                
                #if the player won the game
                if reward == 1.:
                    avg_win += 1
                    if Verbose:
                        print("GAME WON")
                        print()
                #if there has been a draw
                elif reward == 0:
                    avg_tie += 1
                    if Verbose:
                        print("TIE")
                        print()
                #if the player lost the game
                else:
                    if Verbose:
                        print("GAME LOST")
                        print()
                break
                
                
        epsilon *= 0.99
        lr *= 0.99


    if Verbose:
        print("Average winning score with q-learning:", 100 * avg_win / nb_games, "%")
        print("Ties:", 100 * avg_tie / nb_games, "%  ||  Losses:", 100 * (1 - (avg_win + avg_tie) / nb_games), "%")

    env.close()

    if train:
        return Q
    #percentage of winning games
    return round(100 * avg_win / nb_games, 2), round(100 * avg_tie / nb_games, 2)

In [None]:
Q_table = main(train=True)
print(main(train=False, Q=Q_table))

# To print Normal Play Strategy score

In [5]:
#Building Normal Play Q-table
Q_normal_play = np.zeros((18, 2, 10, 2))
Q_normal_play[:8, :, :] = [0, 1]
Q_normal_play[8, 0, 0:3] = [0, 1]
Q_normal_play[8, 0, 3:6] = [1, 0]
Q_normal_play[8, 0, 6:] = [0, 1]
Q_normal_play[9:13, 0, 0] = [0, 1]
Q_normal_play[9:13, 0, 1:6] = [1, 0]
Q_normal_play[9:13, 0, 6:] = [0, 1]
Q_normal_play[13:, 0, :] = [1, 0]
Q_normal_play[8, 1, :] = [0, 1]
Q_normal_play[9:14, 1, :] = [0, 1]
Q_normal_play[14, 1, 0] = [0, 1]
Q_normal_play[14, 1, 1:8] = [1, 0]
Q_normal_play[14, 1, 8:] = [0, 1]
Q_normal_play[15:, 1, :] = [1, 0]

# list_i = [i for i in range(1, 11)]
# list_j = [i for i in range(4, 22)]
# print("Hard Ace case")
# print(DataFrame(Q_normal_play[:, 0, :, 0], list_j, list_i))
# print("Soft Ace case")
# print(DataFrame(Q_normal_play[:, 1, :, 0], list_j, list_i))


print(main(train=False, Q=Q_normal_play))


err = 0
for i in range(18):
    for j in range(2):
        for k in range(10):
            if np.argmax(Q_table[i, j, k]) != np.argmax(Q_normal_play[i, j, k]):
                err += 1
err /= 3.6
print(str(err) + "%")

(36.44, 8.45)
5.833333333333333%


# To print the Q-table

In [36]:
cards = [i for i in range (2, 11)]
cards.append(1)

dealer_first_cards = [str(i) + "" for i in cards]
dealer_first_cards[-2] += " or any face"
dealer_first_cards[-1] = "ace"
player_cards = [str(i) for i in range(4, 21)]


combinations = []
for i in range(4, 21):
    lst = []
    for k in cards:
        pair = []
        if np.max(Q_table[i - 4, 0, k - 1]) - 0.5 > 0.:
            pair.append("HIT")
        else:
            pair.append("STAND")
            
        if np.max(Q_table[i - 4, 1, k - 1]) - 0.5 > 0.:
            pair.append("HIT")
        else:
            pair.append("STAND")
        lst.append(pair)
    combinations.append(lst)

combinations = np.array(combinations)
print("With a soft ace")
print(DataFrame(np.array(combinations[:, :, 0]), player_cards, dealer_first_cards))  #begins at 13
print("With a hard ace")
print(DataFrame(np.array(combinations[:, :, 1]), player_cards, dealer_first_cards))  #begins at 12

With a soft ace
        2      3      4      5      6      7      8      9 10 or any face  \
4     HIT    HIT    HIT  STAND  STAND    HIT    HIT    HIT            HIT   
5   STAND  STAND    HIT  STAND  STAND  STAND    HIT    HIT            HIT   
6   STAND    HIT  STAND  STAND    HIT  STAND    HIT  STAND            HIT   
7   STAND  STAND    HIT  STAND  STAND    HIT  STAND    HIT            HIT   
8   STAND  STAND  STAND  STAND  STAND  STAND    HIT    HIT            HIT   
9   STAND  STAND  STAND  STAND  STAND  STAND  STAND    HIT          STAND   
10  STAND  STAND  STAND  STAND  STAND  STAND  STAND  STAND          STAND   
11  STAND  STAND  STAND  STAND  STAND  STAND  STAND  STAND          STAND   
12  STAND  STAND  STAND  STAND  STAND  STAND  STAND  STAND          STAND   
13  STAND  STAND  STAND  STAND  STAND  STAND  STAND  STAND          STAND   
14  STAND  STAND  STAND  STAND  STAND  STAND  STAND  STAND          STAND   
15  STAND  STAND  STAND  STAND  STAND  STAND  STAND  STAND  

# To run several batches with different learning rate / gamma / epsilon

In [7]:
lr_list = [1, 0.5, 0.1, 1e-3, 1e-5, 1e-10, 0]
gamma_list = [1, 0.8, 0.5, 0.3, 0.1, 1e-3, 0]
epsilon_list = [1, 0.5, 0.3, 0.2, 0.1, 1e-3, 0]         #ADD A DECAYING EXPLORATION FACTOR

results = np.zeros((7, 7, 7))

for i in range(7):
    for j in range(7):
        print(j)
        for k in range(7):
            results[i, j, k] = main(train=True, lr=lr_list[i], gamma=gamma_list[j], epsilon=epsilon_list[k])[0]
print(results)
indices = np.argmax(results)
print(indices)
print("Average winning score with q-learning:", results[indices], "%")

0


KeyboardInterrupt: 