In [1]:
import numpy as np
from pandas import DataFrame
import matplotlib
import matplotlib.pyplot as plt
import gym
import blackjack as bj  #no joke

In [2]:
env = bj.BlackjackEnv()
boost = True
Verbose = False

In [16]:
def q_learning_action(player_hand, usable_ace, dealer_first_card, Q, epsilon):
    #epsilon-greedy exploration
    if np.random.random() < epsilon:
        return env.action_space.sample()
    
    #Q-Learning optimization
    return np.argmax(Q[player_hand - 4, usable_ace, dealer_first_card - 1]) 
    
    
def decrease_rate(x):
    if x < 0.2:
        return 1 - 0.1 * x / 0.2
    elif x < 0.6:
        return 0.9 - 0.8 * (x - 0.2) / 0.4
    elif 0.6 < x < 0.8:
        return 0.1 - 0.1 * (x - 0.6) / 0.2
    return 0

# x = np.linspace(0,1,100)
# y = [decrease_rate(el) for el in x]
# plt.plot(x, y)
# plt.show()
    


def main(lr=0.1, gamma=0.8, epsilon=0.2, train=False, Q=None, Verbose=False):
    avg_win = 0
    avg_tie = 0
    nb_games = 100000


    if train:
        #states: 4 to 21 (all the player hands possible); whether the player has a soft ace or not; and dealer's first card out (1 to 10)
        #actions: 0 (stand) or 1 (hit)
        Q = np.zeros((18, 2, 10, 2))
        
        if boost:
            ###########################################
            #IS IT CHEATING TO TELL THIS TO THE AGENT?#
            ###########################################
            #we want to hit if we have 11 or less
            Q[:8, :, :] = [0, 1]
            #we want to stand if we have 21
            Q[17, :, :] = [1, 0]


    #for loop to run nb_games blackjack games
    for i_game in range(nb_games):
        player_hand, dealer_first_card, usable_ace = env.reset()
        player_hand = bj.sum_hand(player_hand)
        usable_ace = (usable_ace) * 1

        #theoretically ,there cannot be more than 11 passes (4*aces, 4*two, 3*three)
        for t in range(11):
            if i_game == 0 and t == 0:
                action = env.action_space.sample()
            ###########################################
            #IS IT CHEATING TO TELL THIS TO THE AGENT?#
            ###########################################
            elif player_hand <= 11:
                action = 1
            else:
                #epsilon = decrease_rate(i_game / nb_games)
                action = q_learning_action(player_hand, usable_ace, dealer_first_card, Q, epsilon)
            observation, reward, done, info = env.step(action)
            new_player_hand, dealer_first_card, usable_ace = observation
            new_player_hand = bj.sum_hand(new_player_hand)
            usable_ace = (usable_ace) * 1

            if train:
                #lr = 0.2 * decrease_rate(i_game / nb_games)
                #q_learning update
                Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] *= 1 - lr
                #Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] += lr * reward   
                
                ############
                #PROBLEM???#
                ############
                if new_player_hand > 21:
                    Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] += lr * reward
                    #Q[player_hand - 4, usable_ace, dealer_first_card - 1, 1 - action] += lr * (-reward + gamma)
                else:
                    Q[player_hand - 4, usable_ace, dealer_first_card - 1, action] += lr * (reward + gamma * np.argmax(Q[new_player_hand - 4, usable_ace, dealer_first_card - 1]))
                        

            player_hand = new_player_hand

            
            if Verbose:
                print("Pass {} - Player's score:".format(t), player_hand)
                if player_hand > 21:
                    print("Player has been busted.")


            if done:
                if Verbose:
                    dealer_hand = bj.score(env.dealer)
                    print("Dealer's score:", dealer_hand)
                    if dealer_hand > 21:
                        print("Dealer has been busted.")
                
                #if the player won the game
                if reward == 1.:
                    avg_win += 1
                    if Verbose:
                        print("GAME WON")
                        print()
                #if there has been a draw
                elif reward == 0:
                    avg_tie += 1
                    if Verbose:
                        print("TIE")
                        print()
                #if the player lost the game
                else:
                    if Verbose:
                        print("GAME LOST")
                        print()
                break
                
                
        epsilon *= 0.99
        lr *= 0.99


    if Verbose:
        print("Average winning score with q-learning:", 100 * avg_win / nb_games, "%")
        print("Ties:", 100 * avg_tie / nb_games, "%  ||  Losses:", 100 * (1 - (avg_win + avg_tie) / nb_games), "%")

    env.close()

    if train:
        return Q
    #percentage of winning games
    return round(100 * avg_win / nb_games, 2), round(100 * avg_tie / nb_games, 2)

In [20]:
Q_table = main(train=True)
print(main(train=False, Q=Q_table))

35.93


Testing different learning rates

In [19]:
lst = [1, 0.5, 0.3, 0.2, 0.1, 1e-2, 1e-3, 1e-5, 1e-10, 0]

for lr in lst:
    Q_table = main(train=True, lr=lr)
    print(main(train=False, Q=Q_table, lr=lr))

(35.23, 7.84)
(33.63, 7.72)
(36.41, 8.4)
(32.97, 7.64)
(34.29, 7.22)
(35.33, 8.19)
(35.25, 8.3)
(35.88, 8.48)
(33.84, 7.93)
(37.07, 6.8)


Testing different discount rates

In [14]:
lst = [1, 0.5, 0.3, 0.2, 0.1, 1e-2, 1e-3, 1e-5, 1e-10, 0]

for gamma in lst:
    Q_table = main(train=True, gamma=gamma)
    print(main(train=False, Q=Q_table, gamma=gamma))

(36.13, 9.53)
(36.23, 9.41)
(36.31, 9.83)
(36.32, 9.63)
(36.97, 9.47)
(37.38, 9.34)
(36.83, 9.52)
(36.9, 9.62)
(36.0, 9.31)
(36.74, 9.33)


Testing different exploration rates

In [15]:
lst = [1, 0.5, 0.3, 0.2, 0.1, 1e-2, 1e-3, 1e-5, 1e-10, 0]

for epsilon in lst:
    Q_table = main(train=True, epsilon=epsilon)
    print(main(train=False, Q=Q_table, epsilon=epsilon))

(36.66, 9.45)
(36.73, 9.32)
(36.59, 9.53)
(35.84, 9.23)
(36.76, 9.28)
(36.06, 9.12)
(36.31, 9.75)
(36.95, 9.7)
(34.98, 9.25)
(36.12, 8.99)


Testing best?

In [23]:
lst = [1, 0.5, 0.3, 0.2, 0.1, 1e-2, 1e-3, 1e-5, 1e-10, 0]

results = np.zeros((10, 10, 10))

for i, lr in enumerate(lst):
    for j, gamma in enumerate(lst):
        for k, epsilon in enumerate(lst):
            Q_table = main(train=True, lr=lr, gamma=gamma, epsilon=epsilon)
            result = main(train=False, Q=Q_table, lr=lr, gamma=gamma, epsilon=epsilon)
            results[i, j, k] = result[0]
            print("lr = " + str(lr) + " || gamma = " + str(gamma) + " || epsilon = " + str(epsilon))
            print(result)

lr = 1 || gamma = 1 || epsilon = 1
(34.39, 7.59)
lr = 1 || gamma = 1 || epsilon = 0.5
(32.89, 7.91)
lr = 1 || gamma = 1 || epsilon = 0.3
(35.16, 8.04)
lr = 1 || gamma = 1 || epsilon = 0.2
(31.35, 7.28)
lr = 1 || gamma = 1 || epsilon = 0.1
(35.1, 8.36)
lr = 1 || gamma = 1 || epsilon = 0.01
(32.83, 8.24)
lr = 1 || gamma = 1 || epsilon = 0.001
(31.52, 7.89)
lr = 1 || gamma = 1 || epsilon = 1e-05
(31.53, 6.91)
lr = 1 || gamma = 1 || epsilon = 1e-10
(33.81, 7.82)
lr = 1 || gamma = 1 || epsilon = 0
(33.28, 7.92)
lr = 1 || gamma = 0.5 || epsilon = 1
(35.13, 7.75)
lr = 1 || gamma = 0.5 || epsilon = 0.5
(31.69, 6.97)
lr = 1 || gamma = 0.5 || epsilon = 0.3
(36.04, 7.85)
lr = 1 || gamma = 0.5 || epsilon = 0.2
(33.97, 7.63)
lr = 1 || gamma = 0.5 || epsilon = 0.1
(35.09, 9.04)
lr = 1 || gamma = 0.5 || epsilon = 0.01
(34.17, 7.61)
lr = 1 || gamma = 0.5 || epsilon = 0.001
(35.32, 8.83)
lr = 1 || gamma = 0.5 || epsilon = 1e-05
(35.35, 9.16)
lr = 1 || gamma = 0.5 || epsilon = 1e-10
(32.76, 7.17)
lr = 1

KeyboardInterrupt: 

# To print the Q-table

In [29]:
cards = [i for i in range (2, 11)]
cards.append(1)

dealer_first_cards = [str(i) + "" for i in cards]
dealer_first_cards[-2] += " or any face"
dealer_first_cards[-1] = "ace"
player_cards = [str(i) for i in range(4, 22)]


combinations = []
for i in range(4, 22):
    lst = []
    for k in cards:
        pair = []
        if np.argmax(Q_table[i - 4, 0, k - 1]) == 0:
            pair.append("STAND")
        else:
            pair.append("HIT")
            
        if np.argmax(Q_table[i - 4, 1, k - 1]) == 0:
            pair.append("STAND")
        else:
            pair.append("HIT")
        lst.append(pair)
    combinations.append(lst)

combinations = np.array(combinations)
print("With a soft ace")
print(DataFrame(np.array(combinations[:, :, 0]), player_cards, dealer_first_cards))  #begins at 13
print("With a hard ace")
print(DataFrame(np.array(combinations[:, :, 1]), player_cards, dealer_first_cards))  #begins at 12

With a soft ace
        2      3      4      5      6      7      8      9 10 or any face  \
4     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
5     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
6     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
7     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
8     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
9     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
10    HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
11    HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
12    HIT    HIT  STAND  STAND    HIT    HIT    HIT    HIT          STAND   
13    HIT  STAND  STAND    HIT    HIT    HIT    HIT    HIT            HIT   
14    HIT    HIT    HIT  STAND    HIT    HIT    HIT  STAND            HIT   
15  STAND    HIT    HIT    HIT  STAND    HIT    HIT    HIT  

# To print Normal Play Strategy score

In [25]:
#Building Normal Play Q-table
Q_normal_play = np.zeros((18, 2, 10, 2))
Q_normal_play[:8, :, :] = [0, 1]
Q_normal_play[8, 0, 0:3] = [0, 1]
Q_normal_play[8, 0, 3:6] = [1, 0]
Q_normal_play[8, 0, 6:] = [0, 1]
Q_normal_play[9:13, 0, 0] = [0, 1]
Q_normal_play[9:13, 0, 1:6] = [1, 0]
Q_normal_play[9:13, 0, 6:] = [0, 1]
Q_normal_play[13:, 0, :] = [1, 0]
Q_normal_play[8, 1, :] = [0, 1]
Q_normal_play[9:14, 1, :] = [0, 1]
Q_normal_play[14, 1, 0] = [0, 1]
Q_normal_play[14, 1, 1:8] = [1, 0]
Q_normal_play[14, 1, 8:] = [0, 1]
Q_normal_play[15:, 1, :] = [1, 0]

# list_i = [i for i in range(1, 11)]
# list_j = [i for i in range(4, 22)]
# print("Hard Ace case")
# print(DataFrame(Q_normal_play[:, 0, :, 0], list_j, list_i))
# print("Soft Ace case")
# print(DataFrame(Q_normal_play[:, 1, :, 0], list_j, list_i))


print(main(train=False, Q=Q_normal_play))


err = 0
for i in range(18):
    for j in range(2):
        for k in range(10):
            if np.argmax(Q_table[i, j, k]) != np.argmax(Q_normal_play[i, j, k]):
                err += 1
err /= 3.6
print(str(err) + "%")

(38.69, 9.31)
14.722222222222221%


To print Normal Play strategy

In [30]:
combinations_normal_play = []
for i in range(4, 22):
    lst = []
    for k in cards:
        pair = []
        if np.argmax(Q_normal_play[i - 4, 0, k - 1]) == 0.:
            pair.append("STAND")
        else:
            pair.append("HIT")
            
        if np.argmax(Q_normal_play[i - 4, 1, k - 1]) == 0.:
            pair.append("STAND")
        else:
            pair.append("HIT")
        lst.append(pair)
    combinations_normal_play.append(lst)

combinations_normal_play = np.array(combinations_normal_play)
print("With a soft ace")
print(DataFrame(np.array(combinations_normal_play[:, :, 0]), player_cards, dealer_first_cards))  #begins at 13
print("With a hard ace")
print(DataFrame(np.array(combinations_normal_play[:, :, 1]), player_cards, dealer_first_cards))  #begins at 12

With a soft ace
        2      3      4      5      6      7      8      9 10 or any face  \
4     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
5     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
6     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
7     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
8     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
9     HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
10    HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
11    HIT    HIT    HIT    HIT    HIT    HIT    HIT    HIT            HIT   
12    HIT    HIT  STAND  STAND  STAND    HIT    HIT    HIT            HIT   
13  STAND  STAND  STAND  STAND  STAND    HIT    HIT    HIT            HIT   
14  STAND  STAND  STAND  STAND  STAND    HIT    HIT    HIT            HIT   
15  STAND  STAND  STAND  STAND  STAND    HIT    HIT    HIT  

# To run several batches with different learning rate / gamma / epsilon

In [7]:
lr_list = [1, 0.5, 0.1, 1e-3, 1e-5, 1e-10, 0]
gamma_list = [1, 0.8, 0.5, 0.3, 0.1, 1e-3, 0]
epsilon_list = [0.5, 0.3, 0.2, 0.1, 1e-3, 0]


results = np.zeros((7, 7, 7))

for i in range(7):
    for j in range(7):
        print("Gamma = " + str(gamma_list[j]))
        for k in range(6):
            result = main(train=True, lr=lr_list[i], gamma=gamma_list[j], epsilon=epsilon_list[k])
            print(result)
            results[i, j, k] = result

indices = np.argmax(results)
print(indices)
print("Average winning score with q-learning:", results[indices], "%")

[[[[ 0.00000000e+00  8.01971727e-01]
   [ 0.00000000e+00  8.02977696e-01]
   [ 0.00000000e+00  7.84277358e-01]
   [ 0.00000000e+00  7.90283295e-01]
   [ 0.00000000e+00  7.11715028e-01]
   [ 0.00000000e+00  7.40749231e-01]
   [ 0.00000000e+00  7.89243914e-01]
   [ 0.00000000e+00  8.02357069e-01]
   [ 0.00000000e+00  7.95358249e-01]
   [ 0.00000000e+00  8.00000055e-01]]

  [[ 0.00000000e+00  9.32199795e-01]
   [ 0.00000000e+00  9.57487428e-01]
   [ 0.00000000e+00  9.76661600e-01]
   [ 0.00000000e+00  9.61546261e-01]
   [ 0.00000000e+00  9.96905503e-01]
   [ 0.00000000e+00  8.90783158e-01]
   [ 0.00000000e+00  9.28091810e-01]
   [ 0.00000000e+00  8.00438934e-01]
   [ 0.00000000e+00  9.30496233e-01]
   [ 0.00000000e+00  9.20623516e-01]]]


 [[[ 0.00000000e+00  8.00128647e-01]
   [ 0.00000000e+00  7.96521577e-01]
   [ 0.00000000e+00  7.91848453e-01]
   [ 0.00000000e+00  7.98845298e-01]
   [ 0.00000000e+00  6.47032110e-01]
   [ 0.00000000e+00  7.98328211e-01]
   [ 0.00000000e+00  7.86452388e