In [1]:
import numpy as np
import random
# from environment import environment

### Run this if it's the first time

In [4]:
qTable = np.zeros((env.numStates, len(env.actions)))

### Run this if importing from csv file

In [None]:
qTable = np.genfromtxt('qTable.csv', delimiter=',')

In [62]:
from deck import deck
from player import player

class environment():
    def __init__(self):
        self.deck = deck()
        self.player = player(500)
        self.casino = player(10000000)  # Money of casino doesn't matter
        self.minBet = 10
        self.runningCount = 0

        # Casino's first card, player score, true card count
        self.state = [-1, -1, 0]

        # Environment's parameterisation
        arr = []
        param = 0
        for pScore in range(2, 22):
            for cCard in range(1, 11):
                for possibleCount in range(-40,40):
                    arr.append([cCard, pScore, possibleCount])
        self.states = arr
        self.numStates = len(arr)
        self.actions = (0, 1)

        self.deck.shuffle()

    def firstDeal(self):
        self.player.hand = []
        self.casino.hand = []
        for i in range(2):
            self.player.draw(self.deck.deal())
            self.casino.draw(self.deck.deal())

        # Setting state
        self.state[0] = self.casino.hand[0]
        self.state[1] = self.player.score

        # Keeping a simple running count
        for x in self.player.hand:
            self.updateRunCount(x)
        self.updateRunCount(self.casino.hand[0])

        self.state[2] = self.calcTCount()


    def step(self, action):
        # 0 means draw , 1 means stand
        initState = self.state

        if action == 0:
            c = self.deck.deal()
            self.player.draw(c)

            self.updateRunCount(c)


        if self.player.score <= 21:
            # Update state only if it's less than 21
            self.state[1] = self.player.score

        self.state[2] = self.calcTCount()

        reward = self.reward(self.state)
        return self.state, reward

    def reward(self, state):
        while self.casino.score <= 16:
            self.casino.draw(self.deck.deal())

        if self.player.score > 21:
            return -1.1

        if self.isPlayerWin():
            return 1
        return -1 # Higher weightage on preventing losses

    def status(self):
        print('player: ', self.player.hand, self.player.score)
        print('casino: ', self.casino.hand, self.casino.score)
        print('player wins: ', self.player.wins,
              '| player losses: ', self.player.losses,
              '| win ratio: ', self.player.winRatio)
        print('state: ', self.state)
        print('-----------------------------------')

    def isPlayerWin(self):
        if self.casino.score > 21 and self.player.score > 21:  # If both bust, casino wins
            return False
        elif self.player.score > 21:
            return False
        elif self.casino.score > 21:
            return True
        elif self.casino.score >= self.player.score:
            return False
        else:
            return True

    def reset(self):
        if self.deck.endReached():
            self.deck = deck()
            self.deck.shuffle()
            self.firstDeal()
            self.runningCount = 0
            return self.state
        self.player.hand = []
        self.casino.hand = []
        self.state = [-1, -1, 0]
        self.firstDeal()
        return self.state

    def updateRunCount(self, card):
        if card >= 10 or card == 1:
            self.runningCount -= 1
        elif card <= 6 and card != 1:
            self.runningCount += 1

    def calcTCount(self):
        tCount = round(self.runningCount / self.deck.getDecksLeft(), 0)
        return tCount

    def parameterise(self, stateToParam):
        return self.states.index(stateToParam)


In [94]:
env = environment()

# qTable = np.zeros((env.numStates, len(env.actions)))
qTable = np.genfromtxt('qTable.csv', delimiter=',')

episodes = 10000
epsilon = 0.9 # Chance we take a random choice 
lRate = 0.7 # Learning rate
discount = 0.45 # Discount for the maximum of the next step

for episode in range(episodes):
    # Print episode status every 100 episodes
    if episode % 100 == 0: 
        print(f'Current episode: {episode}')

    state = env.reset()

    stateInd = env.parameterise(state)

    if np.random.uniform(0,1) < epsilon:
        action = env.actions[random.randint(0,1)]
    else:
        action = np.argmax(qTable[stateInd, :])

    newState, reward = env.step(action)
    newStateInd = env.parameterise(newState)
    qTable[stateInd, action] = qTable[stateInd, action] + lRate * (reward + discount * np.argmax(qTable[newStateInd, :])) # Q table formula


    # Slowly reduce the number of randomness
    epsilon -= 0.01

Current episode: 0
Current episode: 100
Current episode: 200
Current episode: 300
Current episode: 400
Current episode: 500
Current episode: 600
Current episode: 700
Current episode: 800
Current episode: 900
Current episode: 1000
Current episode: 1100
Current episode: 1200
Current episode: 1300
Current episode: 1400
Current episode: 1500
Current episode: 1600
Current episode: 1700
Current episode: 1800
Current episode: 1900
Current episode: 2000
Current episode: 2100
Current episode: 2200
Current episode: 2300
Current episode: 2400
Current episode: 2500
Current episode: 2600
Current episode: 2700
Current episode: 2800
Current episode: 2900
Current episode: 3000
Current episode: 3100
Current episode: 3200
Current episode: 3300
Current episode: 3400
Current episode: 3500
Current episode: 3600
Current episode: 3700
Current episode: 3800
Current episode: 3900
Current episode: 4000
Current episode: 4100
Current episode: 4200
Current episode: 4300
Current episode: 4400
Current episode: 4500


In [95]:
# Check populated squares in Q table
populated = 0
for x in range(len(qTable)):
    if qTable[x][0] != 0 or qTable[x][1] != 0:
        populated += 1
        print(x, qTable[x])

print(populated)

1627 [-0.385  0.   ]
1628 [-0.385  0.   ]
1630 [-0.385  0.   ]
1632 [-0.385  0.   ]
1633 [-0.7  0. ]
1634 [-0.7  -0.14]
1635 [-0.385  0.   ]
1636 [ 2.345 -0.525]
1637 [-1.785 -1.155]
1638 [-5.355 -4.76 ]
1639 [-3.71  -3.395]
1640 [-13.16 -13.23]
1641 [-5.635 -5.39 ]
1642 [-1.82 -2.45]
1643 [-1.085 -1.155]
1644 [-1.085 -0.91 ]
1645 [-0.385  0.63 ]
1646 [-1.085 -0.77 ]
1647 [-0.7  0. ]
1651 [-0.7   -0.385]
1652 [-0.7  0. ]
1653 [1.015 0.   ]
1715 [-0.385  0.   ]
1716 [1.015 0.   ]
1717 [2.73 0.  ]
1718 [-1.47  -0.035]
1719 [-0.595  1.505]
1720 [-0.7   12.005]
1721 [-1.61  4.06]
1722 [-0.42 -2.31]
1723 [-0.7    4.025]
1724 [2.275 0.   ]
1725 [1.015 0.   ]
1726 [-0.77 -0.77]
1727 [-1.085  0.245]
1728 [-0.385  0.   ]
1730 [-0.385  0.   ]
1734 [-0.385  0.   ]
1792 [-0.7  0. ]
1793 [-0.385  0.   ]
1794 [-0.7  0. ]
1795 [-0.385  0.   ]
1796 [-0.385  1.015]
1797 [-0.7   1.26]
1798 [ 0.245 -0.77 ]
1799 [-1.785 -0.56 ]
1800 [-0.035  4.34 ]
1801 [-1.75   5.985]
1802 [ 7.7   -0.525]
1803 [-1.4  -0.

In [115]:
# Testing the model ooh spicy
sumWRatio = 0

for _ in range(50):
    wins = 0
    total = 0

    for episode in range(100):
        done = False
        state = env.reset()

        stateInd = env.parameterise(state)

        if qTable[stateInd, 0] != 0 and qTable[stateInd, 1] != 0:
            choice = np.argmax(qTable[stateInd])
        
            nextObs, reward = env.step(choice)

            if reward == 1:
                wins += 1
            total += 1

        if total != 0:
            sumWRatio += (wins/total)


print(sumWRatio/50)


40.088876361060336


In [92]:
# Save current q table so that it can be used later
np.savetxt("qTable.csv", qTable, delimiter=",")

In [114]:
# Control - random choice
sumWRatio = 0

for _ in range(50):
    wins = 0
    total = 0

    for episode in range(100):
        done = False
        state = env.reset()

        stateInd = env.parameterise(state)

        choice = np.random.choice((0,1))
    
        nextObs, reward = env.step(choice)

        if reward == 1:
            wins += 1
        total += 1

        if total != 0:
            sumWRatio += (wins/total)


print(sumWRatio/50)


33.699262178017605


In [113]:
# Control - Hit if less than 16
sumWRatio = 0

for _ in range(50):
    wins = 0
    total = 0

    for episode in range(100):
        done = False
        state = env.reset()

        stateInd = env.parameterise(state)

        stateToTest = state
        print(state)
        if stateToTest[1] < 16:
            nextObs, reward = env.step(0)
            stateToTest = nextObs
            print('hit: ', stateToTest)

        if reward == 1:
            wins += 1
        total += 1

        if total != 0:
            sumWRatio += (wins/total)


print(sumWRatio/50)


[10, 19, -0.0]
[10, 19, -0.0]
[4, 12, 0.0]
hit:  [4, 20, 0.0]
[2, 10, 0.0]
hit:  [2, 16, 1.0]
[8, 7, 1.0]
hit:  [8, 17, 1.0]
[10, 15, 1.0]
hit:  [10, 15, 1.0]
[5, 5, 1.0]
hit:  [5, 15, 1.0]
[8, 18, 1.0]
[10, 11, 1.0]
hit:  [10, 16, 1.0]
[8, 20, 1.0]
[10, 17, 0.0]
[7, 11, 1.0]
hit:  [7, 12, 1.0]
[5, 17, 1.0]
[4, 19, 1.0]
[8, 16, 1.0]
[9, 21, 0.0]
[8, 15, 0.0]
hit:  [8, 15, 0.0]
[6, 16, 0.0]
[8, 8, 1.0]
hit:  [8, 18, 1.0]
[6, 21, 0.0]
[1, 17, 0.0]
[2, 17, 0.0]
[8, 16, 0.0]
[10, 11, 0.0]
hit:  [10, 14, 0.0]
[9, 21, -0.0]
[10, 5, 0.0]
hit:  [10, 13, 0.0]
[5, 15, 0.0]
hit:  [5, 19, 0.0]
[10, 21, -0.0]
[10, 10, 0.0]
hit:  [10, 13, 0.0]
[4, 12, 1.0]
hit:  [4, 13, 0.0]
[6, 7, 1.0]
hit:  [6, 17, 1.0]
[10, 16, 1.0]
[7, 15, 1.0]
hit:  [7, 20, 1.0]
[2, 20, 1.0]
[1, 10, 1.0]
hit:  [1, 13, 1.0]
[9, 11, 2.0]
hit:  [9, 21, 2.0]
[3, 10, 4.0]
hit:  [3, 14, 4.0]
[10, 21, 2.0]
[3, 20, 2.0]
[5, 16, 2.0]
[5, 12, 2.0]
hit:  [5, 12, 2.0]
[2, 19, 2.0]
[10, 13, 2.0]
hit:  [10, 17, 2.0]
[1, 12, 2.0]
hit:  [1, 12