In [4]:
import keras.backend as K
from Agent2 import Agent2, ReplayBuffer2
from Agent import Agent, ReplayBuffer
from PokerGame import HUPoker
from TransformVillain import transformStateVillain


In [5]:
# --- Learning hyperparameters -------------------------------------------------
lr = 0.001          # learning rate handed to the agents
gamma = 1           # discount factor handed to the agents
batch_size = 32
epsilon = 1         # NOTE: the agent cell passes literal exploration rates, not this value

# --- Network / state layout ---------------------------------------------------
input_dims = (102,)
n_actions = 12
layerSizes = (256, 128, 64, 32, 16)

# --- Game and persistence -----------------------------------------------------
stacksize = 10      # used as the starting stack of both players
fname = 'trainedModels/PokerAI10BB.h5'  # model file shared by both agents

In [6]:
# Two agents are built with identical settings and the same model file; they differ only in
# the fourth positional argument (0.2 vs 0 -- presumably epsilon, confirm against Agent2's
# signature) and in the replay-memory size.
newAgent = Agent2(
    lr, gamma, n_actions, 0.2, batch_size, input_dims, stacksize,
    fname=fname,
    handEval=True,
    layerSizes=layerSizes,
    mem_size=100_000,
)
oldAgent = Agent2(
    lr, gamma, n_actions, 0, batch_size, input_dims, stacksize,
    fname=fname,
    handEval=True,
    layerSizes=layerSizes,
    mem_size=0,
)

# Both agents start from whatever is stored under `fname`.
newAgent.load_model()
oldAgent.load_model()

In [7]:
# Both seats start with the same stack; the agent with replay memory plays hero,
# the second agent loaded from the same file plays villain.
stack_hero = stack_villain = stacksize
agentHero, agentVillain = newAgent, oldAgent

In [8]:
# Build the poker game from the two stacks and the two agents chosen in the previous cell.
game = HUPoker(stack_hero, stack_villain, agentHero, agentVillain)

In [None]:
#normal hand by hand approach in learning
import numpy as np

n_rounds = 1001
save_checkpoints = False  # set True to write the model via agentHero.save_model() every 1000 hands
score = 0                 # hero's cumulative result over all finished hands


def _with_hand_eval(obs):
    """Return a new array: `obs` followed by the game's five current win/lose statistics."""
    return np.append(obs, [game.pwinHero, game.plooseHero, game.pwinAvg, game.plooseAvg, game.stdWinAvg])


for i in range(n_rounds):

    # Deal until reset() returns a hand that is not already finished.
    done = True
    while done:
        observation, done = game.reset()

    while not done:

        error = False
        # The agent acts on the raw observation; the stored state additionally carries
        # the hand statistics as they were *before* the action was applied.
        action = agentHero.choose_action(observation)
        observation = _with_hand_eval(observation)

        try:
            # Sometimes the chosen actions lead to too many moves on a street (over 20). This
            # happens when both players engage in min-raise battles; we want to punish it severely.
            done = game.step(action)
        except IndexError:  # raised when more than 20 actions are attempted on a single street
            done = True
            error = True

        # State after the action, extended with the statistics as they are now.
        observation_ = _with_hand_eval(game.observation)

        if not done:
            reward = 0
        elif error:
            reward = -stacksize
        elif game.position == 0:
            # The blind is added back because it is posted regardless of hero's action.
            reward = 0.5 + game.stack_sb - game.starting_hero
        else:
            reward = 1 + game.stack_bb - game.starting_hero

        if done:
            # Book the hand exactly once: the reward minus the blind that was added to it.
            # (Previously this ran on every action, charging the blind once per decision.)
            blind = 0.5 if game.position == 0 else 1
            score += reward - blind

        agentHero.store_transition(observation, action, reward, observation_, done)
        observation = game.observation

        loss = agentHero.learn()

    # Average result per hand played so far (true division; i + 1 hands are finished).
    avgScore = round(score / (i + 1), 2)

    print("hands played: ", i+1, "reward round:", reward, "total score:", score, "avg score", avgScore, "epsilon", agentHero.epsilon)
    if i % 10 == 0:
        # Diagnostic: show one stored state and the Q-values the network assigns to it.
        states, actions, rewards, states_, dones = agentHero.memory.sample_buffer(2)
        q_eval = agentHero.q_eval.predict(states)
        sample = states[0]
        pos = sample[0]
        street = sample[3]
        hand = (sample[4], sample[10], sample[5], sample[11])
        pot = sample[7]
        board = sample[-15:-5]
        print("pos", pos, "street", street)
        print("hand", hand)
        print("pot", pot)
        print("board",board)
        print(q_eval[0:1])

    # Checkpointing stays disabled by default; an `if` with only a comment as its body
    # is a syntax error, so the switch is an explicit flag instead of a commented-out call.
    if save_checkpoints and i % 1000 == 0:
        agentHero.save_model()





In [9]:
# Launch an ipyparallel cluster with 8 engines for parallel self-play.
import ipyparallel as ipp
cluster = ipp.Cluster(n=8)
# Top-level await is available in a notebook cell; elsewhere call cluster.start_cluster_sync() without await.
await cluster.start_cluster()

Starting 8 engines with <class 'ipyparallel.cluster.launcher.LocalEngineSetLauncher'>


<Cluster(cluster_id='1643645003-eral', profile='default', controller=<running>, engine_sets=['1643645004'])>

In [10]:
# Connect a client to the cluster started above and wait until 8 engines are available;
# `rc` is the handle used afterwards to push variables to and run code on the engines.
rc = cluster.connect_client_sync()
rc.wait_for_engines(n=8)

100%|█████████████████████████████████████████| 8/8 [00:04<00:00,  1.71engine/s]


In [11]:
from keras import backend as K
import pickle
fname = 'trainedModels/PokerAI10BB.h5'
layerSizes = (256,128,64, 32, 16)

# gamma and n_actions are repeated here (same values as the single-process config cell)
# so this cell no longer depends on that cell having been run.
gamma = 1
n_actions = 12
nDec = 10          # number of transitions each engine collects per call to play()
lr = 0.00001
epsilon = 0        # not meaningful for agentLearn, because it is not playing against anyone. Still, all parameters must be passed to the agent.
batch_size = 16
input_dims = (102,)
stacksize = 10
nCores = 8

# agentLearn only learns: its memory holds exactly one round of transitions from all engines.
agentLearn = Agent2(gamma=gamma, n_actions=n_actions, lr=lr, batch_size=batch_size,
                    input_dims=input_dims, stacksize=stacksize, epsilon=epsilon,
                    fname=fname, layerSizes=layerSizes, mem_size=nDec*nCores)

try:
    agentLearn.load_model()
    print('Model loaded')
except Exception as exc:  # best effort: start from fresh weights, but say why loading failed
    print('no Model loaded:', repr(exc))
K.set_value(agentLearn.q_eval.optimizer.learning_rate, lr)
weightsHero = agentLearn.q_eval.get_weights()

# Agents that are shipped to the engines; their weights are overwritten inside play().
agentHero = Agent2(lr=0.1, gamma=gamma, n_actions=n_actions, epsilon=0.1, batch_size=batch_size,
                   input_dims=input_dims, stacksize=stacksize,
                   mem_size=nDec,
                   epsilon_end=0.05,
                   fname=fname, handEval=True, layerSizes=layerSizes)
# NOTE(review): unlike agentHero, the villain is built without handEval=True -- confirm this is intended.
agent_vil = Agent2(lr=0.1, gamma=gamma, n_actions=n_actions, epsilon=0, batch_size=batch_size,
                   input_dims=input_dims, stacksize=stacksize,
                   mem_size=2,
                   epsilon_end=0.05,
                   fname=fname, layerSizes=layerSizes)


env = HUPoker(stacksize, stacksize, agentHero, agent_vil)
rc[:]['agentHero']=agentHero
rc[:]['agent_vil']=agent_vil
rc[:]['weightsHero']=weightsHero

#We want a diversity of opponents in the environment to prevent overfitting, and only update the weights of the villains from time to time, to keep the environment stable enough.
#The current weights for the villains are saved in the weights file (a pickle, despite the .txt suffix).
try:
    # SECURITY: pickle.load can execute arbitrary code; only load weight files written by this notebook.
    with open("weightsVil10BB.txt", "rb") as weightsFile:   # Unpickling
        newWeights = pickle.load(weightsFile)
    # One entry per engine is expected; a mismatch would leave engines without `weightsVil`.
    if len(newWeights) != len(rc.ids):
        raise ValueError("weights file holds %d entries for %d engines" % (len(newWeights), len(rc.ids)))
    for engine, engineWeights in enumerate(newWeights):
        rc[engine]['weightsVil'] = engineWeights

except Exception as exc:  # fall back to the hero weights on every engine, but report the reason
    rc[:]['weightsVil']=weightsHero
    print("exception!", repr(exc))


# Remaining names that play() looks up in each engine's namespace.
rc[:]['ReplayBuffer2']= ReplayBuffer2
rc[:]['env']= env
rc[:]['nDec'] = nDec
rc[:]['stacksize']=stacksize

def play(instance):
    """Play hands on the executing engine until hero's buffer holds at least nDec transitions.

    Relies on names present in the engine's namespace: agentHero, agent_vil, weightsHero,
    weightsVil, env, ReplayBuffer2, nDec and stacksize.

    Args:
        instance: not used in the body; it is the element handed over when the function
            is mapped across the engines.

    Returns:
        dict with the keys 'states', 'states_', 'rewards', 'actions' and 'terminals',
        each holding the first nDec entries of the corresponding replay-buffer array.
    """
    import numpy as np

    # Bring both networks in line with the weights currently stored on this engine.
    agentHero.q_eval.set_weights(weightsHero)
    agent_vil.q_eval.set_weights(weightsVil)
    env.reset()

    # Fresh buffer for this call; (107,) presumably is the 102-value observation plus the
    # five statistics appended to every stored state below.
    agentHero.memory = ReplayBuffer2(nDec, (107,))

    while agentHero.memory.mem_cntr < nDec:

        # Deal until reset() returns a hand that is not already finished.
        hand_over = True
        while hand_over:
            state, hand_over = env.reset()
            hand = env.holecards  # read here but not used further in this function

        while not hand_over:
            failed = False
            # The agent decides on the raw observation; the stored state also carries the
            # statistics as they were before the action.
            action = agentHero.choose_action(state)
            stats_before = [env.pwinHero, env.plooseHero, env.pwinAvg, env.plooseAvg, env.stdWinAvg]
            state = np.append(state, stats_before)

            try:
                # Too many moves on one street (over 20, e.g. min-raise battles) end in an
                # IndexError; that behaviour is punished severely below.
                hand_over = env.step(action)
            except IndexError:
                hand_over, failed = True, True

            stats_after = [env.pwinHero, env.plooseHero, env.pwinAvg, env.plooseAvg, env.stdWinAvg]
            next_state = np.append(env.observation, stats_after)

            if not hand_over:
                reward = 0
            elif failed:
                reward = -stacksize
            elif env.position == 0:
                # The blind is added back because it is posted regardless of the action.
                reward = 0.5 + env.stack_sb - env.starting_hero
            else:
                reward = 1 + env.stack_bb - env.starting_hero

            agentHero.store_transition(state, action, reward, next_state, hand_over)
            # If the agent is playing against itself to learn, villain's play could be used for
            # training as well. Everything except villain's hand can be derived from the
            # observation, so only villain's cards would have to be saved.

            state = env.observation

    # Export the first nDec entries of every buffer array, then mark the buffer as empty.
    memoryDict = {
        'states': agentHero.memory.state_memory[:nDec],
        'states_': agentHero.memory.new_state_memory[:nDec],
        'rewards': agentHero.memory.reward_memory[:nDec],
        'actions': agentHero.memory.action_memory[:nDec],
        'terminals': agentHero.memory.terminal_memory[:nDec],
    }
    agentHero.memory.mem_cntr = 0
    return memoryDict

Model loaded


2022-01-31 17:03:31.292646: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ram://e5eea2e1-6aa2-408a-86e1-8c4856a3e99f/assets
INFO:tensorflow:Assets written to: ram://6fb9cb25-0aad-440d-a34b-e4a046b71974/assets
INFO:tensorflow:Assets written to: ram://d750dafc-2219-47b5-9f7f-62fcf7beaa3b/assets
INFO:tensorflow:Assets written to: ram://6f038c4c-6209-4ec9-b4ba-05e003c03fdf/assets


In [12]:
import time
import numpy as np
import gc
import pickle  # needed for the weight checkpoints below; imported here so the cell does not rely on an earlier one
nCores = 8#how many parallel cores were running.
oldWeightsCount = 1           # engine whose villain weights are replaced next
frequencyUpdateVillain = 200  # replace one villain's weights every this many iterations

# Maps each replay-buffer attribute of agentLearn to the key play() returns it under.
memoryFields = (
    ('state_memory', 'states'),
    ('new_state_memory', 'states_'),
    ('reward_memory', 'rewards'),
    ('action_memory', 'actions'),
    ('terminal_memory', 'terminals'),
)


def _save_villain_weights():
    """Pull `weightsVil` from every engine, pickle the list to weightsVil10BB.txt and return it."""
    pulled = rc[:]['weightsVil']
    with open("weightsVil10BB.txt", "wb") as weightsFile:   #Pickling
        pickle.dump(pulled, weightsFile)
    return pulled


for j in range (6000):
    print (j)
    # Every engine plays until it has nDec transitions and returns them as a dict.
    memoryList = rc[:].map_sync(play,range(nCores))

    # Stack the per-engine arrays into agentLearn's replay buffer.
    for attribute, key in memoryFields:
        stacked = np.concatenate([memoryList[i][key] for i in range(nCores)], axis=0)
        setattr(agentLearn.memory, attribute, stacked)
    agentLearn.memory.mem_cntr = nCores*nDec

    # Diagnostic: Q-values for two randomly chosen states before this iteration's update.
    for _ in range(2):
        observation = agentLearn.memory.state_memory[np.random.randint(0, nCores*nDec)]
        print(observation[0:6],agentLearn.q_eval.predict(np.array([observation])))

    agentLearn.learnMass()
    agentLearn.save_model()

    gc.collect()
    weights = agentLearn.q_eval.get_weights()
    rc[:]['weightsHero']=weights #send current weights of hero's model to the engines.
    # One opponent always has the current weights. play() reads the name `weightsVil`;
    # the former push to `weightsVillain` was never read, so this engine was never updated.
    rc[nCores-1]['weightsVil']=weights

    #update villain weights. After some steps, one of the villain networks gets updated.
    if j%frequencyUpdateVillain ==0:
        print("exchanging weights of",oldWeightsCount)
        print (rc[oldWeightsCount]['weightsVil'][0])
        rc[oldWeightsCount]['weightsVil']= weights
        print (rc[oldWeightsCount]['weightsVil'][0])
        # Rotate through engines 0 .. nCores-2; the last engine is the always-current opponent.
        oldWeightsCount = (oldWeightsCount + 1) % (nCores - 1)

    #save villain weights
    if j%500 ==0:
        weightsVil = _save_villain_weights()

# Final checkpoint once the loop has finished.
weightsVil = _save_villain_weights()


0
[ 0.  1.  0.  2.  4. 13.] [[-7.7907867  1.1726943 -9.967381  -9.958546  -9.79442   -9.960839
  -9.946443  -9.995526  -9.99062   -9.941962  -9.896823  -9.924006 ]]
[ 0.  7.  7.  3.  8. 13.] [[-5.4198103   1.9907929  -1.4511646   0.05822321 -5.5303807  -4.3762226
  -1.1364164   0.05092499 -0.57884383 -0.44234785 -3.3161635  -5.1548615 ]]
exchanging weights of 1
[[ 0.13891762  0.08042129  0.07012077 ... -0.12694766  0.01740397
   0.05817319]
 [ 0.17516549  0.03936422 -0.05128472 ...  0.00323276  0.02193645
   0.1100174 ]
 [ 0.13496321 -0.06152176  0.04033399 ...  0.03131422 -0.0251929
  -0.08066276]
 ...
 [ 0.0882116  -0.04021711 -0.0473093  ...  0.03347914  0.02602953
   0.07811441]
 [ 0.11722501 -0.10244302  0.0959695  ... -0.05610787  0.05985689
  -0.09885324]
 [ 0.048425    0.09800332 -0.06000081 ...  0.0316143  -0.11625958
  -0.01036678]]
[[ 0.13850917  0.07992314  0.06913903 ... -0.12545016  0.01711711
   0.05705527]
 [ 0.17504795  0.03951159 -0.05156729 ...  0.00361738  0.0209875

TimeoutError: Result not ready.

In [10]:
#save villain weights
# Manual checkpoint: pull `weightsVil` from all engines and pickle the resulting list to the
# file the setup cell reads its villain weights from. Requires `rc` and `pickle` to be
# available from earlier cells.
weightsVil = rc[:]['weightsVil']
with open("weightsVil10BB.txt", "wb") as weightsFile:   #Pickling
    pickle.dump(weightsVil, weightsFile)