In [1]:
import rlcard
import collections
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from rlcard.utils import Logger, tournament
from rlcard.agents import RandomAgent
import pickle

In [2]:
random.seed(42)
learning_rate = 0.0005
gamma = 0.98
buffer_limit = 50000
batch_size = 32

In [3]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)    # double-ended queue
    
    def put(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)

    def size(self):
        return len(self.buffer)

In [4]:
class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(36, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 4)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,3)
        else : 
            return out.argmax().item()   

In [5]:
def train(q, q_target, memory, optimizer,loss_list):
    for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)
        q_out = q(s)
        

        q_a = q_out.gather(1,a)
        # DQN
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)

        target = r + gamma * max_q_prime * done_mask
        target = target.type(torch.FloatTensor)
        # MSE Loss
        loss = F.mse_loss(q_a, target)
        loss_list.append(loss)
        #print(i,loss)
        #print('-------------------------------')
        # Smooth L1 Loss
        #loss = F.smooth_l1_loss(q_a, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss_list

In [6]:
env=rlcard.make('leduc-holdem')
s=env.reset()
print(s)

actions_lst=['call','raise','fold','check']
'''for i in range(20):
    s=env.reset()
    print(s)
    print('--------------------------------------------------')
    #print(s)
    while (env.is_over() is False):
        print(s[1],env.get_state(s[1])['raw_obs'])
        print(s[0]['obs'])
        a=s[0]['legal_actions']
        #print(s)
        a=random.choice(list(a.keys()))
        print(s[1],actions_lst[a])
        s=env.step(a)
        
        
    print(env.get_payoffs())'''

'''s1=env.step(1)
print(s1)
s2=env.step(0)
print(s2)'''

({'legal_actions': OrderedDict([(0, None), (1, None), (2, None)]), 'obs': array([1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), 'raw_obs': {'hand': 'SJ', 'public_card': None, 'all_chips': [1, 2], 'my_chips': 1, 'legal_actions': ['call', 'raise', 'fold'], 'current_player': 0}, 'raw_legal_actions': ['call', 'raise', 'fold'], 'action_record': []}, 0)


's1=env.step(1)\nprint(s1)\ns2=env.step(0)\nprint(s2)'

In [31]:
def main():
    env = rlcard.make('leduc-holdem')
    env.set_agents([RandomAgent(num_actions=env.num_actions)])

    model=torch.load('logs/dqn_model.pt')
    model.eval()
    pay_0 = 0
    pay_1 = 0
    for it in range(10): 
        for n_epi in range(100):
            epsilon = max(0.01, 0.08 - 0.01*(n_epi/200))
            s,id = env.reset()
            while not env.is_over():
                if id == 0:
                    a=model.sample_action(torch.from_numpy(s['obs']).float(),epsilon)
                else:
                    a=random.randint(0,3)

                if a not in s['legal_actions'].keys():
                    a=2
                s_prime, id_prime = env.step(a)
                s=s_prime
                id = id_prime
            payoffs = env.get_payoffs()
            pay_0 = pay_0+payoffs[0]
            pay_1 = pay_1+payoffs[1]
        
        

        print("score_id_0 : {:.1f}, score_id_1 : {:.1f}".format(pay_0, pay_1))



In [32]:
if __name__ == '__main__':
    main()

score_id_0 : 57.0, score_id_1 : -57.0
score_id_0 : 95.0, score_id_1 : -95.0
score_id_0 : 139.5, score_id_1 : -139.5
score_id_0 : 216.5, score_id_1 : -216.5
score_id_0 : 276.5, score_id_1 : -276.5
score_id_0 : 319.5, score_id_1 : -319.5
score_id_0 : 362.5, score_id_1 : -362.5
score_id_0 : 411.0, score_id_1 : -411.0
score_id_0 : 477.0, score_id_1 : -477.0
score_id_0 : 502.5, score_id_1 : -502.5
