In [1]:
import numpy as np
import pandas as pd

from pettingzoo.classic import texas_holdem_v4

env = texas_holdem_v4.env(render_mode="ansi")

In [2]:
class Q:
    def __init__(self):
        self.q = pd.DataFrame(columns=[f's_{i}' for i in range(72)] + ['a', 'v', 'c'])

    def find(self, s, a=None):
        filtered_rows = True
        for i,v in enumerate(s):
            current_filter = (self.q[f's_{i}'] == v)
            filtered_rows &= current_filter
        if a is None:
            return filtered_rows
        else:
            return (filtered_rows & (self.q['a'] == a))

    def __getitem__(self, index):
        s,a = index
        v = self.q.loc[self.find(s,a)].v
        if len(v.values):
            return v.item()
        else:
            return 0
    def __setitem__(self, index, value):
        s,a = index
        if value == 0:
            return
        
        cases = self.q.loc[self.find(s,a)]
        if cases.shape[0]>0:
            # cases['v'] = value
            # cases['c'] = cases.c.item()+1
            self.q.loc[self.find(s,a), ['v','c']] = [value, cases.c.item()+1]
        else:
            if self.q.shape[0] < 1e9:
                self.q.loc[self.q.shape[0]] = [i for i in s]+[a, value, 1]

    
    def __repr__(self):
        return self.q.to_string()


q = Q()
print(q)

Empty DataFrame
Columns: [s_0, s_1, s_2, s_3, s_4, s_5, s_6, s_7, s_8, s_9, s_10, s_11, s_12, s_13, s_14, s_15, s_16, s_17, s_18, s_19, s_20, s_21, s_22, s_23, s_24, s_25, s_26, s_27, s_28, s_29, s_30, s_31, s_32, s_33, s_34, s_35, s_36, s_37, s_38, s_39, s_40, s_41, s_42, s_43, s_44, s_45, s_46, s_47, s_48, s_49, s_50, s_51, s_52, s_53, s_54, s_55, s_56, s_57, s_58, s_59, s_60, s_61, s_62, s_63, s_64, s_65, s_66, s_67, s_68, s_69, s_70, s_71, a, v, c]
Index: []


In [3]:
class BasePlayer:
    def __init__(self, name):
        self.name = name
        self.wins = 0
        self.acum = 0

        self.hist = []
        self.current_reward = []

        self.n = 0
        self.avg = 0
    
    def reward(self, r):
        if r != 0:
            self.hist.append(r)
        self.current_reward.append(r)
    
    def learn(self, s, a, r, s_):
        self.acum += sum(self.current_reward)
        self.wins += int(sum(self.current_reward)>0)
        self.n += 1
        self.avg += (sum(self.current_reward) - self.avg) / self.n
        self.current_reward=[]

    def act(self, s, mask, best=False):
        if mask[0]: # Call
            return 0 
        if mask[3]: # Check
            return 3 
        if mask[2]: # Fold
            return 2 

class Player(BasePlayer):
    def __init__(self, name, q=None, gamma=.9, alpha=.9):
        self.Q = (q or Q())
        self.gamma = gamma
        self.alpha = alpha

        self.buffer = {}
        super().__init__(name)

    def learn(self, s, a, r, s_):        
        if s is not None and a is not None:
            if tuple(s) not in self.buffer:
                self.buffer[tuple(s)] = [self.Q[s,x] for x in range(4)]
            if tuple(s_) not in self.buffer:
                self.buffer[tuple(s_)] = [self.Q[s_,x] for x in range(4)]
            
            delta = r + self.gamma*np.max(self.buffer[tuple(s_)]) - self.buffer[tuple(s)][a]
            self.Q[s,a] =  self.buffer[tuple(s)][a] + self.alpha*delta

        super().learn(s, a, r, s_)

    def act(self, s, mask):
        # s = binary_to_decimal(s)
        if tuple(s) not in self.buffer:
            self.buffer = { tuple(s): [self.Q[s,a] for a in range(4)]}
            
        v = np.array([a if mask[i] else -np.inf for i, a in enumerate(self.buffer[tuple(s)])])
        return np.random.choice(np.flatnonzero(v == v.max()))
        # return np.argmax([self.Q[s,a] if mask[a] else -np.inf for a in range(4)])
        
        
agents = {name: Player(name) for name in ['player_0', 'player_1'] }
# agents = {
#     'player_0': Player('player_0', .9, .9),
#     'player_1': BasePlayer('player_1')  
# }

# {name: agent.acum for name, agent in agents.items()}

In [4]:
def train(agents, max_iter):
    s, a = None, None
    for rounds in range(max_iter):
        env.reset(seed=42)
        # print(rounds)
        for agent in env.agent_iter():
            player = agents[agent]
            observation, r, termination, truncation, info = env.last()
            
            s_ = observation['observation']
            mask = observation["action_mask"]
            
            player.reward(r)
            if termination or truncation:
                a_ = None
            else:
                a_ = player.act(s_, mask)

            player.learn(s,a,r,s_)
            s = s_
            a = a_

            # print(s, a, r)
            env.step(a)
    env.close()

# {name: agent.wins for name, agent in agents.items()}

# Treino

In [5]:
max_iter = int(1e1)
gamma = .999
alpha = .99

shared_q = Q()
agents = {
    'player_0': Player('player_0', shared_q, gamma, alpha),
    # 'player_1': BasePlayer('player_1')  
    # 'player_1': Player('player_0', gamma, alpha),
    'player_1': Player('player_0', shared_q, gamma, alpha),
}

# train(max_iter)
# {name: agent.wins for name, agent in agents.items()}

In [6]:
max_iter = int(1e3)
train(agents, max_iter)
{name: agent.wins for name, agent in agents.items()}

{'player_0': 999, 'player_1': 1}

In [7]:
{name: agent.acum for name, agent in agents.items()}

{'player_0': 999.0, 'player_1': -999.0}

In [11]:
# import pandas as pd 
# hist = pd.DataFrame(
#     {"r": agents['player_0'].hist}
# )
# hist.to_csv('qlearning_p0.csv', index=False)

# hist

# Trainado vs Basico

In [8]:
new_agents = {
    'player_0': Player('player_0', shared_q, gamma, alpha),
    'player_1': BasePlayer('player_1')  
}
train(new_agents, 100)
{name: agent.wins for name, agent in new_agents.items()}

{'player_0': 0, 'player_1': 100}