# Setup
Install rlcard and import modules

In [None]:
!pip install rlcard
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rlcard
  Downloading rlcard-1.0.9.tar.gz (265 kB)
[K     |████████████████████████████████| 265 kB 7.7 MB/s 
Building wheels for collected packages: rlcard
  Building wheel for rlcard (setup.py) ... [?25l[?25hdone
  Created wheel for rlcard: filename=rlcard-1.0.9-py3-none-any.whl size=322178 sha256=a0132d6705b3c9b2d48be538ca8d53abb3f40f08c0c8c40e7995de1ded7ad855
  Stored in directory: /root/.cache/pip/wheels/d7/6c/14/931032d53068211d4e0ee697f24844b90652600fd5c91544c2
Successfully built rlcard
Installing collected packages: rlcard
Successfully installed rlcard-1.0.9
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.1


In [None]:
import rlcard

import torch
import torchinfo

import random
import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

# Neural Network

In [None]:
class CardInput(torch.nn.Module):
  """Input projection: a single linear layer from 120 card features to 164."""

  def __init__(self):
    super().__init__()
    # 120 inputs = 60 for my hand + 60 for the visible card
    self.layer1 = torch.nn.Linear(in_features=120, out_features=164)

  def forward(self, x):
    """Apply the linear projection; no activation is applied here."""
    return self.layer1(x)

class HiddenNetwork(torch.nn.Module):
  """Two linear layers (164 -> 224 -> 86), each followed by a ReLU."""

  def __init__(self):
    super().__init__()
    self.layer1 = torch.nn.Linear(in_features=164, out_features=224)
    self.layer2 = torch.nn.Linear(in_features=224, out_features=86)

  def forward(self, x):
    """Run both layers, rectifying the output of each one."""
    hidden = torch.relu(self.layer1(x))
    return torch.relu(self.layer2(hidden))

class ActorBlock(torch.nn.Module):
  """Policy head: maps the 86 hidden features to a distribution over 61 actions."""

  def __init__(self):
    super(ActorBlock, self).__init__()

    self.layer1 = torch.nn.Linear(86, 61)

  def forward(self, x):
    """Return action probabilities that sum to 1 along the last (action) axis.

    Accepts either a single feature vector of shape (86,) or a batch of
    shape (batch, 86).
    """
    x = self.layer1(x)
    # Normalise over the action axis. dim=0 only did that for un-batched input;
    # for a (batch, 61) tensor it normalised across the batch instead, so a
    # batch of one produced a row of all ones.
    x = torch.nn.functional.softmax(x, dim=-1)

    return x
  
class CriticBlock(torch.nn.Module):
  """Value head: a single linear layer from 86 hidden features to one scalar."""

  def __init__(self):
    super().__init__()
    self.layer1 = torch.nn.Linear(in_features=86, out_features=1)

  def forward(self, x):
    """Return the raw (unbounded) linear output."""
    return self.layer1(x)


In [None]:
class UnoChamp(torch.nn.Module):
  """Actor-critic network: input projection, hidden trunk, then two heads."""

  def __init__(self):
    super().__init__()

    # Attribute names (including the `hiddneNetwork` spelling) are kept as-is
    # because they become the keys of the module's state_dict.
    self.cardInput = CardInput()
    self.hiddneNetwork = HiddenNetwork()
    self.actorBlock = ActorBlock()
    self.criticBlock = CriticBlock()

  def forward(self, x):
    """Return ``(critic, actor)``: the value head output, then the policy head output."""
    features = self.hiddneNetwork(self.cardInput(x))

    # Both heads read the same hidden features.
    policy = self.actorBlock(features)
    state_value = self.criticBlock(features)

    return (state_value, policy)




In [None]:
# Layer-by-layer summary of a freshly constructed UnoChamp for one 120-feature input row.
torchinfo.summary(UnoChamp(), input_size=(1, 120))

Layer (type:depth-idx)                   Output Shape              Param #
UnoChamp                                 [1, 1]                    --
├─CardInput: 1-1                         [1, 164]                  --
│    └─Linear: 2-1                       [1, 164]                  19,844
├─HiddenNetwork: 1-2                     [1, 86]                   --
│    └─Linear: 2-2                       [1, 224]                  36,960
│    └─Linear: 2-3                       [1, 86]                   19,350
├─ActorBlock: 1-3                        [1, 61]                   --
│    └─Linear: 2-4                       [1, 61]                   5,307
├─CriticBlock: 1-4                       [1, 1]                    --
│    └─Linear: 2-5                       [1, 1]                    87
Total params: 81,548
Trainable params: 81,548
Non-trainable params: 0
Total mult-adds (M): 0.08
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.33
Estimated Total Size (MB): 0.33

# Custom Loss

In [None]:
class MultiLoss(torch.nn.Module):
    """Combined actor + critic loss.

    ``total = sum(log_probs) * |expected_advantage - advantage|
            + 0.5 * huber(expected_advantage, advantage)``

    The weight ``|expected_advantage - advantage|`` is detached, so it scales
    the actor term without receiving gradient itself.

    NOTE(review): the actor term uses the absolute error and has no leading
    minus sign, which differs from the usual ``-log_prob * advantage`` policy
    gradient form. The formula is kept as written; confirm the intent.
    """

    def __init__(self):
        super(MultiLoss, self).__init__()

    def forward(self, log_probs, expected_advantage, advantage):
        """Compute the total loss.

        Args:
            log_probs: either a tensor (its gradient is preserved) or a
                sequence of plain numbers (contributes a constant).
            expected_advantage: first argument to the Huber loss; the caller
                in this notebook passes the TD target here.
            advantage: second argument to the Huber loss; the caller in this
                notebook passes the critic's value estimate here.

        Returns:
            A scalar tensor: actor term plus half the critic term.
        """
        # Detached magnitude of the error; mean() equals the single value for
        # the one-element tensors used here and also tolerates batches.
        error_size = (expected_advantage - advantage).detach().abs().mean()

        if torch.is_tensor(log_probs):
            # Stay in torch so autograd can reach whatever produced log_probs.
            log_prob_sum = log_probs.sum()
        else:
            # Plain numbers: reduce in float64 as before, then treat as a constant.
            log_prob_sum = float(np.asarray(log_probs, dtype=np.float64).sum())

        actor_loss = log_prob_sum * error_size
        critic_loss = torch.nn.functional.huber_loss(expected_advantage, advantage)

        total_loss = actor_loss + (critic_loss * 0.5)

        return total_loss

# Q Learning Agent

In [None]:
class QLearnAgent():
  """Learning agent that drives an UnoChamp network in the rlcard Uno environment.

  Actions are chosen greedily from the policy head, restricted to the legal
  actions of the current state. `update` performs one TD(0)-style optimisation
  step using the critic head.

  NOTE(review): the probabilities handed to `update` come from `eval_step` as
  detached plain numbers, so only the critic term of the loss can produce
  gradients; the policy head is not trained directly by `update`.
  """

  def __init__(self):
    self.use_raw = False # required for RL Card Env 
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.model = UnoChamp().to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-3)
    self.num_actions = 61

  def step(self, state):
    """Return only the action that `eval_step` would choose for `state`."""
    action, _ = self.eval_step(state)
    return action

  def eval_step(self, state):
    """Pick the most probable legal action.

    Args:
      state: rlcard state dict; reads `state['obs']` and `state['legal_actions']`.

    Returns:
      (choice, probs): the chosen action id and a list of length
      `num_actions` holding the network probability for each legal action
      and 0 for every other action.
    """
    cards = self.clean_state(state)
    # Action selection needs no gradients; update() runs its own forward pass.
    with torch.no_grad():
      _, raw_probs = self.model(cards)
    raw_probs = raw_probs.detach().cpu().numpy()

    legal_actions = state['legal_actions']
    legal_ids = [i for i in range(self.num_actions) if i in legal_actions]
    probs = [raw_probs[i] if i in legal_actions else 0 for i in range(self.num_actions)]

    if legal_ids:
      # Compare legal actions only. argmax over the masked list returned
      # index 0 (possibly illegal) whenever every legal probability was 0.
      # Ties resolve to the lowest action id, as argmax did.
      choice = max(legal_ids, key=lambda i: raw_probs[i])
    else:
      choice = np.argmax(probs)

    return choice, probs

  def update(self, state, next_state, log_probs, reward):
    """Run one optimisation step and return the loss as a Python float.

    Args:
      state: rlcard state dict the action was taken in.
      next_state: rlcard state dict observed afterwards.
      log_probs: the `probs` list returned by `eval_step` for `state`.
      reward: scalar reward for the transition.
    """
    state = self.clean_state(state)
    next_state = self.clean_state(next_state)

    value, *_ = self.model(state[None, ...])
    # The bootstrapped target is treated as a constant (semi-gradient TD):
    # without no_grad the loss would also pull next_value towards value.
    with torch.no_grad():
      next_value, *_ = self.model(next_state[None, ...])
    td_target = reward + next_value

    loss = MultiLoss()(log_probs, td_target, value)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return loss.detach().cpu().item()

  def clean_state(self, state):
    """Flatten planes 1 and 3 of `state['obs']` into a float tensor on `self.device`.

    Presumably plane 1 encodes the hand and plane 3 the visible card (60
    features each, matching the 120 inputs of CardInput) - TODO confirm
    against rlcard's Uno observation encoding.
    """
    cards = state['obs']
    cards = np.array([cards[1], cards[3]]).flatten()
    cards = torch.tensor(cards).float().to(self.device)

    return cards


# Random Agent

In [None]:
class RandomAgent():
  """Baseline opponent that picks uniformly at random among the legal actions."""

  def __init__(self, num_actions):
    self.num_actions = num_actions
    self.use_raw = False 
    # clean_state() moves its tensor to self.device; this agent has no model,
    # so the CPU is sufficient. Previously the attribute was never set and
    # clean_state() raised AttributeError.
    self.device = torch.device("cpu")
  
  def step(self, state):
    """Return one legal action id chosen uniformly at random."""
    return random.choice(list(state['legal_actions']))
  
  def eval_step(self, state):
    """Return `(action, probs)` for `state`.

    `probs` has length `num_actions`: `1 / len(legal_actions)` for each legal
    action id and 0 elsewhere. It is returned alongside the action so that
    every agent's `eval_step` has the same shape of result.
    """
    legal_actions = state['legal_actions']
    probs = [
      1/len(legal_actions) if i in legal_actions else 0
      for i in range(self.num_actions)
    ]

    return self.step(state), probs

  def update(self, state, next_state, log_probs, reward):
    """No-op: the random agent does not learn."""
    pass

  def clean_state(self, state):
    """Flatten planes 1 and 3 of `state['obs']` into a float tensor on `self.device`."""
    cards = state['obs']
    cards = np.array([cards[1], cards[3]]).flatten()
    cards = torch.tensor(cards).float().to(self.device)

    return cards
  

# Defining The Agents

In [None]:
# Seed the global RNGs before any agent is built so that Restart & Run All
# reproduces the same run: `random` drives RandomAgent's choices and torch
# drives the initial weights of the network created inside QLearnAgent.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = rlcard.make("uno", config={'seed': SEED})

learningAgent = QLearnAgent()
randomAgent = RandomAgent(env.num_actions)

env.set_agents([learningAgent, randomAgent])

# Useful Methods

In [None]:
def playGame(seed=1, learn=False):
  """Play one Uno game between `learningAgent` (seat 0) and `randomAgent` (seat 1).

  Args:
    seed: seed passed to the rlcard environment created for this game.
    learn: when True, each agent's `update` is called after every move it
      makes (reward 1 if that move ended the game, else 0), and the loser
      receives one extra update with reward -1 once the game is over.

  Returns:
    (payoffs, turns): the environment payoffs and the number of moves each
    seat made.
  """
  env = rlcard.make("uno", config={'seed': seed})
  env.set_agents([learningAgent, randomAgent])

  state, player_id = env.reset()

  # Most recent state / probabilities each seat acted on; None until it has moved.
  last_states = [None, None]
  last_probs = [None, None]

  # Loop to play the game
  turns = [0, 0]
  while not env.is_over():
    agent = env.agents[player_id]

    # Agent plays
    action, probs = agent.eval_step(state)

    # Environment steps
    next_state, next_player_id = env.step(action, agent.use_raw)
    turns[player_id] += 1

    if learn:
      reward = 1 if env.is_over() else 0
      agent.update(state, next_state, probs, reward)

    last_states[player_id] = state
    last_probs[player_id] = probs
    # Set the state and player
    state = next_state
    player_id = next_player_id

  payoffs = env.get_payoffs()

  if learn:
    # Seat 0 winning means seat 1 lost, and vice versa.
    loser_id = 1 if payoffs[0] == 1 else 0
    if last_states[loser_id] is not None:
      # Use the loser's own view of the finished game as the next state.
      # The original fetched this state but discarded the result and passed
      # whatever state env.step had last returned instead.
      final_state = env.get_state(loser_id)
      env.agents[loser_id].update(last_states[loser_id], final_state, last_probs[loser_id], -1)

  return payoffs, turns

In [None]:
def distributionGraph(percentages, title=""):
  """Plot a 10-bin histogram of win percentages and mark their mean.

  Args:
    percentages: sequence of win rates in [0, 1], one per seed.
    title: figure title.

  Returns:
    The mean of `percentages`.
  """
  mean = np.average(percentages)
  min_top = 30

  fig, ax = plt.subplots()
  counts, _, _ = ax.hist(percentages, bins=10, range=(0.0, 1.0))

  # Keep the fixed 0-30 axis whenever the data fits, so figures stay directly
  # comparable, but grow it when a bin holds more than 30 seeds; a hard limit
  # of 30 silently clipped those bars.
  top = max(min_top, float(counts.max()) * 1.05)

  ax.vlines(mean, ymin=0, ymax=top, color="g")
  ax.set_xlabel(f"Win Percentage (Average: {mean})")
  ax.set_ylabel("Number of seeds in bin")
  ax.set_ylim(top=top)
  ax.set_title(title)
  
  plt.show()
  return mean

In [None]:
def winPercentageGraphs(seeds=100, games=10, display=False, title="", seedStart=0):
  """Evaluate the learning agent without training and plot per-seed win rates.

  Plays `games` games on each of the seeds `seedStart .. seedStart + seeds - 1`
  with `learn=False`, classifies every seed by the learning agent's win rate
  (<= 40% losing, >= 70% winning, otherwise neutral), prints the three counts
  and draws the distribution.

  Args:
    seeds: number of consecutive seeds to evaluate.
    games: games played per seed.
    display: when True, print the scoreboard after each seed.
    title: title for the distribution figure.
    seedStart: first seed to evaluate.

  Returns:
    The mean win rate across the evaluated seeds.
  """
  winningPerc = []
  scoreboard = [0, 0]
  seedClassify = [0, 0, 0]  # [losing, neutral, winning]
  for i in tqdm(range(games * seeds)):
    # seedStart offsets the seed itself. Offsetting the game index (as before)
    # divided the offset by `games`, so seedStart=100 with 50 games per seed
    # evaluated seeds 2..101 rather than 100..199.
    seed = seedStart + i // games
    payoff, _ = playGame(seed=seed, learn=False)
    if payoff[0] == 1:
      scoreboard[0] += 1
    else:
      scoreboard[1] += 1

    # Last game for this seed: record its win rate and reset the scoreboard.
    if (i + 1) % games == 0:
      if display:
        print(f"seed: {seed:4d}\t Scoreboard: {scoreboard}" )
      perc = scoreboard[0] / games
      if perc >= 0.70:
        seedClassify[2] += 1
      elif perc <= 0.40:
        seedClassify[0] += 1
      else:
        seedClassify[1] += 1
      winningPerc.append(perc)
      scoreboard = [0, 0]

  # Index 0 counts losing seeds and index 2 winning seeds; the two labels
  # were previously printed with these indices swapped.
  print(f"Losing  Seeds:{seedClassify[0]:3d}")
  print(f"Neutral Seeds:{seedClassify[1]:3d}")
  print(f"Winning Seeds:{seedClassify[2]:3d}")

  mean = distributionGraph(winningPerc, title)

  return mean


# Counts

In [None]:
# Games played per seed by the evaluation cells (passed as games= to winPercentageGraphs).
baselineGames = 50
# Number of seeds used by both the evaluation and the training cells.
seeds = 100
# Games played per seed in each training epoch (passed as games= to trainModel).
trainingGames=5

# Before Training

In [None]:
# Baseline evaluation before the training cell runs: `seeds` seeds, `baselineGames` games each.
winPercentageGraphs(seeds=seeds, games=baselineGames, title="Before Training")
# Second baseline with seedStart=seeds, plotted under the "Excluded Seeds" title.
winPercentageGraphs(seeds=seeds, games=baselineGames, title="Before Training, Excluded Seeds", seedStart=seeds)


# Training

In [None]:
def trainModel(seeds=100, games=10, display=False):
  """Run one training pass: `games` learning games on each of seeds 0..seeds-1.

  Returns:
    (seedClassify, winPerc): counts of seeds whose win rate was
    [<= 40%, in between, >= 70%], and the list of per-seed win rates.
  """
  win_rates = []
  bucket_counts = [0, 0, 0]
  learner_wins = 0
  opponent_wins = 0

  for game_idx in tqdm(range(games * seeds)):
    payoffs, _ = playGame(seed=game_idx // games, learn=True)
    if payoffs[0] == 1:
      learner_wins += 1
    else:
      opponent_wins += 1

    # Only the last game of each seed triggers the bookkeeping below.
    if (game_idx + 1) % games != 0:
      continue

    if display:
      print(f"seed: {game_idx//games:4d}\t Scoreboard: {[learner_wins, opponent_wins]}" )

    rate = learner_wins / games
    if rate >= 0.70:
      bucket = 2
    elif rate <= 0.40:
      bucket = 0
    else:
      bucket = 1
    bucket_counts[bucket] += 1

    win_rates.append(rate)
    learner_wins = 0
    opponent_wins = 0

  return bucket_counts, win_rates


In [None]:
# Mean per-seed win rate recorded after each training pass.
means=[]

# 15 training passes; each pass plays `trainingGames` learning games on every seed
# and plots that pass's win-rate distribution.
for i in range(15):
  _ ,wins = trainModel(seeds=seeds, games=trainingGames)
  mean = distributionGraph(wins,  title=f"Epoch {i}")
  means.append(mean)

# Mean win rate per pass, in order.
plt.plot(means)


# After Training

In [None]:
# Same evaluation as the "Before Training" cell, repeated after the training loop.
winPercentageGraphs(seeds=seeds, games=baselineGames, title="After Training")
# Second evaluation with seedStart=seeds, plotted under the "Excluded Seeds" title.
winPercentageGraphs(seeds=seeds, games=baselineGames, title="After Training, Excluded Seeds", seedStart=seeds)