In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from collections import deque
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import random
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'

In [None]:
class Actor(nn.Module):
    """Policy network producing a probability distribution over actions.

    The stack is fully connected: one Linear layer that keeps the input
    width, then one Linear layer per entry of ``hidden_layer_size`` (each
    followed by SELU), and finally a Linear projection to ``output_nodes``
    normalised with a softmax over the last dimension.

    Args:
        input_nodes: width of the observation vector.
        hidden_layer_size: iterable with the width of every hidden layer.
        output_nodes: number of discrete actions.
    """

    def __init__(self, input_nodes, hidden_layer_size, output_nodes):
        super().__init__()
        # Consecutive width pairs describe every SELU-activated Linear layer;
        # the first pair (input -> input) is the width-preserving entry layer.
        widths = [input_nodes, input_nodes] + list(hidden_layer_size)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.SELU())
        stack.append(nn.Linear(widths[-1], output_nodes))
        stack.append(nn.Softmax(dim=-1))
        # Kept as a ModuleList named `layers` so state_dict keys stay stable.
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Return action probabilities for ``x`` (softmax over the last dim)."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

In [None]:
class Critic(nn.Module):
    """State-value network mapping an observation to a single scalar.

    The stack is fully connected: one Linear layer that keeps the input
    width, then one Linear layer per entry of ``hidden_layer_size`` (each
    followed by SELU), and a final Linear layer with one output unit and no
    activation.

    Args:
        input_nodes: width of the observation vector.
        hidden_layer_size: iterable with the width of every hidden layer.
    """

    def __init__(self, input_nodes, hidden_layer_size):
        super().__init__()
        # Consecutive width pairs describe every SELU-activated Linear layer.
        widths = [input_nodes, input_nodes] + list(hidden_layer_size)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.SELU())
        stack.append(nn.Linear(widths[-1], 1))
        # Kept as a ModuleList named `layers` so state_dict keys stay stable.
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

In [None]:
def train(states, actions, rewards, next_states, dones, actor, critic, actor_optimizer, critic_optimizer,
          discount=None, max_norm=1):
    """Run one actor-critic update on a batch of transitions.

    The critic is regressed onto the one-step TD target
    ``r + (1 - done) * discount * V(s')``. The actor then takes a
    policy-gradient step on the log-probability of the action actually taken,
    weighted by the detached TD advantage ``target - V(s)``.

    Args:
        states: float tensor with one observation per row.
        actions: integer tensor holding one action index per row of ``states``.
        rewards: float tensor with one reward per transition.
        next_states: float tensor, same layout as ``states``.
        dones: per-transition terminal flags (tensor or sequence; bool or numeric).
        actor: module returning action probabilities (last dim = actions).
        critic: module returning a value with a trailing dimension of size 1.
        actor_optimizer: optimizer over ``actor.parameters()``.
        critic_optimizer: optimizer over ``critic.parameters()``.
        discount: discount factor; when ``None`` the notebook-level ``gamma``
            is used, which matches the previous behaviour.
        max_norm: gradient-norm clipping threshold applied to both networks.

    Returns:
        ``(critic_loss, actor_loss)`` as Python floats, for logging.
    """
    if discount is None:
        discount = gamma  # notebook-level hyperparameter

    # Accept plain sequences of flags and make them usable in arithmetic.
    if not isinstance(dones, torch.Tensor):
        dones = torch.tensor(dones).to(states.device)
    dones = dones.float()

    # TD targets are constants for both updates, so build them without a graph.
    with torch.no_grad():
        next_state_values = critic(next_states).squeeze(-1)
        targets = rewards + (1 - dones) * discount * next_state_values

    # Update critic. squeeze(-1) aligns (B, 1) predictions with (B,) targets;
    # without it mse_loss silently broadcasts to a (B, B) matrix.
    critic_values = critic(states).squeeze(-1)
    critic_loss = nn.functional.mse_loss(critic_values, targets)

    critic_optimizer.zero_grad()
    critic_loss.backward()
    nn.utils.clip_grad_norm_(critic.parameters(), max_norm)
    critic_optimizer.step()

    # Advantage from the pre-update value estimate; detached so the actor
    # step never pushes gradients into the critic.
    advantages = targets - critic_values.detach()

    # Update actor. Clamp before log so an underflowed probability cannot
    # produce -inf, then pick each row's own action with gather (plain
    # indexing `log_probs[actions]` would select batch rows instead).
    log_probs = torch.log(actor(states).clamp_min(1e-8))
    chosen_log_probs = log_probs.gather(1, actions.long().reshape(-1, 1)).squeeze(-1)

    actor_loss = -(advantages * chosen_log_probs).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    nn.utils.clip_grad_norm_(actor.parameters(), max_norm)
    actor_optimizer.step()

    return critic_loss.item(), actor_loss.item()

In [None]:
# Implement replay buffer
class ReplayBuffer(object):
    """Bounded FIFO store of experiences with uniform random sampling.

    ``number`` mirrors how many experiences are currently held; once it
    exceeds ``maxlength`` the oldest entries are discarded.
    """

    def __init__(self, maxlength):
        self.maxlength = maxlength
        self.number = 0
        self.buffer = deque()

    def append(self, experience):
        """Add ``experience`` as the newest entry, evicting on overflow."""
        self.buffer.append(experience)
        self.number += 1
        if self.number > self.maxlength:
            self.pop()

    def pop(self):
        """Drop oldest entries until at most ``maxlength`` remain."""
        while self.number > self.maxlength:
            self.buffer.popleft()
            self.number -= 1

    def sample(self, batchsize):
        """Return ``batchsize`` distinct stored experiences chosen uniformly."""
        chosen = np.random.choice(len(self.buffer), batchsize, replace=False)
        picked = []
        for position in chosen:
            picked.append(self.buffer[position])
        return picked

In [None]:
import torch
import numpy as np
from Agent import Agent
from Game import Game

import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as torch_utils

In [None]:
# Hyperparameters and problem dimensions read by the setup and training cells.
lr = 1e-4 # learning rate passed to both Adam optimizers (actor and critic)
batchsize = 64  # number of transitions sampled from the replay buffer per update
maxlength = 1000  # max number of tuples held by the replay buffer
tau = 1000  # time steps for target update; NOTE(review): no target network is visible in this notebook - confirm whether this is still needed
episodes = 10000  # number of training episodes; also scales the exploration decay
initialize = 1000  # environment steps collected before gradient updates start
epsilon = .1  # lower bound of the exploration probability used in the episode loops
gamma = .99 # discount factor applied to the next-state value in the TD target
hidden_dims=[128, 512, 256, 64] # hidden layer widths shared by Actor and Critic

obssize = 120  # input width given to Actor and Critic
actsize = 54  # number of discrete actions (Actor output width)

In [None]:
# initialize networks
actor = Actor(obssize, hidden_dims, actsize)
critic = Critic(obssize, hidden_dims)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=lr)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=lr)

def lecun_init(m):
    """Apply LeCun-normal initialisation to a Linear/Conv2d module in place.

    Weights are drawn from a zero-mean normal with std = 1/sqrt(fan_in) and
    biases are zeroed. This is the initialisation SELU networks rely on to
    stay self-normalising. ``kaiming_normal_`` with ``nonlinearity='linear'``
    (gain 1, fan-in mode) yields exactly that distribution. Modules of any
    other type are left untouched, so this is safe to use with
    ``Module.apply``.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear')
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Initialisation happens in place, so the optimizers created above keep
# pointing at the same parameter objects.
actor.apply(lecun_init)
critic.apply(lecun_init)

game = Game(9)
# Capacity comes from the hyperparameter cell instead of a duplicated literal.
buffer = ReplayBuffer(maxlength)

In [None]:
import os

# NOTE: these variables only influence CUDA if they are set before CUDA is
# first initialised in this process, so run this cell before any GPU work.
# Enumerate GPUs in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# Default to physical GPU 2, but keep any selection already made by the user
# or a job scheduler instead of overwriting it.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")

# Fall back to the CPU when no visible CUDA device is usable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
print(torch.cuda.device_count())

In [None]:
import os

# Per-game outcomes; the episode loops append 1 when player 0 wins, else 0.
wins = []
# NOTE(review): unif_agent, maxWinRate and baseline are not read by any cell
# visible in this notebook - confirm before removing.
unif_agent = Agent()
maxWinRate, baseline = 0, 1

# Place both networks on the selected device before any forward pass.
actor, critic = actor.to(device), critic.to(device)

In [None]:
# Training loop: the agent (player 0) plays against a scripted opponent,
# stores every transition in the replay buffer, and performs one
# actor-critic update per environment step once `initialize` steps exist.
rrecord = []
totalstep = 0
disp_number = 50  # logging period, in episodes

# torch.save does not create missing parent directories.
os.makedirs('models', exist_ok=True)

for ite in range(episodes):
    obs = torch.from_numpy(game.reset(display=0)).float().to(device)
    done = False
    rsum = 0

    # Exploration probability decays linearly from 1 and reaches the floor
    # `epsilon` after 60% of the episodes; it is constant within an episode.
    explore_prob = max(epsilon, 1-(1-epsilon)/episodes/0.6*ite)

    while not done:
        totalstep += 1
        # Default for every step. Without it a lost game reused the reward of
        # the previous step (or raised NameError on the very first step).
        reward = 0

        if np.random.rand() < explore_prob:
            action = random.choice(range(actsize))
        else:
            # Acting needs no autograd graph.
            with torch.no_grad():
                action_probs = actor(obs.to(device))
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs.cpu().numpy())

        # Agent move, followed by the scripted opponent's reply unless the
        # agent's move already finished the game.
        obs_post, _, done = game.step(action)
        if not game.done:
            op_action = np.argmax(game.hand[game.player],axis=0)[0]
            obs_post, _, done = game.step(op_action)

        if game.done:
            won = 1 if game.winner == 0 else 0
            reward = won
            wins.append(won)

        rsum += reward

        # Keep every stored observation as a float32 tensor on `device` so
        # batches can be stacked without per-sample copies.
        obs_post = torch.as_tensor(obs_post, dtype=torch.float32, device=device)

        buffer.append((obs, action, reward, obs_post, done))

        # Sampling without replacement needs at least `batchsize` entries.
        if totalstep > initialize and len(buffer.buffer) >= batchsize:
            samples = buffer.sample(batchsize)
            batch_obs, batch_act, batch_rew, batch_next, batch_done = zip(*samples)

            states = torch.stack([s.to(device) for s in batch_obs])
            actions = torch.as_tensor(np.asarray(batch_act, dtype=np.int64), device=device)
            rewards = torch.as_tensor(np.asarray(batch_rew, dtype=np.float32), device=device)
            states_post = torch.stack([s.to(device) for s in batch_next])
            dones = torch.as_tensor(np.asarray(batch_done, dtype=np.float32), device=device)

            train(states, actions, rewards, states_post, dones, actor, critic, actor_optimizer, critic_optimizer)

        obs = obs_post

    rrecord.append(rsum)
    if ite % disp_number == 0:
        win_rate = int(np.mean(wins[-disp_number:])*100) if wins else 0
        print('iteration {} ave reward {}, win rate {}'.format(ite, np.mean(rrecord[-disp_number:]), win_rate))

    # Stop early once the win rate over the last 100 games exceeds 90%.
    if ite > 100:
        ave100 = np.mean(wins[-100:])
        if ave100 > 0.9:
            torch.save(actor.state_dict(), 'models/actor_solved')
            torch.save(critic.state_dict(), 'models/critic_solved')
            print("Solved after %d episodes." % ite)
            break


In [None]:
# Persist the final weights, tagged with the last episode index.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first.
os.makedirs('models', exist_ok=True)
torch.save(actor.state_dict(), 'models/actor_model_'+str(ite)+'.pth' )
torch.save(critic.state_dict(), 'models/critic_model_'+str(ite)+'.pth')

In [None]:
def rolling_mean(values, window):
    """Trailing mean of ``values`` over at most ``window`` entries.

    Element ``i`` of the result averages ``values[max(0, i + 1 - window) : i + 1]``,
    i.e. the current entry and up to ``window - 1`` preceding ones. Early
    entries therefore use a shorter window instead of producing NaN. The
    computation uses prefix sums and is linear in ``len(values)``.
    """
    values = np.asarray(values, dtype=float)
    prefix = np.concatenate(([0.0], np.cumsum(values)))
    stop = np.arange(1, len(values) + 1)
    start = np.maximum(stop - window, 0)
    return (prefix[stop] - prefix[start]) / (stop - start)


# plot [episode, win rate] history
win_window = 1000  # number of most recent games averaged per point
wr = rolling_mean(wins, win_window)
x = np.arange(1, len(wr) + 1)
plt.plot(x, wr)
plt.title('win rate (trailing {} games)'.format(win_window))
plt.xlabel('episodes')
plt.ylabel('win rate')
plt.show()

In [None]:
import os

# Evaluation: play games with the trained actor against the scripted opponent
# and report the fraction won by player 0. No learning happens here.
# NOTE: `rrecord`, `totalstep` and `wins` are reset, which discards the
# values collected during training.
eval_episodes = 1000  # number of evaluation games

rrecord = []
totalstep = 0
wins = []
for ite in range(eval_episodes):
    obs = torch.from_numpy(game.reset(display=0)).float().to(device)
    done = False
    rsum = 0

    while not done:
        totalstep += 1
        # Default for every step; a lost game must not reuse an old reward.
        reward = 0

        # Always act from the actor's own distribution. The training-time
        # exploration schedule is deliberately not applied: with `ite` below
        # 1000 it made at least 85% of the moves uniformly random, so the
        # reported win rate barely reflected the trained policy.
        with torch.no_grad():
            action_probs = actor(obs.to(device))
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs.cpu().numpy())

        # Agent move, followed by the scripted opponent's reply unless the
        # agent's move already finished the game.
        obs_post, _, done = game.step(action)
        if not game.done:
            op_action = np.argmax(game.hand[game.player],axis=0)[0]
            obs_post, _, done = game.step(op_action)

        if game.done:
            won = 1 if game.winner == 0 else 0
            reward = won
            wins.append(won)

        rsum += reward

        obs = torch.as_tensor(obs_post, dtype=torch.float32, device=device)

    rrecord.append(rsum)

print(np.mean(wins))
