In [1]:
import collections

from kaggle_environments import make, evaluate
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Loading environment football failed: No module named 'gfootball'


In [2]:
class Net(nn.Module):
    def __init__(self, dim_in, hidden_dim, dim_out):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(dim_in, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, dim_out)
    
    def forward(self, input):

        x = F.relu(self.fc1(input))
        x = F.relu(self.fc2(x))
        return F.relu(self.fc3(x))

In [3]:
Experience = collections.namedtuple('Experience',
           field_names=['state', 'action', 'reward', 'next_state', 'done'])
class ExperienceReplay:
  def __init__(self, capacity):
      self.buffer = []
      self.capacity = capacity
  def __len__(self):
      return len(self.buffer)
  def append(self, experience):
      if len(self.buffer) >= self.capacity:
          self.buffer.pop()
      self.buffer.append(experience)

  def sample(self, batch_size, device):
      indices = np.random.choice(len(self.buffer), batch_size,
                replace=False)
      zipped = list(zip(*[self.buffer[i] for i in indices]))
      return torch.tensor(zipped[0], dtype = torch.float).to(device), torch.tensor(zipped[1], dtype = torch.long).to(device), torch.tensor(zipped[2], dtype = torch.float).to(device), torch.tensor(zipped[3], dtype = torch.float).to(device), torch.tensor(zipped[4], dtype = torch.float).to(device)


In [4]:
def takeAction(actionList, device,epsilon):
        if np.random.random() < epsilon:
            return torch.tensor(np.random.choice(len(actionList))).to(device)
        else:
            return torch.argmax(actionList).to(device)

In [5]:
def changeReward(reward):
    if reward == None:
        return -10
    if reward == 0:
        return 1/42
    else:
        return reward

In [6]:
def generateEpisodes(amount, model,replayBuffer, env,device,epsilon):
    batchReward = 0
    firstQValues = []
    start = True
    steps = 0
    with torch.no_grad(): 
        for _ in range(amount):
            done = False
            env.reset()
            trainer = env.train([None, "random"])
            obs = trainer.reset()
            while not done:
                tensor = torch.tensor(obs.board, dtype = torch.float).to(device)
                qValues = model(tensor)
                action = takeAction(qValues, device,epsilon)
                if start:
                    start = False
                    firstQValues.append(qValues)
                old_obs = obs
                obs, reward, done, info = trainer.step(action.item())
                reward = changeReward(reward)
                exp = Experience(old_obs.board, action, reward, obs.board, float(done))
                replayBuffer.append(exp)
                batchReward+=reward
                steps+=1
    return batchReward/amount, firstQValues, steps/amount

In [7]:
def train(model, qModel, replayBuffer, optimizer, loss_function, device,batchSize, alpha, gamma):
    states, actions, rewards, next_states, dones = replayBuffer.sample(batchSize, device)
    optimizer.zero_grad()
    value = torch.index_select(model(states), 1 , actions)[1]
    qValue = torch.max(qModel(next_states),1)[0]
    target = rewards + (gamma * qValue)*(1-dones)
    loss = loss_function(value, target)
    loss.backward()
    optimizer.step()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    return loss    

In [8]:
def get_win_percentages(agent1, agent2, n_rounds=100):
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    # Agent 1 goes first (roughly) half the time          
    outcomes = evaluate("connectx", [agent1, agent2], config, [], n_rounds//2)
    # Agent 2 goes first (roughly) half the time      
    outcomes += [[b,a] for [a,b] in evaluate("connectx", [agent2, agent1], config, [], n_rounds-n_rounds//2)]
    print("Agent 1 Win Percentage:", np.round(outcomes.count([1,-1])/len(outcomes), 2))
    print("Agent 2 Win Percentage:", np.round(outcomes.count([-1,1])/len(outcomes), 2))
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))

In [9]:

def agent1(obs, config):
    model = Net(config.columns*config.rows, 300, config.columns)
    model.load_state_dict(torch.load("model_state"))
    with torch.no_grad():
        state = torch.tensor(obs['board'], dtype=torch.float)
        print(state)
        col = model(state).argmax().item()
        return col

In [10]:
def train2(hiddenDim, episodes, batchSize, device):
    env = make("connectx")
    env.render()
    model = Net(env.configuration.columns*env.configuration.rows, hiddenDim, env.configuration.columns).to(device)
    qModel = Net(env.configuration.columns*env.configuration.rows, hiddenDim, env.configuration.columns).to(device)
    qModel.load_state_dict(model.state_dict())
    qModel.eval()
    loss_function = nn.MSELoss()
    optimizer = optim.Adam(params = model.parameters(), lr=0.1)
    buffer = ExperienceReplay(10000)
    idx = 0
    while True:
        epsilon =  (0.9-0.1)*max((100-idx)/100,0) + 0.1
        batchReward, firstQValues, steps = generateEpisodes(episodes, model, buffer, env,device, epsilon)
        loss = train(model, qModel, buffer, optimizer,loss_function,device,batchSize, 0.01, 0.99)  
        if idx % 20 == 0:
            qModel.load_state_dict(model.state_dict())
            print("idx: " + str(idx) + " meanReward generateEpisodes: " +  str(batchReward) + " meanLoss: " + str(loss/batchSize))
            print("first qs" + str(firstQValues))
            print("steps: " + str(steps) )
            torch.save(model.state_dict(), "model_state")
            get_win_percentages(agent1, "random")
        idx+=1
    

In [11]:
train2(300, 100, 32, device)

idx: 0 meanReward generateEpisodes: -2.347619047619075 meanLoss: tensor(0.0042, device='cuda:0', grad_fn=<DivBackward0>)
first qs[tensor([0.0148, 0.0000, 0.0086, 0.0000, 0.0000, 0.0344, 0.0026],
       device='cuda:0')]
steps: 10.34
Agent 1 Win Percentage: 0.7
Agent 2 Win Percentage: 0.02
Number of Invalid Plays by Agent 1: 28
Number of Invalid Plays by Agent 2: 0
idx: 20 meanReward generateEpisodes: -2.104285714285744 meanLoss: tensor(25.1431, device='cuda:0', grad_fn=<DivBackward0>)
first qs[tensor([1.3702, 0.0000, 0.0000, 4.3049, 0.0000, 0.0000, 0.0000],
       device='cuda:0')]
steps: 10.06
Agent 1 Win Percentage: 0.76
Agent 2 Win Percentage: 0.04
Number of Invalid Plays by Agent 1: 20
Number of Invalid Plays by Agent 2: 0
idx: 40 meanReward generateEpisodes: -2.907142857142891 meanLoss: tensor(9.2475, device='cuda:0', grad_fn=<DivBackward0>)
first qs[tensor([0., 0., 0., 0., 0., 0., 0.], device='cuda:0')]
steps: 8.68
Agent 1 Win Percentage: 0.74
Agent 2 Win Percentage: 0.06
Number 

KeyboardInterrupt: 

In [4]:
env = make("connectx", debug=True)
model = Net(env.configuration.columns*env.configuration.rows, 300, env.configuration.columns)
model.load_state_dict(torch.load("model_state"))
def agent(observation, configuration):
    with torch.no_grad():
        result = model(torch.tensor(observation.board, dtype = torch.float))
        print(result)
        result = int(torch.argmax(result))
        return result

In [5]:
env.reset()
# Play as the first agent against default "random" agent.
env.run([agent, "random"])
env.render(mode="ipython", width=500, height=450)

tensor([36.6298, 43.0849, 33.7678, 36.0474, 42.9718, 43.7050, 40.6634])
tensor([37.8858, 44.3841, 33.5597, 35.1042, 44.7194, 45.5465, 41.7299])
tensor([39.0033, 46.4624, 33.7962, 33.1300, 44.6875, 47.8918, 44.8106])
tensor([38.9665, 46.3897, 33.7872, 33.2183, 44.7269, 47.7831, 44.6775])


In [23]:
model = Net(env.configuration.columns*env.configuration.rows, 300, env.configuration.columns)
model.load_state_dict(torch.load("model_state"))
def agent1(obs, config):
    with torch.no_grad():
        state = torch.tensor(obs['board'], dtype=torch.float)
        col = model(state).argmax().item()
        return col

In [16]:
get_win_percentages(agent1, 'negamax')

Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.39
Number of Invalid Plays by Agent 1: 61
Number of Invalid Plays by Agent 2: 0


In [17]:
model = Net(env.configuration.columns*env.configuration.rows, 300, env.configuration.columns)
model.load_state_dict(torch.load("model_state"))
for param in model.parameters():
  print(param.data)

tensor([[-0.0904,  0.0919,  0.1277,  ..., -0.0106,  0.1755, -0.1241],
        [-0.0072,  0.1276,  0.0015,  ...,  0.0168,  0.0184,  0.0091],
        [ 0.1542, -0.0082,  0.0211,  ..., -0.0545, -0.0861, -0.1002],
        ...,
        [-0.1185,  0.0440, -0.0660,  ..., -0.0116, -0.0241, -0.1110],
        [-0.0826, -0.1225, -0.0858,  ...,  0.0593,  0.0191, -0.0637],
        [ 0.1158, -0.1356, -0.1062,  ..., -0.0026, -0.1227, -0.1448]])
tensor([ 0.0595,  0.0143,  0.0345,  0.0101, -0.0403,  0.0387,  0.0478, -0.0193,
        -0.1549,  0.0101, -0.1366, -0.0428,  0.0471, -0.0494,  0.0072, -0.0443,
         0.0056, -0.1984, -0.0797, -0.0769, -0.2006, -0.1961, -0.0321, -0.1634,
         0.0785, -0.1429, -0.0294, -0.0915,  0.0035,  0.0097,  0.0786,  0.0014,
        -0.1533, -0.0602,  0.0042, -0.2130,  0.1134, -0.1177, -0.0034, -0.0033,
        -0.0710, -0.1049, -0.1177,  0.0416, -0.2251, -0.0176, -0.0630,  0.0462,
        -0.0784,  0.0256, -0.1192, -0.1507, -0.0229,  0.0679,  0.0833, -0.0909,
      