In [1]:
# Install the Kaggle command-line client; a later cell uses it to download the dataset.
!pip install kaggle



In [13]:
from google.colab import files

# Bind the return value so the cell does not echo it: files.upload() returns
# a dict of {filename: file bytes}, and for kaggle.json those bytes contain
# the account's API key. A key that has already appeared in saved output
# should be revoked and regenerated.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [14]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [15]:
# Copy the uploaded credentials file into ~/.kaggle/.
! cp kaggle.json ~/.kaggle/

In [16]:
# Mode 600: readable and writable by the owner only, since the file holds an API key.
! chmod 600 ~/.kaggle/kaggle.json

In [17]:
# Download the arevel/chess-games dataset archive via the Kaggle CLI.
! kaggle datasets download arevel/chess-games

Downloading chess-games.zip to /content
100% 1.45G/1.45G [01:04<00:00, 27.8MB/s]
100% 1.45G/1.45G [01:04<00:00, 23.9MB/s]


In [18]:
# Install the `chess` package (imported later as `chess`); -q keeps pip's output quiet.
!pip install chess -q

In [19]:
# File letter -> zero-based column index ('a' -> 0 ... 'h' -> 7).
letter_2_num = {file_letter: column for column, file_letter in enumerate('abcdefgh')}

In [20]:
# Zero-based column index -> file letter (0 -> 'a' ... 7 -> 'h').
num_2_letter = dict(enumerate('abcdefgh'))

We represent each piece type (pawn, rook, knight, bishop, queen, king) as its own 8×8 layer: a white piece is 1, a black piece is -1, and every other square is 0. This lets us turn a chess board into a stack of matrices.

In [21]:
import numpy as np
import re

In [22]:
def create_rep_layer(board, type):
  """Build the feature layer for a single piece type.

  Args:
    board: object whose ``str()`` is a text diagram with one rank per line
      and squares separated by single spaces (e.g. a ``chess.Board``).
    type: lowercase piece letter ('p', 'r', 'n', 'b', 'q' or 'k'). The name
      shadows the ``type`` builtin but is kept so existing callers still work.

  Returns:
    np.ndarray of ints with one row per diagram line: -1 where the lowercase
    (black) letter stands, 1 where the uppercase (white) letter stands, and
    0 on every other square.
  """
  # Translate whole squares rather than chaining regex substitutions. The
  # previous white-piece pattern contained a leading space, which swallowed
  # the separator between squares and made the int() conversion fail.
  value_of = {type: -1, type.upper(): 1}
  board_mat = [
      [value_of.get(square, 0) for square in row.split(' ')]
      for row in str(board).split('\n')
  ]
  return np.array(board_mat)

In [23]:
def board2rep(board):
  """Encode a board as a stack of per-piece-type layers.

  One layer is built with ``create_rep_layer`` for each of pawn, rook,
  knight, bishop, queen and king (in that order); the layers are then
  stacked along a new leading axis to form the tensor fed to the CNN.
  """
  piece_letters = 'prnbqk'
  return np.stack([create_rep_layer(board, letter) for letter in piece_letters])

In [24]:
def move_2_rp(move, board):
  """Encode a move as two one-hot 8x8 planes: source square and target square.

  Args:
    move: move in SAN notation, playable on ``board``.
    board: board the move is played on; the move is pushed and immediately
      popped again, so the move stack ends up as it started.

  Returns:
    np.ndarray made by stacking the "from" plane and the "to" plane.
  """
  # Play and take back the move to obtain it in coordinate form,
  # e.g. 'd4e5' = take the piece on d4 and move it to e5.
  board.push_san(move)
  uci = str(board.pop())

  def one_hot_plane(square):
    # Row index is 8 minus the rank digit; column comes from the file letter.
    plane = np.zeros((8, 8))
    plane[8 - int(square[1]), letter_2_num[square[0]]] = 1
    return plane

  return np.stack([one_hot_plane(uci[0:2]), one_hot_plane(uci[2:4])])


In [25]:
def create_move_list(s):
  """Split a game's move text into a list of individual moves.

  Move numbers such as ``'12. '`` are stripped, the remainder is split on
  single spaces, and the final token is dropped (in the sample input
  ``'1. e4 e5 1-0'`` that token is the result marker).

  Args:
    s: move text, e.g. ``'1. e4 e5 2. Nf3 Nc6 1-0'``.

  Returns:
    List of move strings to loop over and convert to matrix form.
  """
  # Raw string: '\d' and '\.' are invalid escapes in a normal string literal.
  without_numbers = re.sub(r'\d*\. ', '', s)
  return without_numbers.split(' ')[:-1]

In [26]:
import pandas as pd

In [27]:
!unzip /content/chess-games.zip

Archive:  /content/chess-games.zip
  inflating: chess_games.csv         


In [29]:
# Read only the two columns used below: the move text ('AN') and White's
# rating ('WhiteElo').
chess_data_raw = pd.read_csv(
    '/content/chess_games.csv',
    usecols=['AN', 'WhiteElo'],
)

In [30]:
# Keep only games in which White is rated above 2000.
chess_data = chess_data_raw.loc[chess_data_raw['WhiteElo'].gt(2000)]

In [36]:
# Keep only the move text, then drop every game whose text contains "{".
# regex=False matches the brace literally, na=False treats missing text as
# "no match", and ~ is the boolean-mask negation operator (unary minus is
# not meant for boolean data).
chess_data = chess_data[['AN']]
chess_data = chess_data[~chess_data['AN'].str.contains('{', regex=False, na=False)]

In [37]:
# Filter out the short games: keep rows whose move text is longer than 20 characters.
chess_data = chess_data.loc[chess_data['AN'].str.len().gt(20)]

In [38]:
# Number of games left after filtering.
print(len(chess_data))

883376


In [41]:
import chess

In [59]:
import torch
from torch.utils.data import Dataset

class ChessDataset(Dataset):
  """Randomly sampled (position, next move) training pairs.

  Every item is produced by drawing a random game and a random move index
  within it, so the ``index`` passed to ``__getitem__`` is ignored and
  ``__len__`` only determines how many samples make up one pass.

  Args:
    games: sequence of move-text strings (e.g. the 'AN' column).
    epoch_size: value reported by ``__len__``; defaults to 40_000.
  """

  def __init__(self, games, epoch_size=40_000):
    super(ChessDataset, self).__init__()
    self.games = games
    self.epoch_size = epoch_size
    # Positional view of the games, so lookup does not depend on the
    # container's own index labels.
    self._games_array = np.asarray(games, dtype=object)

  def __len__(self):
    return self.epoch_size

  def __getitem__(self, index):
    """Return ``(x, y)``: an encoded position and the encoded move played from it.

    Raises ValueError (from ``np.random.randint``) if the drawn game has
    fewer than two moves.
    """
    # Sample from the games this dataset was built with, not from a
    # notebook-level global.
    game_i = np.random.randint(len(self._games_array))
    random_game = self._games_array[game_i]  # pick a random game
    moves = create_move_list(random_game)
    # Pick a random move index; the upper bound excludes the final move.
    game_state_i = np.random.randint(len(moves)-1)
    next_move = moves[game_state_i]
    moves = moves[:game_state_i]
    board = chess.Board()
    for move in moves:
      board.push_san(move)
    x = board2rep(board) # convert to matrix
    y = move_2_rp(next_move, board) # convert to matrix
    if game_state_i % 2 == 1:
      # Odd move index: an odd number of moves has been played, so it is
      # Black's turn. Flip the signs so the side to move is always the one
      # represented by positive values.
      x *= -1
    return x,y


In [60]:
from torch.utils.data import DataLoader

# Wrap the move-text column in the dataset and batch it for training.
data_train = ChessDataset(chess_data['AN'])
data_train_loader = DataLoader(
    data_train,
    batch_size=32,
    shuffle=True,
    # drop_last discards the final mini-batch when it has fewer than
    # batch_size examples.
    drop_last=True,
)

In [61]:
from torch import nn

In [62]:
class module(nn.Module):
  """Residual block: conv -> BN -> SELU -> conv -> BN, add the input, SELU.

  Args:
    hidden_size: number of input and output channels of both convolutions.
  """

  def __init__(self, hidden_size):
    super(module, self).__init__()
    # Two 3x3 convolutions. Stride 1 with padding 1 keeps height and width
    # unchanged, so the output can be added to the block's input.
    self.conv1 = nn.Conv2d(hidden_size, hidden_size, 3, stride=1, padding=1)
    self.conv2 = nn.Conv2d(hidden_size, hidden_size, 3, stride=1, padding=1)
    # Batch norm after each convolution: normalizes every channel towards
    # mean 0 / std 1 to keep activations in a stable range during training.
    self.bn1 = nn.BatchNorm2d(hidden_size)
    self.bn2 = nn.BatchNorm2d(hidden_size)
    self.activation1 = nn.SELU()
    self.activation2 = nn.SELU()

  def forward(self,x):
    # Keep a reference for the skip connection. No clone is needed because
    # nothing below modifies x in place.
    residual = x
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.activation1(x)
    x = self.conv2(x)
    x = self.bn2(x)
    x = x + residual
    # Previously called self.activation, which is never defined and raised
    # AttributeError on the first forward pass.
    x = self.activation2(x)
    return x



More about BatchNorm:

1. During training, for each mini-batch of data, BatchNorm computes the mean and variance of each channel over every element of that channel in the mini-batch (for `BatchNorm2d`, over the batch dimension and both spatial dimensions).

2. It then normalizes the activations of the current mini-batch using these computed means and standard deviations.

3. After normalization, BatchNorm applies a scaling parameter (gamma) and a shifting parameter (beta) to the normalized activations.

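4. The output of the BatchNorm layer is the scaled and shifted normalized activations. At inference time (`model.eval()`), the layer uses running averages of the mean and variance collected during training instead of the statistics of the current batch.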

**Mean of Zero:**
Having a mean of zero ensures that the data is centered around the origin. This can be beneficial for a few reasons:

1. Balanced Updates: When a layer's inputs have a non-zero mean (for example, when they are all positive), the gradients for the weights feeding a unit tend to share the same sign, so the updates zig-zag and convergence slows down. By centering the data at zero, we encourage the weights to update in a more balanced manner.

2. Faster Convergence: Neural networks tend to converge faster when the input data is centered. This is because gradients flow more easily through the network, especially in symmetric structures like deep networks.

3. Avoids Biases: With a zero mean, the network doesn't have to worry about learning a bias term initially, as it can start with the assumption that the data is centered.


**Standard Deviation of One:**
Normalizing the data to have a standard deviation of one has several advantages:

1. Stable Gradients: When data has a standard deviation that varies widely, the gradients during backpropagation can also vary widely. This can lead to exploding or vanishing gradients, which hinders training. By normalizing to have a standard deviation of one, we keep the gradients within a reasonable range, making training more stable.

2. Similar Scale: Ensuring that the data has a standard deviation of one helps in having a consistent scale of input for each layer. This can improve the learning speed and performance of the network.

3. Independence from Scale: Normalizing to a fixed standard deviation removes the dependency of the learning process on the scale of the input. The network becomes more robust to changes in input scaling.

In [63]:
import torch.nn.functional as F

In [64]:
class ChessNet(nn.Module):
    """Convolutional network: 6 input planes in, 2 output planes out.

    An input convolution followed by ReLU lifts the 6 planes to
    ``hidden_size`` channels, ``hidden_layers`` instances of ``module`` are
    applied in sequence, and an output convolution reduces to 2 channels.
    Every convolution is 3x3 with stride 1 and padding 1.
    """

    def __init__(self, hidden_layers=4, hidden_size=200):
        super().__init__()
        self.hidden_layers = hidden_layers
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.input_layer = nn.Conv2d(6, hidden_size, **conv_kwargs)
        # Stack of blocks built from the `module` class defined above.
        self.module_list = nn.ModuleList(
            [module(hidden_size) for _ in range(hidden_layers)]
        )
        self.output_layer = nn.Conv2d(hidden_size, 2, **conv_kwargs)

    def forward(self, x):
        x = F.relu(self.input_layer(x))
        for block in self.module_list:
            x = block(x)
        return self.output_layer(x)

In [65]:
def checkmate_single(board):
    """Return a legal move that gives checkmate at once, or None if there is none.

    The search runs on a copy, so the caller's board is left untouched.
    """
    scratch = board.copy()
    for candidate in list(scratch.legal_moves):
        # Try the move, note whether it mates, then take it back.
        scratch.push_uci(str(candidate))
        delivers_mate = scratch.is_checkmate()
        played = scratch.pop()
        if delivers_mate:
            return played
    return None

In [66]:
def distribution_over_moves(vals, sharpness=3):
    """Turn raw scores into a sharpened probability distribution.

    Equivalent to a softmax followed by raising every probability to the
    power ``sharpness`` and renormalizing, which is the same as a softmax
    over ``sharpness * vals``.

    Args:
        vals: sequence of raw scores (logits).
        sharpness: exponent applied to the softmax probabilities; the
            default of 3 reproduces the original behavior.

    Returns:
        np.ndarray of float64 probabilities summing to 1, or an empty array
        when ``vals`` is empty.
    """
    logits = np.asarray(vals, dtype=float)
    if logits.size == 0:
        return logits
    # Subtracting the maximum leaves the result unchanged but keeps np.exp
    # from overflowing to inf (and the division from producing NaN).
    scaled = sharpness * (logits - logits.max())
    probs = np.exp(scaled)
    return probs / probs.sum()

In [67]:
def predict(x):
    """Run the notebook-level ``model`` on ``x`` and return a NumPy array.

    Switches the model to evaluation mode and disables gradient tracking
    for the forward pass; the result is moved to the CPU before conversion.
    """
    model.eval()
    with torch.no_grad():
        return model(x).cpu().numpy()

In [68]:
def choose_move(board, player, color):
    """Pick a move for the side ``color`` using the network's two output planes.

    An immediate checkmate is played if one exists. Otherwise the source
    square is sampled from the "from" plane (restricted to squares that have
    a legal move), and the legal move from that square with the highest
    "to" plane score is returned.

    Args:
        board: current position (``chess.Board``).
        player: unused; kept so existing callers still work.
        color: side to move; for ``chess.BLACK`` the encoding is negated so
            the network always plays the positive-valued pieces.

    Returns:
        The chosen legal move.

    Raises:
        ValueError: if the position has no legal moves.
    """
    legal_moves = list(board.legal_moves)
    if not legal_moves:
        raise ValueError('choose_move called on a position with no legal moves')

    mating_move = checkmate_single(board)
    if mating_move is not None:
        return mating_move

    # Put the input on whatever device the model lives on instead of
    # assuming 'cuda'.
    device = next(model.parameters()).device
    x = torch.as_tensor(board2rep(board), dtype=torch.float32, device=device)
    if color == chess.BLACK:
        x = -x
    planes = predict(x.unsqueeze(0))[0]
    from_plane, to_plane = planes[0], planes[1]

    def square_score(plane, square):
        # Row index is 8 minus the rank digit; column comes from the file letter.
        return plane[8 - int(square[1]), letter_2_num[square[0]]]

    # Distinct source squares, sorted so the sampling order is deterministic
    # for a given random state.
    froms = sorted({str(legal_move)[:2] for legal_move in legal_moves})
    probs = distribution_over_moves([square_score(from_plane, sq) for sq in froms])
    chosen_from = str(np.random.choice(froms, size=1, p=probs)[0])[:2]

    # Score only the moves leaving the chosen square. Every other move gets
    # -inf, so it can never win the argmax; a filler of 0 could, whenever all
    # candidate scores were negative.
    to_scores = [
        square_score(to_plane, str(legal_move)[2:4])
        if str(legal_move)[:2] == chosen_from else -np.inf
        for legal_move in legal_moves
    ]
    return legal_moves[int(np.argmax(to_scores))]

In [69]:
# Separate cross-entropy criteria for the "from" and "to" outputs.
metric_from, metric_to = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

In [None]:
import torch.optim as optim

# Train on the GPU when one is available; model and batches must share a device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ChessNet(hidden_layers=4, hidden_size=200).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
record = []  # training loss of every step
num_epochs = 5
for epoch in range(num_epochs):
    # predict() switches the model to eval mode, so re-enable training mode
    # (batch-norm batch statistics) at the start of every epoch.
    model.train()
    for i, (inputs, labels) in enumerate(data_train_loader):
        inputs = inputs.float().to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Treat each 8x8 plane as one 64-way classification over squares:
        # flatten the plane to 64 logits and use the flat index of the single
        # 1 in the label plane as the target class. Taking argmax over rows
        # only (the previous form) produced one target per column and pushed
        # every column without the 1 towards row 0.
        logits_from = outputs[:, 0].flatten(start_dim=1)
        logits_to = outputs[:, 1].flatten(start_dim=1)
        target_from = labels[:, 0].flatten(start_dim=1).argmax(dim=1)
        target_to = labels[:, 1].flatten(start_dim=1).argmax(dim=1)
        # Reuse the criteria defined above instead of building new ones each step.
        loss_from = metric_from(logits_from, target_from)
        loss_to = metric_to(logits_to, target_to)
        loss = loss_from + loss_to
        loss.backward()
        optimizer.step()
        record.append(loss.item())
        if i % 1000 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, len(data_train_loader), loss.item()))