In [1]:
import _pickle as pickle
import os
import gc
from tqdm.auto import tqdm
from multiprocess import Pool
from chessboard import display
from chess_env import *
from tf_transformers import *
from chess_utils import *
import tensorflow as tf
def save(file,name, folder = ""):
    """Pickle ``file`` to ``<name>.pickle``, optionally inside ``folder``.

    Args:
        file: The object to serialize (it is passed to ``pickle.dump``).
        name: Target file name without the ``.pickle`` extension.
        folder: Directory, relative to the working directory, to write into.
            The empty string (default) writes into the working directory.

    Raises:
        OSError: If the target file cannot be opened for writing
            (for example because ``folder`` does not exist).
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The original ended with `outfile.close` (no parentheses), which never
    # called close(); the `with` block closes and flushes the file
    # deterministically, even if pickling raises.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile, protocol=4)
    
def load(name, folder = ""):
    """Unpickle and return the object stored in ``<name>.pickle``.

    Args:
        name: File name without the ``.pickle`` extension.
        folder: Directory, relative to the working directory, to read from.
            The empty string (default) reads from the working directory.

    Returns:
        The object produced by ``pickle.load`` on that file.

    Raises:
        OSError: If the file cannot be opened for reading.

    Security:
        Unpickling can execute arbitrary code. Only call this on files that
        come from a trusted source.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The original wrote `outfile.close` (no parentheses), so the handle was
    # never closed explicitly; the `with` block closes it deterministically.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

pygame 2.0.1 (SDL 2.0.14, Python 3.8.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Show the devices TensorFlow can see; the returned list is displayed as the cell output.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Tokenization

In [None]:
# Load the pickled dataset from ./data/44M_fen_dataset.pickle through the `load` helper.
df= load('44M_fen_dataset', 'data')

In [None]:
# Preview the first rows (assumes `df` is a pandas DataFrame — TODO confirm).
df.head()

In [None]:
def f(x):
    """Turn one input into a pair of integer index arrays.

    ``get_pos_centric_rep(x)`` returns two sequences. Every element of the
    first is looked up in ``paw_to_ind`` and every element of the second in
    ``pos_to_ind``.

    Args:
        x: A single input accepted by ``chess_utils.get_pos_centric_rep``
            (the notebook passes entries of the 'fen' column).

    Returns:
        tuple: ``(pa, po)`` where ``pa`` is an ``int8`` array of
        ``paw_to_ind`` indices and ``po`` is an ``int16`` array of
        ``pos_to_ind`` indices.
    """
    # The imports live inside the function body, exactly as before
    # (presumably so each Pool worker has them in scope — TODO confirm).
    import numpy as np
    from chess_utils import get_pos_centric_rep, pos_to_ind, paw_to_ind

    def _lookup(tokens, table, dtype):
        # Map every token through `table`, then cast to the compact dtype.
        return np.array([table[token] for token in tokens]).astype(dtype)

    paw_tokens, pos_tokens = get_pos_centric_rep(x)
    encoded_paw = _lookup(paw_tokens, paw_to_ind, 'int8')
    encoded_pos = _lookup(pos_tokens, pos_to_ind, 'int16')
    return encoded_paw, encoded_pos

In [None]:
# Take the whole 'fen' column as an array; the commented-out slice restricts a run to the first 100000 rows.
fens = df['fen'].values#[:100000]

In [None]:
%%time
# Apply `f` to every entry of `fens` using 32 worker processes.
# The `with` block shuts the pool down even when `f` raises inside a worker;
# the original only reached `p.close()` on success and never joined the workers.
with Pool(32) as p:
    # p.map returns one (pa, po) pair per entry; zip(*...) regroups them into
    # (all pa arrays, all po arrays).
    unzipped = list(zip(*p.map(f, fens)))
pawns, pos = np.array(unzipped[0]).astype('int8'), np.array(unzipped[1]).astype('int16')

In [None]:
# Prepend one column of zeros to every row of both token matrices.
# The zero column is created directly in the target integer dtype: np.zeros
# defaults to float64, which made np.concatenate upcast the entire matrix to
# float64 before it was cast back.
# NOTE: this cell is not idempotent — running it again prepends another column.
pawns = np.concatenate([np.zeros((pawns.shape[0], 1), dtype='int8'), pawns.astype('int8', copy=False)], axis = 1)
pos = np.concatenate([np.zeros((pos.shape[0], 1), dtype='int16'), pos.astype('int16', copy=False)], axis = 1)

In [None]:
# Per-row 'rank' values as int16.
rank =df['rank'].values.astype('int16')
# Encode 'color' as 0 for 'w' and 1 for 'b'.
# Work on a copy: `.values` can be a view of the DataFrame's own storage, so
# the original in-place assignments could silently rewrite df['color'].
color = df['color'].values.copy()
color[color == 'w'] = 0
color[color == 'b'] = 1
# Any entry that is neither 'w' nor 'b' is left untouched above and makes
# this cast fail loudly instead of being encoded silently.
color = color.astype('int8')
# Persist the four arrays to ./data/44M_tokens_pos.pickle.
save((pawns, pos, rank, color), '44M_tokens_pos', 'data')

## Data Preparation

In [None]:
# Reload the four arrays written by the tokenization section (./data/44M_tokens_pos.pickle).
(pawns, pos, rank, color) = load('44M_tokens_pos', 'data')

In [None]:
## Masking
# Per-position masking probabilities:
#   p0 is used where pawns == 0, p1 where pawns != 0 (see the two `*=` lines below).
p0 = 0.015
p1 = 0.25

# NOTE(review): no random seed is set in this cell, so the masks differ on every run.
# Draw an independent 0/1 value for every position: 1 with probability p0.
mask0 = np.random.choice(np.array([0,1]).astype('int8'), p = [1-p0,p0],size = (pawns.shape[0], pawns.shape[1])).astype('int8')
print('mask_0 initialized')
# Same draw with probability p1.
mask1 = np.random.choice(np.array([0,1]).astype('int8'), p = [1-p1,p1],size = (pawns.shape[0], pawns.shape[1])).astype('int8')
print('mask_1 initialized')
# Keep mask0 only where the token is 0 ...
mask0 *= (pawns == 0)*1
print('mask_0 created')
# ... and mask1 only where the token is non-zero.
mask1 *= (pawns != 0)*1
print('mask_1 created')
# The two masks cover disjoint positions, so their sum is still a 0/1 array.
mask = mask0 + mask1
print('mask_created')

# Release the two intermediate masks before building the outputs.
del mask0
del mask1
gc.collect()

# Targets: the original token where mask == 1, the value 13 everywhere else.
pawns_out = pawns * mask + (1-mask)*13
# Inputs: the value 13 where mask == 1, the original token everywhere else.
pawns = (pawns * (1-mask)) + (13 * mask)

pawns = pawns.astype('int8')
pawns_out = pawns_out.astype('int8')

In [None]:
# Pick one row to inspect and display its masked input tokens.
ind = 10
pawns[ind]

In [None]:
# Display the target row for the same index (13 everywhere except at masked positions).
pawns_out[ind]

In [None]:
# Overlay the masked input row and its target row for the row selected by `ind`.
fig, ax = plt.subplots(figsize = (25,15))
ax.plot(pawns[ind])
ax.plot(pawns_out[ind])

In [None]:
from sklearn.model_selection import train_test_split

# Split all five row-aligned arrays in ONE call so they are guaranteed to share
# the same shuffle. The original made three separate calls that stayed aligned
# only because each one repeated test_size/random_state, and it split `color`
# twice (discarding one result).
# train_test_split returns (train, test) for each input array, in input order.
(X_train_pa, X_test_pa,
 X_train_po, X_test_po,
 y_train, y_test,
 color_train, color_test,
 rank_train, rank_test) = train_test_split(
    pawns, pos, pawns_out, color, rank, test_size=0.2, random_state=42)

# Model inputs, keyed by feature name. 'color' and 'rank' get a trailing axis
# of length 1 via [:,None].
# NOTE(review): 'pos' is int16 in the tokenization cells but is cast to int8
# here — confirm every index fits in int8.
X_train = {
    'pawns' : X_train_pa.astype('int8'),
    'pos' : X_train_po.astype('int8'),
    'color' : color_train[:,None].astype('int8'),
    'rank' : rank_train[:,None].astype('int16')
}

X_test = {
    'pawns' : X_test_pa.astype('int8'),
    'pos' : X_test_po.astype('int8'),
    'color' : color_test[:,None].astype('int8'),
    'rank' : rank_test[:,None].astype('int16')
}

y_train = y_train.astype('int8')
y_test = y_test.astype('int8')

# Drop the unsplit arrays to free memory.
del pawns
del pos
del rank
del color
gc.collect()

In [None]:
# Persist the prepared splits to ./data/dl_ready_data_pos.pickle.
save((X_train, X_test, y_train, y_test), 'dl_ready_data_pos', 'data')

## Pretraining

In [3]:
# Reload the prepared splits from ./data/dl_ready_data_pos.pickle.
(X_train, X_test, y_train, y_test) = load('dl_ready_data_pos', 'data')
# Clamp, in place, every 'rank' value of 512 or more down to 511 in both splits.
for features in (X_train, X_test):
    rank_values = features['rank']
    rank_values[rank_values >= 512] = 511

In [None]:
# Rebind the inspection names to the held-out split (inputs and targets).
pawns = X_test['pawns']
pawns_out = y_test

In [None]:
# for elt in X_train:
#     X_train[elt] = X_train[elt].astype('float32')
#     X_test[elt] = X_test[elt].astype('float32')
# y_train = y_train.astype('float32')
# y_test = y_test.astype('float32')

In [4]:
# Make 'mixed_float16' the global Keras dtype policy for layers created after this cell.
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
# Alternative kept for reference: plain float32 policy.
# policy = mixed_precision.Policy('float32')
mixed_precision.set_global_policy(policy)

# Optional XLA JIT switch, currently disabled.
# tf.config.optimizer.set_jit(True)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 3090, compute capability 8.6


In [5]:
class ChessEncoder(tf.keras.layers.Layer):
    """Embeds token/position/color/rank inputs and runs a stack of ``EncoderLayer``.

    The token embedding (14 entries) is scaled by ``sqrt(d_model)`` and summed
    with a position embedding (66 entries). Optional color (3 entries) and rank
    (512 entries) embeddings are repeated along the sequence axis and added.
    Dropout is applied to the sum before it goes through ``num_layers``
    ``EncoderLayer`` instances (``EncoderLayer`` is not defined in this
    notebook; presumably it comes from the ``tf_transformers`` star import).
    """
    def __init__(self, num_layers = 2, d_model = 512, num_heads = 8, dff = 1024, input_vocab_size = 10000, maximum_position_encoding = 512, num_types = 2, rate=0.1, bidirectional_encoder = True):
        """Create the embedding tables, the encoder stack and the dropout layer.

        Args:
            num_layers: Number of ``EncoderLayer`` instances in the stack.
            d_model: Output size of every embedding; also passed to ``EncoderLayer``.
            num_heads: Passed to ``EncoderLayer``.
            dff: Passed to ``EncoderLayer``.
            input_vocab_size: Not used by this class (table sizes are hard-coded).
            maximum_position_encoding: Not used by this class.
            num_types: Not used by this class.
            rate: Dropout rate, used here and passed to ``EncoderLayer``.
            bidirectional_encoder: Stored on the instance; not read in ``call``.
        """
        super(ChessEncoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # Embedding table sizes are fixed: 14 token ids, 66 position ids,
        # 3 color ids and 512 rank ids.
        self.embedding_pawns = tf.keras.layers.Embedding(14, d_model)
        self.embedding_pos = tf.keras.layers.Embedding(66, d_model)
        self.embedding_col = tf.keras.layers.Embedding(3, d_model)
        self.embedding_ran = tf.keras.layers.Embedding(512, d_model)
    

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
        self.bidirectional_encoder = bidirectional_encoder
        
    def call(self, x, 
             pos, 
             training, 
             color = None, rank = None
            ):
        """Encode a batch of sequences.

        Args:
            x: Token ids, looked up in the 14-entry embedding table;
                axis 1 is the sequence axis.
            pos: Position ids, looked up in the 66-entry embedding table;
                the result is added to the token embedding.
            training: Forwarded to the dropout layer and to every encoder layer.
            color: Optional ids for the 3-entry table. The embedding is
                squeezed on axis 1, so one id per sample with shape
                (batch, 1) is expected.
            rank: Optional ids for the 512-entry table; same shape
                expectation as ``color``.

        Returns:
            Tensor of shape (batch_size, input_seq_len, d_model).
        """
        seq_len = tf.shape(x)[1]
#         print(x)
        
        # Token embedding scaled by sqrt(d_model), plus the position embedding.
        x = self.embedding_pawns(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model,x.dtype))
        x += self.embedding_pos(pos)
        
#         print(x)
        
        if color is not None:
            # One embedding per sample, repeated seq_len times so it can be
            # added to every position.
            col = self.embedding_col(color)
            col = tf.squeeze(col, axis = 1)
            col = tf.keras.layers.RepeatVector(seq_len)(col)
            x += col
            
        if rank is not None:
            # Same per-sample broadcast as for `color`.
            ran = self.embedding_ran(rank)
            ran = tf.squeeze(ran, axis = 1)
            ran = tf.keras.layers.RepeatVector(seq_len)(ran)
            x += ran
        
        x = self.dropout(x, training=training)
        
        # No mask is passed to the encoder layers.
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask = None)

        return x  # (batch_size, input_seq_len, d_model)

In [None]:
# ## Pawn Centric
# import tensorflow as tf
# import numpy as np

# inputs = {
#     'pawns' : tf.keras.Input(shape = (67,)),
#     'pos' : tf.keras.Input(shape = (67,)),
#     'color' : tf.keras.Input(shape = (1,)),
#     'rank' : tf.keras.Input(shape = (1,))
# }

# chessencoder = ChessEncoder(num_layers = 8, d_model = 512, num_heads = 8, dff = 1024, 
#                             input_vocab_size = 10000, maximum_position_encoding = 66, num_types = 2, 
#                             rate=0.1, bidirectional_encoder = True)

# encoded = chessencoder(
#             inputs['pawns'],
#             inputs['pos'],
#             training = True,
#             color = inputs['color'],
#             rank = inputs['rank']
# )

# pred = tf.keras.layers.Dense(66, activation = 'softmax')(encoded)

# model = tf.keras.Model(inputs, pred)
# model.summary()

In [6]:
## Pos Centric

import tensorflow as tf
import numpy as np

# One Keras input per feature: two length-65 sequences and two per-sample scalars.
inputs = {
    'pawns' : tf.keras.Input(shape = (65,)),
    'pos' : tf.keras.Input(shape = (65,)),
    'color' : tf.keras.Input(shape = (1,)),
    'rank' : tf.keras.Input(shape = (1,))
}

chessencoder = ChessEncoder(num_layers = 4, d_model = 512, num_heads = 8, dff = 1024, 
                            input_vocab_size = 10000, maximum_position_encoding = 66, num_types = 2, 
                            rate=0.1, bidirectional_encoder = True)

# `training` is deliberately left unset (None). The original passed
# `training = True`, which hard-codes training mode into the functional graph,
# so dropout stayed active during validation and model.predict. With None,
# Keras supplies the flag at run time (True in fit, False in evaluate/predict).
encoded = chessencoder(
            inputs['pawns'],
            inputs['pos'],
            training = None,
            color = inputs['color'],
            rank = inputs['rank']
)

# 14-way softmax per position. The output layer is pinned to float32 so the
# probabilities handed to the loss are not float16 under the mixed_float16
# policy (recommended by the TensorFlow mixed precision guide).
pred = tf.keras.layers.Dense(14, activation = 'softmax', dtype = 'float32')(encoded)

model = tf.keras.Model(inputs, pred)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 65)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 65)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
______________________________________________________________________________________________

In [None]:
# Take the first 100000 held-out rows of every input feature, plus the matching targets.
X_d = {elt:X_test[elt][:100000] for elt in X_test}
y_d = y_test[:100000]

# Run inference on that subset in batches of 2048.
x = model.predict(X_d, verbose = 1, batch_size = 2048)
# Alternative kept for reference: direct call in training mode.
# x = model(X_d, training = True)


In [None]:
# mixed precision 1M bs 1024: 149s
# half precision 1M bs 1024:  125s
# single precision 1M bs 1024: 200s

In [7]:
# Cross-entropy on probabilities (from_logits=False); reduction='none' keeps
# one loss value per position so the positions can be masked below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=False, reduction='none')

def loss_function(real, pred):
    """Mean cross-entropy over the positions whose target is not 13.

    Args:
        real: Integer targets; the value 13 marks positions to ignore.
        pred: Per-position class probabilities (classes on the last axis).

    Returns:
        Scalar float32 tensor: the summed loss of the kept positions divided
        by their count, or 0 when no position is kept.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 13))
    # Reduce in float32: under the mixed_float16 policy `pred` may arrive as
    # float16, and batch-wide sums in float16 lose precision or overflow.
    loss_ = loss_object(real, tf.cast(pred, tf.float32))
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # divide_no_nan yields 0 instead of NaN for a batch without any target.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

def acc_end(true, pred):
    """Accuracy restricted to the positions whose target is not 13.

    The original overwrote ``n_mask`` with a cast of ``mask`` (instead of the
    ``n_mask`` computed on the line above), compared against a leftover
    sentinel ``151`` and summed counts in ``pred.dtype`` (float16 under mixed
    precision), so the returned value was not an accuracy. This version counts
    directly: correct kept positions divided by kept positions.

    Args:
        true: Targets; the value 13 marks positions to ignore.
        pred: Per-position class scores (classes on the last axis).

    Returns:
        Scalar float32 tensor in [0, 1]; 0 when no position is kept.
    """
    labels = tf.cast(true, tf.int64)
    predicted = tf.math.argmax(pred, axis = -1)  # int64 class ids

    keep = tf.math.not_equal(labels, 13)
    hits = tf.math.logical_and(keep, tf.math.equal(labels, predicted))

    # Count in float32 so large batches cannot overflow a float16 accumulator.
    n_hits = tf.reduce_sum(tf.cast(hits, tf.float32))
    n_keep = tf.reduce_sum(tf.cast(keep, tf.float32))

    return tf.math.divide_no_nan(n_hits, n_keep)

def acc(real, pred):
    """Fraction of positions with target != 13 whose arg-max class is correct.

    The original body stopped after building the mask: it read an undefined
    name (``metric``) and returned nothing. The mask it built is completed
    here into a masked accuracy.

    Args:
        real: Targets; the value 13 marks positions to ignore.
        pred: Per-position class scores (classes on the last axis).

    Returns:
        Scalar float32 tensor in [0, 1]; 0 when no position is kept.
    """
    labels = tf.cast(real, tf.int64)
    mask = tf.math.logical_not(tf.math.equal(labels, 13))
    correct = tf.math.equal(labels, tf.math.argmax(pred, axis = -1))
    correct = tf.math.logical_and(correct, mask)

    n_correct = tf.reduce_sum(tf.cast(correct, tf.float32))
    n_kept = tf.reduce_sum(tf.cast(mask, tf.float32))
    return tf.math.divide_no_nan(n_correct, n_kept)

    
    

lr = 3e-5
# Pass Adam's hyper-parameters by keyword. Its second positional parameter is
# beta_1, so the original `Adam(lr, 1e-8)` set beta_1=1e-8 (switching off the
# first-moment average) rather than epsilon=1e-8.
optimizer = tf.keras.optimizers.Adam(learning_rate = lr, epsilon = 1e-8)
# NOTE(review): 'sparse_categorical_accuracy' is computed over every position,
# including those whose target is 13 and which the loss ignores — confirm it
# is still wanted next to acc_end.
metrics = ['sparse_categorical_accuracy', acc_end]

model.compile(optimizer = optimizer, loss = loss_function, metrics = metrics)

In [None]:
# Stop once val_loss has failed to improve by at least 1e-4 for 4 epochs,
# then roll back to the best weights seen.
early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=4,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)
# Halve the learning rate after 2 epochs without a 1e-4 improvement of val_loss.
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

callbacks = [early, reduce]

epochs = 5
batch_size = 2048
# Validation runs with a batch twice as large as the training batch.
validation_batch_size = 2*batch_size

history = model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = epochs,
    batch_size = batch_size,
    validation_batch_size = validation_batch_size,
    callbacks = callbacks,
)

Epoch 1/5

In [None]:
# Write the current model weights under ./checkpoints/pos_centric/pretrained/ with the prefix 'pretrained_loss_'.
model.save_weights('./checkpoints/pos_centric/pretrained/pretrained_loss_')

In [None]:
# Same 100000-row held-out subset as in the earlier prediction cell.
X_d = {elt:X_test[elt][:100000] for elt in X_test}
y_d = y_test[:100000]

# Inference with a batch size of 4*2048.
x = model.predict(X_d, verbose = 1, batch_size = 4*2048)

In [None]:
# Most likely class per position.
# NOTE(review): this rebinds `pred`, which the model-construction cell uses for the Keras output tensor.
pred = np.argmax(x, axis = -1)

In [None]:
# Pick one row of the subset and display its targets.
ind = 10
y_d[ind]

In [None]:
# Display the predicted classes for the same row.
pred[ind]

In [None]:
# Compare targets and predictions at the positions whose target is not 13
# (boolean indexing flattens the selection to 1-D).
kept = y_d != 13
fig, ax = plt.subplots(figsize = (25,15))
ax.plot(y_d[kept])
ax.plot(pred[kept])