## Encode chess position

In [1]:
import os
import sys
from multiprocessing import current_process

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# Project-local modules live in lib/, which must be on sys.path before they are imported.
sys.path.insert(1, "lib/")
from dataset_utils import nb_channels, encode_position, store_many_hdf5
from ScoreGetter import ScoreGetter, Engine

We load a dataset containing millions of unique chess positions represented in [FEN](https://fr.wikipedia.org/wiki/Notation_Forsyth-Edwards) notation.

In [2]:
# Load the FEN dataset (one position per row).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv("/media/gaetan/HDD/IA/Chess/Datasets/fen_dataset.csv")
# Shuffle the rows, then rebuild a 0..n-1 index. sample() keeps the original
# index labels, so without the reset a label lookup such as df["board"][i]
# would still return the rows in file order and the shuffle would have no effect.
df = df.sample(frac=1).reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: '/home/gaetan/HDD/IA/Chess/Datasets/fen_dataset.csv'

We create our train/test dataset.

Each position is encoded as an image of $8\times8$ "pixels" with $15$ channels
($12$ representing the chess pieces, $1$ for the player to move, $1$
for the en-passant square and $1$ for the castling rights). Alongside each encoded position, we store
the score given by Stockfish 14.

In [None]:
# Evaluate each position with the engine and encode it as an 8x8xnb_channels array.
score_getter = ScoreGetter("bin/stockfish", Engine.STOCKFISH)

# Never request more rows than the dataset holds (the lookup would otherwise fail).
nb_position = min(700000, len(df))
position = np.zeros((nb_position, 8, 8, nb_channels), dtype=np.float32)
scores = np.zeros(nb_position, dtype=np.float32)

# Status bar configuration: one bar row per worker process (row 0 in the main process).
current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0

# `count` is both the number of successfully processed positions and the next
# free slot in the output arrays, so the valid entries stay contiguous and
# aligned even when some positions fail.
count = 0
with tqdm(total=nb_position, desc="Encoding", position=pos) as pbar:
    for i in range(nb_position):
        pbar.update(1)
        # Positional lookup: follows the (shuffled) row order regardless of index labels.
        fen = df["board"].iloc[i]
        try:
            # Write into slot `count`, not `i`. If either call raises, the slot is
            # left unclaimed and is overwritten by the next success (or cut off by
            # the final truncation), so no zeroed or half-written row is kept.
            scores[count] = score_getter.get_score(fen)
            position[count] = encode_position(fen)
            count += 1
        except Exception as e:
            # Best effort: skip positions that fail. A broken pipe means the
            # engine process died, so restart it before continuing.
            if isinstance(e, BrokenPipeError) or str(e) == "[Errno 32] Broken pipe":
                score_getter.restart()
            continue

# Keep only the filled slots; scores becomes a column vector of shape (count, 1).
scores = scores[:count].reshape(-1, 1)
position = position[:count]
print(f"Number of positions: {position.shape[0]}")

We look at the distribution of the scores.

In [None]:
# Normalised histogram of the engine evaluations, saved as a PDF.
fig, ax = plt.subplots()
# Flatten so the histogram is built from a plain 1-D sample whatever shape scores has.
ax.hist(np.asarray(scores).ravel(), bins=100, density=True)
ax.set_xlabel("Evaluation (centipawn)")
ax.set_title("Distribution of evaluations")
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("results", exist_ok=True)
fig.savefig("results/distribution.pdf")

In [None]:
# Summary statistics of the evaluations. pd.Series only accepts 1-D data, and
# scores is stored as a column vector of shape (n, 1), so flatten it first.
pd.Series(np.asarray(scores).ravel()).describe()

We save the resulting dataset using HDF5.

In [None]:
# Persist the encoded positions and their scores with the project helper
# store_many_hdf5 (imported from dataset_utils).
# NOTE(review): absolute, machine-specific output path — adjust before running elsewhere.
directory = "/media/gaetan/HDD/IA/Chess/Datasets/SE_ResNet/"
# NOTE(review): tag="_test" presumably marks the output as the test split — confirm in dataset_utils.
store_many_hdf5(position, scores, directory, tag="_test")