## Encode chess positions

In [None]:
from lib.dataset_utils import encode_position, store_many_hdf5
from lib.score_getter import ScoreGetter, Engine

import chess
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from multiprocessing import current_process

We load a dataset containing millions of unique chess positions, each represented in [FEN](https://fr.wikipedia.org/wiki/Notation_Forsyth-Edwards) notation.

In [None]:
# Load the first 2,000,000 rows of the FEN dataset.
df = pd.read_csv("/media/gaetan/HDD/IA/Chess/Datasets/fen_dataset.csv", nrows=2_000_000)
# Shuffle the rows. `sample` keeps the original index labels, so without
# `reset_index(drop=True)` a label-based lookup such as df["board"][i] would
# still return the i-th row of the *unshuffled* file. A fixed seed keeps the
# shuffle reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

We create our train/test dataset.

Each position is encoded as an image of $8\times8$ "pixels" with $15$ channels
($12$ for the chess pieces, $1$ for the side to move, $1$
for the en-passant square and $1$ for the castling rights). Alongside each encoded position, we store
the score given by Stockfish 14.

In [None]:
# Engine wrapper used to score every position.
score_getter = ScoreGetter("bin/stockfish", Engine.STOCKFISH)

nb_positions = 20_000
positions = []
scores = []
nb_failed = 0

# Never iterate past the end of the dataframe.
nb_to_process = min(nb_positions, len(df))

# Status bar configuration: when run inside a multiprocessing worker, use the
# worker number as the bar's line offset; in the main process fall back to 0.
# NOTE: `_identity` is a private attribute of multiprocessing's process object.
current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0

# The context manager guarantees the bar is closed even if the loop raises.
with tqdm(total=nb_to_process, desc="Encoding", position=pos) as pbar:
    for i in range(nb_to_process):
        pbar.update(1)
        # Positional access: after `df.sample(frac=1)` the index labels are
        # shuffled, so a label lookup (df["board"][i]) would ignore the shuffle.
        fen = df["board"].iloc[i]
        board = chess.Board(fen)
        # Finished games are skipped.
        if board.is_game_over():
            continue
        try:
            # Compute BOTH values before storing either one, so that a failure
            # in the encoder cannot leave `scores` one element longer than
            # `positions` (which would misalign every following pair).
            score = score_getter.get_score(board.fen())
            encoded = encode_position(board)
        except Exception as e:
            nb_failed += 1
            # A broken pipe means the engine process is gone: restart it.
            if isinstance(e, BrokenPipeError) or str(e) == "[Errno 32] Broken pipe":
                score_getter.restart()
            continue
        scores.append(score)
        positions.append(encoded)

# One score per row, as a column vector; positions stacked along axis 0.
scores = np.array(scores, dtype=np.float32).reshape(-1, 1)
positions = np.array(positions, dtype=np.float32)
print(f"Number of positions: {positions.shape[0]}")
if nb_failed:
    print(f"Number of positions skipped after an error: {nb_failed}")

We look at the distribution of the scores.

In [None]:
# Normalised histogram of the engine scores, saved as a PDF.
fig, ax = plt.subplots()
ax.hist(scores, bins=100, density=True)
ax.set_xlabel("Evaluation (centipawn)")
ax.set_ylabel("Density")
ax.set_title("Distribution of evaluations")
# savefig returns None, so no dummy assignment is needed to silence output.
fig.savefig("results/distribution.pdf")

In [None]:
# Summary statistics of the scores, flattened to one dimension first.
flat_scores = scores.ravel()
pd.Series(flat_scores).describe()

We save the resulting dataset using HDF5.

In [None]:
directory = "/media/gaetan/HDD/IA/Chess/Datasets/SE_ResNet/"

# Split boundaries (row offsets into `positions` / `scores`).
# The fixed offsets only make sense when more than 3,020,000 positions were
# encoded; with fewer, the validation and/or test slices would be written
# empty without any warning.
nb_encoded = len(positions)
train_end, validation_end = 3_000_000, 3_020_000
if nb_encoded <= validation_end:
    # NOTE(review): fallback to an 80/10/10 split for small runs; the ratio is
    # a reviewer choice, adjust it if another split is wanted.
    train_end = int(0.8 * nb_encoded)
    validation_end = int(0.9 * nb_encoded)

store_many_hdf5(positions[:train_end], scores[:train_end], directory, tag="_train")
store_many_hdf5(positions[train_end:validation_end], scores[train_end:validation_end], directory, tag="_validation")
store_many_hdf5(positions[validation_end:], scores[validation_end:], directory, tag="_test")