In [1]:
import sys
sys.path.insert(1, 'Classes/')
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from multiprocessing import current_process

from ScoreGetter import ScoreGetter
from dataset_utils import checkIfEarlyMidEnd
from dataset_utils import encodeBoard
from dataset_utils import getColumns

In [2]:
df = pd.read_csv('Datasets/raw_dataset_13M.csv')
boards = df['board'].values

#### We load an engine to get a score from the positions.

In [3]:
score_getter = ScoreGetter('/home/gaetan/Téléchargements/stockfish/stockfish', 'eval', 'go depth 1')

#### We create our dataset by getting an equal number of start, middle and end game positions and encoding them.

In [4]:
total_size = 5000000
batch_size = total_size / 3.0

earlies = []
mids = []
ends = []

current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0
pbar = tqdm(total=total_size, desc='Splitting and encoding', position=pos)

for i in range(len(boards)):
    board = boards[i]
    part = checkIfEarlyMidEnd(board)
    
    if len(earlies) < batch_size and part == "early_game":
        earlies.append(np.append(encodeBoard(board), score_getter.getScore(board)))
        pbar.update(1)
    
    elif len(mids) < batch_size and part == "mid_game":
        mids.append(np.append(encodeBoard(board), score_getter.getScore(board)))
        pbar.update(1)
        
    elif len(ends) < batch_size and part == "end_game":
        ends.append(np.append(encodeBoard(board), score_getter.getScore(board)))
        pbar.update(1)
    
    if len(earlies) >= batch_size and len(mids) >= batch_size and len(ends) >= batch_size:
        break
    
pbar.close()

Splitting and encoding: 5000001it [1:40:35, 828.45it/s]                             


In [5]:
data = earlies + mids + ends
random.shuffle(data)
len(data)

5000001

In [None]:
df = pd.DataFrame(data, columns=np.append(getColumns(), 'cp (Stockfish 13)'))
df.to_csv('Datasets/dataset.csv', index=False)
df.head()

#### We create a test dataset.

In [None]:
test_size = 200000
tests = []

current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0
pbar = tqdm(total=test_size, desc='Encoding', position=pos)

tot_size = boards.shape[0]
for i in range(test_size):
    idx = np.random.randint(tot_size)
    tests.append(np.append(encodeBoard(boards[idx]), score_getter.getScore(boards[idx])))
    pbar.update(1)
pbar.close()

len(tests)

In [None]:
df = pd.DataFrame(tests, columns = np.append(getColumns(), 'cp (Stockfish 13)'))
df.to_csv('Datasets/test_dataset.csv', index=False)
df.head()