In [1]:
import sys
sys.path.insert(1, 'Classes/')
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from multiprocessing import current_process

from ScoreGetter import ScoreGetter
from dataset_utils import checkIfEarlyMidEnd
from dataset_utils import encodeBoard
from dataset_utils import getColumns

In [2]:
df = pd.read_csv('Datasets/raw_dataset_13M.csv')
boards = df['board'].values

#### We load an engine to get a score from the positions.

In [3]:
score_getter = ScoreGetter('/home/gaetan/Téléchargements/stockfish/stockfish', 'eval', 'go depth 1')

#### We create our dataset by getting an equal number of start, middle and end game positions and encoding them.

In [4]:
total_size = 1000000
batch_size = total_size / 3.0

earlies = []
mids = []
ends = []

current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0
pbar = tqdm(total=total_size, desc='Splitting and encoding', position=pos)

for i in range(1000000, len(boards)):
    board = boards[i]
    part = checkIfEarlyMidEnd(board)
    
    if len(earlies) < batch_size and part == "early_game":
        earlies.append(np.append(encodeBoard(board), score_getter.getScore(board)))
        pbar.update(1)
    
    elif len(mids) < batch_size and part == "mid_game":
        mids.append(np.append(encodeBoard(board), score_getter.getScore(board)))
        pbar.update(1)
        
    elif len(ends) < batch_size and part == "end_game":
        ends.append(np.append(encodeBoard(board), score_getter.getScore(board)))
        pbar.update(1)
    
    if len(earlies) >= batch_size and len(mids) >= batch_size and len(ends) >= batch_size:
        break
    
pbar.close()

Splitting and encoding: 1000002it [20:20, 819.28it/s]                            


In [5]:
data = earlies + mids + ends
random.shuffle(data)
len(data)

1000002

In [6]:
df = pd.DataFrame(data, columns=np.append(getColumns(), 'cp (Stockfish 13)'))
df.to_csv('Datasets/dataset1.csv', index=False)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,122,123,124,125,126,127,whites to play,castling rights,en passant square,cp (Stockfish 13)
0,-1.0,4.0,0.0,0.0,-1.0,3.0,-1.0,5.0,-1.0,6.0,...,1.0,3.0,0.0,0.0,1.0,4.0,1.0,15.0,-1.0,26.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,600.0
2,-1.0,4.0,-1.0,2.0,-1.0,3.0,-1.0,5.0,-1.0,6.0,...,1.0,3.0,1.0,2.0,1.0,4.0,0.0,15.0,-1.0,100.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0
4,-1.0,4.0,-1.0,2.0,-1.0,3.0,-1.0,5.0,-1.0,6.0,...,1.0,3.0,1.0,2.0,1.0,4.0,1.0,15.0,-1.0,48.0


#### We create a test dataset.

In [7]:
'''
test_size = 200000
tests = []

current = current_process()
pos = current._identity[0]-1 if len(current._identity) > 0 else 0
pbar = tqdm(total=test_size, desc='Encoding', position=pos)

tot_size = boards.shape[0]
for i in range(test_size):
    idx = np.random.randint(tot_size)
    tests.append(np.append(encodeBoard(boards[idx]), score_getter.getScore(boards[idx])))
    pbar.update(1)
pbar.close()

len(tests)
'''

"\ntest_size = 200000\ntests = []\n\ncurrent = current_process()\npos = current._identity[0]-1 if len(current._identity) > 0 else 0\npbar = tqdm(total=test_size, desc='Encoding', position=pos)\n\ntot_size = boards.shape[0]\nfor i in range(test_size):\n    idx = np.random.randint(tot_size)\n    tests.append(np.append(encodeBoard(boards[idx]), score_getter.getScore(boards[idx])))\n    pbar.update(1)\npbar.close()\n\nlen(tests)\n"

In [8]:
'''
df = pd.DataFrame(tests, columns = np.append(getColumns(), 'cp (Stockfish 13)'))
df.to_csv('Datasets/test_dataset.csv', index=False)
df.head()
'''

"\ndf = pd.DataFrame(tests, columns = np.append(getColumns(), 'cp (Stockfish 13)'))\ndf.to_csv('Datasets/test_dataset.csv', index=False)\ndf.head()\n"