# 2 Data Preprocessing
**Objective:** Learn how data needs to be represented for machine learning algorithms to be applied.

In [17]:
from chinese_checkers.game.ChineseCheckersGame import ChineseCheckersGame
from chinese_checkers.model.CentroidModel import CentroidModel
from chinese_checkers.simulation.DataCatalog import DataCatalog
from chinese_checkers.simulation.GameSimulation import GameSimulation
from chinese_checkers.simulation.SimulationDataSet import SimulationDataSet

from torch import tensor, zeros, save, stack, zeros_like

from typing import List

import os

## 2.1 State Representation

To represent the `ChineseCheckersGame` state for PyTorch consumption, we must transform the board into a one-hot encoded vector. For a given position on the board, we'll represent each potential player's piece using a binary value: 1 if the player has a piece at that position and 0 otherwise.

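Given this, for a board with `s` positions and `n` players, our encoded state contains $s \times n$ binary values. In code we arrange them as an $n \times s$ matrix with one row per player (for example, $2 \times 121$ for the two-player board used below).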

For clarity, the one-hot encoded representation is defined as:

$$
p_{i,j} =
\begin{cases}
1 & \ \text{if player } j \text{ has a piece at position } i,\\
0 & \ \text{otherwise.}
\end{cases}
$$

We'll begin by focusing on a board of size 4 with 2 players. Later, we can extend our approach to accommodate different board sizes and player counts.


In [2]:
def generate_tensor_for_game(game: ChineseCheckersGame) -> tensor:
    """Encode a game state as a 0/1 occupancy matrix.

    Args:
        game: The game whose board points and players are encoded.

    Returns:
        A tensor with one row per entry of ``game.players`` and one column
        per entry of ``game.board.hexagram_points``; a cell is 1 when that
        player's ``positions`` contains that board point, otherwise 0.
    """
    board_points = game.board.hexagram_points

    occupancy_rows = []
    for player in game.players:
        occupied = player.positions
        # Column order follows the board's own point ordering for every player.
        occupancy_rows.append([int(point in occupied) for point in board_points])

    return tensor(occupancy_rows)

# Start a two-player game on a size-4 board and print its encoding.
game = ChineseCheckersGame.start_game(number_of_players=2, board_size=4)
game_state = generate_tensor_for_game(game)
print(game_state)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0]])


In [3]:
# The same encoding has been added as a method on the ChineseCheckersGame
# class, so it can be called directly on a game instance.
game.tensor()

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0]])

## 2.2 Encoding Sequence of Game States

With the capability to encode individual game states, our next step is encoding sequences of game states that represent an entire game. To ensure consistency, we'll set a limit of 400 turns for each game. Each game state within this sequence will occupy a row in our matrix.

In cases where a game concludes before reaching the 400-turn limit, we'll pad the matrix with rows of zeros until it reaches the desired size. This ensures that each game, regardless of its duration, results in a consistent-sized matrix which can be efficiently processed by PyTorch.

In [11]:
def generate_tensor_for_game_sequence(game_sequence: List[ChineseCheckersGame], max_turns=400):
    """Encode a whole game as one tensor, zero-padded up to ``max_turns`` states.

    Args:
        game_sequence: Game states in play order. Each one must provide a
            ``tensor()`` method, and all encodings must share one shape so
            they can be stacked.
        max_turns: Minimum number of states in the result. Shorter games are
            padded with all-zero states; longer games are returned in full,
            without truncation.

    Returns:
        The stacked state encodings, with the states along dimension 0.

    Raises:
        ValueError: If ``game_sequence`` is empty, because the shape of the
            padding states cannot be inferred without at least one state.
    """
    if not game_sequence:
        raise ValueError(
            "game_sequence must contain at least one game state to encode"
        )

    encoded_states = [game_state.tensor() for game_state in game_sequence]

    missing_states = max_turns - len(encoded_states)
    if missing_states > 0:
        # stack() copies its inputs into a new tensor, so a single zero tensor
        # can safely back every padding entry.
        blank_state = zeros_like(encoded_states[0])
        encoded_states.extend([blank_state] * missing_states)

    return stack(encoded_states)


# Example usage: simulate one game between two CentroidModel players,
# constructed with max_turns=400.
simulation = GameSimulation([CentroidModel(), CentroidModel()], max_turns=400)
simulation.simulate_game()
# simulation.games is the sequence handed to the encoder below; its length is
# reported as the number of turns.
print(f"No. of turns: {len(simulation.games)}")

# Encode the recorded sequence, padding up to the simulation's own max_turns,
# and show the resulting shape.
game_sequence_tensor = generate_tensor_for_game_sequence(simulation.games, simulation.max_turns)
game_sequence_tensor.size()

No. of turns: 287


torch.Size([400, 2, 121])

In [12]:
# The same sequence encoding has been added as a method on the GameSimulation
# class; the argument is the number of turns to pad to.
simulation.tensor(400).size()

torch.Size([400, 2, 121])

## 2.3 Generating Data for Training
Run a lot of naive simulations and save the results to a file. This will be our training data to bootstrap the model.

In [13]:
# Generation settings, defined once so the simulation, the encoding and the
# dataset metadata all use the same values.
max_turns = 400
games_per_version = 1000

for version in range(100):
    print(f"Generating data for version: {version:04}")
    simulated_games = []
    labels = []
    draw_count = 0
    while len(simulated_games) < games_per_version:
        simulation = GameSimulation([CentroidModel(), CentroidModel()], max_turns=max_turns)
        try:
            winner = simulation.simulate_game()
            # Build both values before storing either one, so a failure
            # part-way through can never leave a game without its label.
            encoded_game = simulation.tensor(max_turns)
            label = 0 if winner.player_id == 'Player 0' else 1
        except Exception:
            # NOTE(review): any failure here is counted as a draw. The
            # exception type that signals a draw is not visible from this
            # cell, so the catch stays broad -- narrow it once confirmed so
            # that genuine bugs are not hidden in the draw count.
            draw_count += 1
            continue

        simulated_games.append(encoded_game)
        labels.append(label)

    print(f"Draws: {draw_count}")
    for player_index in (0, 1):
        wins = labels.count(player_index)
        share = round(wins / len(labels) * 100, 4)
        print(f"Player {player_index} wins: {wins} ({share}%)")

    dataset = SimulationDataSet(
        player_count=2,
        board_size=4,
        game_length=max_turns,
        name="naive_simulation",
        version=f"{version:04}",
        description="Naive simulation of games using centroid model.",
        data=stack(simulated_games),
        labels=tensor(labels)
    )

    print(f"Saving dataset: {version:04}")

    catalog = DataCatalog("game_data")
    catalog.save_dataset(dataset)

Generating data for version: 0000
Draws: 351
Player 0 wins: 498 (49.8%)
Player 1 wins: 502 (50.2%)
Saving dataset: 0000
Generating data for version: 0001
Draws: 310
Player 0 wins: 504 (50.4%)
Player 1 wins: 496 (49.6%)
Saving dataset: 0001
Generating data for version: 0002
Draws: 317
Player 0 wins: 499 (49.9%)
Player 1 wins: 501 (50.1%)
Saving dataset: 0002
Generating data for version: 0003
Draws: 322
Player 0 wins: 507 (50.7%)
Player 1 wins: 493 (49.3%)
Saving dataset: 0003
Generating data for version: 0004
Draws: 314
Player 0 wins: 496 (49.6%)
Player 1 wins: 504 (50.4%)
Saving dataset: 0004
Generating data for version: 0005
Draws: 301
Player 0 wins: 483 (48.3%)
Player 1 wins: 517 (51.7%)
Saving dataset: 0005


RuntimeError: Can't decrement id ref count (unable to extend file properly)

In [14]:
# List the metadata of the datasets the catalog currently holds.
catalog.list_datasets()

[{'player_count': '2',
  'board_size': '4',
  'game_length': '400',
  'name': 'naive_simulation',
  'version': '0000'},
 {'player_count': '2',
  'board_size': '4',
  'game_length': '400',
  'name': 'naive_simulation',
  'version': '0001'},
 {'player_count': '2',
  'board_size': '4',
  'game_length': '400',
  'name': 'naive_simulation',
  'version': '0002'},
 {'player_count': '2',
  'board_size': '4',
  'game_length': '400',
  'name': 'naive_simulation',
  'version': '0003'},
 {'player_count': '2',
  'board_size': '4',
  'game_length': '400',
  'name': 'naive_simulation',
  'version': '0004'},
 {'player_count': '2',
  'board_size': '4',
  'game_length': '400',
  'name': 'naive_simulation',
  'version': '0005'}]

In [15]:
# Load one saved dataset back from the catalog, selected by its metadata.
dataset = catalog.get_dataset(
    player_count=2,
    board_size=4,
    game_length=400,
    name="naive_simulation",
    version="0002"
)
# Show the shape of the stored game data, then the number of labels.
display(tensor(dataset.data).size())
len(dataset.labels)

torch.Size([1000, 400, 2, 121])

1000

### Training Data Storage Size Optimization
Given that we could only store a handful of datasets before saving failed, we need to rethink our approach to data storage. We'll begin by measuring the size of our data on disk.

In [18]:
def get_directory_size(path):
    """Return the total size in bytes of all files beneath ``path``.

    Args:
        path: Directory to walk recursively.

    Returns:
        The sum of the sizes of every file found. Entries whose size cannot
        be read are skipped, so one unreadable file does not abort the whole
        measurement. A ``path`` that cannot be walked yields 0.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                total_size += os.path.getsize(file_path)
            except OSError:
                # Dangling symlink, or a file removed while walking.
                continue

    return total_size

def human_readable_size(size, decimal_places=2):
    """Format a byte count using binary (1024-based) units.

    Args:
        size: Number of bytes.
        decimal_places: Digits shown after the decimal point.

    Returns:
        A string such as ``"3.61 GB"``. TB is the largest unit, so larger
        values are expressed as a correspondingly large number of TB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    # Only step up while a larger unit exists; dividing again after the last
    # unit would under-report anything of 1024 TB or more.
    for unit in units[:-1]:
        if size < 1024.0:
            return f"{size:.{decimal_places}f} {unit}"
        size /= 1024.0
    return f"{size:.{decimal_places}f} {units[-1]}"

# "game_data" is the directory name that was passed to DataCatalog when the
# datasets were saved; report how much disk space it occupies.
directory_path = "game_data"
total_size = get_directory_size(directory_path)
print(f"Total size of directory '{directory_path}': {human_readable_size(total_size)}")

Total size of directory 'game_data': 3.61 GB


Okay, so maybe I need to clear up some space on my hard drive. Still, I think we can do better than this. Let's see how much space we can save by compressing our data. The padded zero rows are an obvious candidate for compression. Instead of saving the padded tensors, we can simply store each game state in its raw form (the player ids and the positions they occupy) and reconstruct the tensors from this data when it is loaded. I think a PyTorch `Dataset` or `DataLoader` can handle this for us.
```python
from torch.utils.data import Dataset, DataLoader
```