In [11]:
from collections import deque
import random
import chess
import chess.variant
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.compat.v1.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from IPython.display import display, HTML
import chess.svg
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Concatenate
from tqdm import tqdm
from tensorflow.keras.models import Model
import chess.pgn
import numpy as np
from tensorflow.keras.models import load_model

#chess Variant Antichess



def board_to_input_array(board):
    board_array = np.zeros((8, 8, 12), dtype=np.uint8)
    piece_mapping = {'r': 0, 'n': 1, 'b': 2, 'q': 3, 'k': 4, 'p': 5, 'R': 6, 'N': 7, 'B': 8, 'Q': 9, 'K': 10, 'P': 11}
    #normalize piece values

    
    for square, piece in board.piece_map().items():
        piece_type = piece_mapping[piece.symbol()]
        color = int(piece.color)
        board_array[square // 8, square % 8, piece_type] = color + 1  # Use 0 for empty squares

    return board_array


def state_to_index(board):
    board_array = np.array(board_to_input_array(board))
    return hash(board_array.tostring()) % state_space_size[0]



def choose_action(board,model):
    if np.random.rand() < exploration_prob:
        return np.random.choice(list(board.legal_moves))
    else:
        state_index = state_to_index(board)
        legal_moves_list = list(board.legal_moves)
        if not legal_moves_list:
            return chess.Move.null()
        q_values = model.predict(np.array([board_to_input_array(board)]))[0]
        best_move_index = np.argmax(q_values)
        best_move_uci = legal_moves_list[min(best_move_index, len(legal_moves_list)-1)].uci()
        return chess.Move.from_uci(best_move_uci)
    
# Function to convert a move into an output array
def move_to_output_array(move, legal_moves):
    output_array = np.zeros(action_space_size)
    move_index = list(legal_moves).index(move)
    output_array[move_index] = 1
    return output_array



# Hyperparameters
learning_rate = 0.01
discount_factor = 0.99
exploration_prob = 0.2

# Neural Network Architecture
state_space_size = (8, 8, 12)  # 8x8 board with 12 channels (one for each piece type and color)
action_space_size = 4096



# Initialize a deque for experience replay
experience_replay_buffer = deque(maxlen=10000)

# Neural Network Model alpha zero


# Input layer
input_layer = Input(shape=state_space_size)

# Contracting path
# Convolutional layers
conv1 = Conv2D(64, (3, 3), activation='relu', padding='same')(input_layer)
conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv1)
flatten_layer = Flatten()(conv2)
dense1 = Dense(64, activation='relu')(flatten_layer)
dense2 = Dense(64, activation='relu')(dense1)
output_layer = Dense(action_space_size, activation='softmax')(dense2)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.1), loss=['categorical_crossentropy'], metrics=['accuracy'])


def count_pieces_by_color(board, color):
    piece_types = [chess.PAWN, chess.KNIGHT, chess.BISHOP, chess.ROOK, chess.QUEEN, chess.KING]
    return sum(len(board.pieces(piece_type, color)) for piece_type in piece_types)







def normalize_input(board):
    board_array = np.array(board_to_input_array(board), dtype=np.float16)
    board_array /= 12.0  # Assuming the maximum piece type value is 12
    return board_array



def update_q_table(state, action, reward, next_state):
    state_index = state_to_index(state)
    next_state_index = state_to_index(next_state)
    action_index = list(state.legal_moves).index(action)
    


    # Combine the rewards with weights (you can adjust the weights as needed)
    #total_reward = reward + 0.01 * piece_coordination_reward_value
    total_reward = reward

    # Store the experience in the replay buffer
    experience_replay_buffer.append((state_index, action_index, total_reward, next_state_index))

    # Sample a batch from the replay buffer for training
    batch_size = min(len(experience_replay_buffer), 8)
    if batch_size > 0:
        batch = np.array(random.sample(experience_replay_buffer, batch_size))
        states = np.array([board_to_input_array(chess.Board(fen=chess.STARTING_FEN)) for _ in batch[:, 0]])
        next_states = np.array([board_to_input_array(chess.Board(fen=chess.STARTING_FEN)) for _ in batch[:, 3]])
        q_values = model.predict(states)
        next_q_values = model.predict(next_states)
        
        for i in range(batch_size):
            action_idx = int(batch[i, 1])  # Cast to integer
            q_values[i, action_idx] += learning_rate * (
                batch[i, 2] + discount_factor * np.max(next_q_values[i]) - q_values[i, action_idx]
            )
        
        # Train the model on the batch
        model.train_on_batch(states, q_values)


def display_chess_board(board):
    return display(HTML(chess.svg.board(board=board, size=200)))

def play_game():
    board = chess.variant.GiveawayBoard()
    
    game_states = []
    total_reward = 0

    while not board.is_game_over():
        state = board.copy()
        game_states.append(state.copy())

        # Player 1 (White) makes a move
        white_move = choose_action(board, model)
        if white_move in board.legal_moves:
            board.push(white_move)
        else:
            print("Invalid move by White. Try again.")
            continue

        # Update state and check for game end
        if board.is_game_over():
            break

        # Player 2 (Black) makes a move
        black_move = choose_action(board, model)
        if black_move in board.legal_moves:
            board.push(black_move)
        else:
            print("Invalid move by Black. Try again.")
            continue

        next_state = board.copy()

        # Calculate rewards and update Q-table
        reward = calculate_reward(board) # You need to define this function based on your reward strategy
        update_q_table(state, white_move, reward, next_state)

        # Accumulate the reward
        total_reward += reward

        # Next state becomes the current state for the next iteration
        state = next_state

    game_states.append(board.copy())
    return game_states, board.result(), total_reward

    


def calculate_reward(board):
    reward = 0

    # Reward for losing pieces
    piece_count = len(board.piece_map())
    reward -= (32 - piece_count) * 0.1  # Assuming a standard 32-piece setup


    if board.is_stalemate() or board.is_insufficient_material():
        # Penalize for drawing the game
        reward -= 5
    elif board.is_fivefold_repetition() or board.is_seventyfive_moves():
        # Penalize for other types of draws
        reward -= 5

    return reward




log_dir = "logs/"
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
num_games = 10000
total_rewards = []
results = {"white_wins": 0, "black_wins": 0, "draws": 0}
outcomes = []
num_off_white_pieces = []
num_off_black_pieces = []

reward_trend = []

for episode in tqdm(range(num_games), desc="Training"):
    exploration_prob *= 0.99  # Decay exploration probability
    print("Game:", episode)
    game_states, result, total_reward = play_game()

    # Append the total reward to the reward trend list
    reward_trend.append(total_reward)

    # Update results based on the game outcome
    if result == "1-0":
        results["white_wins"] += 1
        outcomes.append(1)
    elif result == "0-1":
        results["black_wins"] += 1
        outcomes.append(0)
    elif result == "1/2-1/2":
        results["draws"] += 1
        outcomes.append(0.5)  # Fix here: Append 0.5 for draws

    total_rewards.append(total_reward)
    
    # Usage
    white_pieces = count_pieces_by_color(game_states[-1], chess.WHITE)
    black_pieces = count_pieces_by_color(game_states[-1], chess.BLACK)
    
    num_off_white_pieces.append(white_pieces)
    num_off_black_pieces.append(black_pieces)
    
    # Display the total reward for each game
    print("Total Reward for Game {}: {}".format(episode, total_reward))
    print("Game Outcome:", result)
    #game length
    print("Game Length:", len(game_states))

# Display statistics
average_reward = sum(total_rewards) / num_games
print("Average Total Reward:", average_reward)

# Extract the FEN of the final position
final_position_fen = game_states[-1].fen()
print("Final Position FEN:", final_position_fen)

# Display the last game
for state in game_states:
    display_chess_board(state)

# Display results
print("\nResults Summary:")
print("White Wins:", results["white_wins"])
print("Black Wins:", results["black_wins"])
print("Draws:", results["draws"])

# Plot trend lines
plt.plot(outcomes, label="Game Outcomes")
plt.xlabel("Games")
plt.ylabel("Outcome (1 for White Win, 0 for Draw, 0.5 for Loss)")
plt.legend()
plt.title("Game Outcomes Trend")

plt.figure()
plt.plot(num_off_white_pieces, label="White Pieces")
plt.plot(num_off_black_pieces, label="Black Pieces")
plt.xlabel("Games")
plt.ylabel("Number of Pieces")
plt.legend()
plt.title("Number of Pieces Trend")

# Plot rewards
plt.figure()
plt.plot(reward_trend, label="Rewards")
plt.xlabel("Games")
plt.ylabel("Total Reward")
plt.legend()
plt.title("Reward Trend")

plt.show()

# Save the model

model.save("v1_10k.h5")  # Change the file name as needed

Training:   0%|          | 0/10000 [00:00<?, ?it/s]

Game: 0


  return hash(board_array.tostring()) % state_space_size[0]




Training:   0%|          | 1/10000 [00:02<7:31:27,  2.71s/it]

Total Reward for Game 0: -48.69999999999999
Game Outcome: 0-1
Game Length: 31
Game: 1


Training:   0%|          | 2/10000 [00:04<5:40:20,  2.04s/it]

Total Reward for Game 1: -27.700000000000003
Game Outcome: 0-1
Game Length: 25
Game: 2


Training:   0%|          | 3/10000 [00:06<5:28:28,  1.97s/it]

Total Reward for Game 2: -41.00000000000001
Game Outcome: 0-1
Game Length: 30
Game: 3


Training:   0%|          | 4/10000 [00:07<5:02:23,  1.82s/it]

Total Reward for Game 3: -37.5
Game Outcome: 1-0
Game Length: 24
Game: 4


Training:   0%|          | 5/10000 [00:09<5:08:14,  1.85s/it]

Total Reward for Game 4: -43.199999999999996
Game Outcome: 0-1
Game Length: 30
Game: 5


Training:   0%|          | 6/10000 [00:11<5:13:55,  1.88s/it]

Total Reward for Game 5: -42.00000000000001
Game Outcome: 0-1
Game Length: 30
Game: 6


Training:   0%|          | 7/10000 [00:14<5:55:09,  2.13s/it]

Total Reward for Game 6: -63.60000000000004
Game Outcome: 0-1
Game Length: 38
Game: 7


Training:   0%|          | 8/10000 [00:16<5:44:50,  2.07s/it]

Total Reward for Game 7: -40.5
Game Outcome: 0-1
Game Length: 28
Game: 8


Training:   0%|          | 9/10000 [00:18<5:33:13,  2.00s/it]

Total Reward for Game 8: -37.800000000000004
Game Outcome: 0-1
Game Length: 28
Game: 9


Training:   0%|          | 10/10000 [00:20<5:39:22,  2.04s/it]

Total Reward for Game 9: -42.8
Game Outcome: 0-1
Game Length: 31
Game: 10


Training:   0%|          | 11/10000 [00:21<5:27:18,  1.97s/it]

Total Reward for Game 10: -34.5
Game Outcome: 0-1
Game Length: 27
Game: 11


Training:   0%|          | 12/10000 [00:24<5:33:34,  2.00s/it]

Total Reward for Game 11: -43.50000000000001
Game Outcome: 1-0
Game Length: 29
Game: 12


Training:   0%|          | 13/10000 [00:25<5:08:20,  1.85s/it]

Total Reward for Game 12: -23.6
Game Outcome: 1-0
Game Length: 21
Game: 13


Training:   0%|          | 14/10000 [00:27<5:21:46,  1.93s/it]

Total Reward for Game 13: -45.5
Game Outcome: 0-1
Game Length: 31
Game: 14


Training:   0%|          | 15/10000 [00:28<4:47:48,  1.73s/it]

Total Reward for Game 14: -20.900000000000002
Game Outcome: 1-0
Game Length: 19
Game: 15


Training:   0%|          | 16/10000 [00:31<5:41:59,  2.06s/it]

Total Reward for Game 15: -77.1
Game Outcome: 1-0
Game Length: 41
Game: 16


Training:   0%|          | 17/10000 [00:33<5:50:05,  2.10s/it]

Total Reward for Game 16: -45.1
Game Outcome: 1-0
Game Length: 29
Game: 17


Training:   0%|          | 18/10000 [00:35<5:42:31,  2.06s/it]

Total Reward for Game 17: -35.9
Game Outcome: 0-1
Game Length: 29
Game: 18


Training:   0%|          | 19/10000 [00:37<5:27:23,  1.97s/it]

Total Reward for Game 18: -37.00000000000001
Game Outcome: 1-0
Game Length: 26
Game: 19


Training:   0%|          | 20/10000 [00:39<5:21:07,  1.93s/it]

Total Reward for Game 19: -38.0
Game Outcome: 1-0
Game Length: 28
Game: 20


Training:   0%|          | 21/10000 [00:41<5:28:58,  1.98s/it]

Total Reward for Game 20: -43.500000000000014
Game Outcome: 0-1
Game Length: 32
Game: 21


Training:   0%|          | 22/10000 [00:43<5:37:09,  2.03s/it]

Total Reward for Game 21: -56.5
Game Outcome: 0-1
Game Length: 33
Game: 22


Training:   0%|          | 23/10000 [00:45<5:21:32,  1.93s/it]

Total Reward for Game 22: -31.400000000000006
Game Outcome: 0-1
Game Length: 26
Game: 23


Training:   0%|          | 24/10000 [00:46<5:00:23,  1.81s/it]

Total Reward for Game 23: -25.700000000000003
Game Outcome: 0-1
Game Length: 23
Game: 24


Training:   0%|          | 25/10000 [00:48<4:47:23,  1.73s/it]

Total Reward for Game 24: -27.1
Game Outcome: 0-1
Game Length: 24
Game: 25


Training:   0%|          | 26/10000 [00:50<5:03:05,  1.82s/it]

Total Reward for Game 25: -43.60000000000001
Game Outcome: 1-0
Game Length: 29
Game: 26
Total Reward for Game 26: -34.1

Training:   0%|          | 27/10000 [00:52<5:02:45,  1.82s/it]


Game Outcome: 1-0
Game Length: 26
Game: 27


Training:   0%|          | 28/10000 [00:54<4:53:17,  1.76s/it]

Total Reward for Game 27: -31.3
Game Outcome: 1-0
Game Length: 23
Game: 28


Training:   0%|          | 29/10000 [00:55<4:57:47,  1.79s/it]

Total Reward for Game 28: -34.6
Game Outcome: 0-1
Game Length: 27
Game: 29


Training:   0%|          | 30/10000 [00:57<4:42:23,  1.70s/it]

Total Reward for Game 29: -33.10000000000001
Game Outcome: 1-0
Game Length: 22
Game: 30


Training:   0%|          | 31/10000 [00:58<4:34:35,  1.65s/it]

Total Reward for Game 30: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 31


Training:   0%|          | 32/10000 [01:00<4:47:30,  1.73s/it]

Total Reward for Game 31: -38.900000000000006
Game Outcome: 0-1
Game Length: 29
Game: 32


Training:   0%|          | 33/10000 [01:02<4:54:15,  1.77s/it]

Total Reward for Game 32: -37.3
Game Outcome: 0-1
Game Length: 28
Game: 33


Training:   0%|          | 34/10000 [01:04<4:52:19,  1.76s/it]

Total Reward for Game 33: -31.6
Game Outcome: 0-1
Game Length: 25
Game: 34


Training:   0%|          | 35/10000 [01:05<4:42:50,  1.70s/it]

Total Reward for Game 34: -25.500000000000004
Game Outcome: 0-1
Game Length: 24
Game: 35


Training:   0%|          | 36/10000 [01:08<5:23:47,  1.95s/it]

Total Reward for Game 35: -51.70000000000001
Game Outcome: 0-1
Game Length: 34
Game: 36


Training:   0%|          | 37/10000 [01:11<6:33:04,  2.37s/it]

Total Reward for Game 36: -86.2
Game Outcome: 1-0
Game Length: 44
Game: 37


Training:   0%|          | 38/10000 [01:13<6:11:42,  2.24s/it]

Total Reward for Game 37: -33.800000000000004
Game Outcome: 1-0
Game Length: 25
Game: 38


Training:   0%|          | 39/10000 [01:15<5:44:50,  2.08s/it]

Total Reward for Game 38: -33.300000000000004
Game Outcome: 1-0
Game Length: 24
Game: 39


Training:   0%|          | 40/10000 [01:17<5:38:43,  2.04s/it]

Total Reward for Game 39: -37.50000000000001
Game Outcome: 0-1
Game Length: 27
Game: 40


Training:   0%|          | 41/10000 [01:19<5:22:07,  1.94s/it]

Total Reward for Game 40: -37.30000000000001
Game Outcome: 1-0
Game Length: 24
Game: 41


Training:   0%|          | 42/10000 [01:21<5:35:15,  2.02s/it]

Total Reward for Game 41: -45.20000000000001
Game Outcome: 0-1
Game Length: 31
Game: 42


Training:   0%|          | 43/10000 [01:23<5:17:38,  1.91s/it]

Total Reward for Game 42: -32.10000000000001
Game Outcome: 1-0
Game Length: 22
Game: 43


Training:   0%|          | 44/10000 [01:25<5:30:24,  1.99s/it]

Total Reward for Game 43: -47.10000000000001
Game Outcome: 0-1
Game Length: 31
Game: 44


Training:   0%|          | 45/10000 [01:28<6:48:13,  2.46s/it]

Total Reward for Game 44: -90.40000000000005
Game Outcome: 1/2-1/2
Game Length: 49
Game: 45


Training:   0%|          | 46/10000 [01:31<7:23:51,  2.68s/it]

Total Reward for Game 45: -84.0
Game Outcome: 0-1
Game Length: 43
Game: 46


Training:   0%|          | 47/10000 [01:34<7:21:15,  2.66s/it]

Total Reward for Game 46: -58.39999999999999
Game Outcome: 1-0
Game Length: 36
Game: 47


Training:   0%|          | 48/10000 [01:36<6:45:01,  2.44s/it]

Total Reward for Game 47: -43.2
Game Outcome: 1-0
Game Length: 27
Game: 48


Training:   0%|          | 49/10000 [01:38<6:14:12,  2.26s/it]

Total Reward for Game 48: -35.6
Game Outcome: 1-0
Game Length: 24
Game: 49


Training:   0%|          | 50/10000 [01:41<6:38:11,  2.40s/it]

Total Reward for Game 49: -54.900000000000006
Game Outcome: 0-1
Game Length: 35
Game: 50


Training:   1%|          | 51/10000 [01:42<6:07:50,  2.22s/it]

Total Reward for Game 50: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 51


Training:   1%|          | 52/10000 [01:45<6:09:33,  2.23s/it]

Total Reward for Game 51: -45.50000000000001
Game Outcome: 1-0
Game Length: 29
Game: 52


Training:   1%|          | 53/10000 [01:46<5:39:32,  2.05s/it]

Total Reward for Game 52: -19.7
Game Outcome: 0-1
Game Length: 20
Game: 53


Training:   1%|          | 54/10000 [01:48<5:46:34,  2.09s/it]

Total Reward for Game 53: -42.900000000000006
Game Outcome: 1-0
Game Length: 29
Game: 54


Training:   1%|          | 55/10000 [01:50<5:18:27,  1.92s/it]

Total Reward for Game 54: -28.299999999999997
Game Outcome: 1-0
Game Length: 21
Game: 55


Training:   1%|          | 56/10000 [01:52<5:46:43,  2.09s/it]

Total Reward for Game 55: -52.49999999999999
Game Outcome: 0-1
Game Length: 35
Game: 56


Training:   1%|          | 57/10000 [01:55<6:27:01,  2.34s/it]

Total Reward for Game 56: -76.70000000000002
Game Outcome: 0-1
Game Length: 41
Game: 57


Training:   1%|          | 58/10000 [01:58<6:40:46,  2.42s/it]

Total Reward for Game 57: -59.80000000000002
Game Outcome: 0-1
Game Length: 37
Game: 58


Training:   1%|          | 59/10000 [02:00<6:07:24,  2.22s/it]

Total Reward for Game 58: -31.5
Game Outcome: 0-1
Game Length: 25
Game: 59


Training:   1%|          | 60/10000 [02:03<6:41:46,  2.43s/it]

Total Reward for Game 59: -75.10000000000002
Game Outcome: 1-0
Game Length: 39
Game: 60


Training:   1%|          | 61/10000 [02:05<6:49:29,  2.47s/it]

Total Reward for Game 60: -55.5
Game Outcome: 0-1
Game Length: 35
Game: 61


Training:   1%|          | 62/10000 [02:08<6:50:36,  2.48s/it]

Total Reward for Game 61: -53.00000000000001
Game Outcome: 1-0
Game Length: 34
Game: 62


Training:   1%|          | 63/10000 [02:10<6:55:23,  2.51s/it]

Total Reward for Game 62: -56.70000000000001
Game Outcome: 0-1
Game Length: 36
Game: 63


Training:   1%|          | 64/10000 [02:12<6:02:24,  2.19s/it]

Total Reward for Game 63: -21.099999999999998
Game Outcome: 1-0
Game Length: 20
Game: 64


Training:   1%|          | 65/10000 [02:14<5:49:11,  2.11s/it]

Total Reward for Game 64: -36.7
Game Outcome: 1-0
Game Length: 25
Game: 65


Training:   1%|          | 66/10000 [02:16<6:20:10,  2.30s/it]

Total Reward for Game 65: -51.50000000000001
Game Outcome: 0-1
Game Length: 33
Game: 66


Training:   1%|          | 67/10000 [02:19<6:55:37,  2.51s/it]

Total Reward for Game 66: -75.7
Game Outcome: 0-1
Game Length: 41
Game: 67


Training:   1%|          | 68/10000 [02:22<6:44:00,  2.44s/it]

Total Reward for Game 67: -47.60000000000001
Game Outcome: 0-1
Game Length: 31
Game: 68


Training:   1%|          | 69/10000 [02:24<6:52:47,  2.49s/it]

Total Reward for Game 68: -66.80000000000003
Game Outcome: 1-0
Game Length: 36
Game: 69


Training:   1%|          | 70/10000 [02:26<6:07:34,  2.22s/it]

Total Reward for Game 69: -26.200000000000003
Game Outcome: 0-1
Game Length: 23
Game: 70


Training:   1%|          | 71/10000 [02:28<5:40:28,  2.06s/it]

Total Reward for Game 70: -26.9
Game Outcome: 0-1
Game Length: 24
Game: 71


Training:   1%|          | 72/10000 [02:30<6:00:27,  2.18s/it]

Total Reward for Game 71: -49.6
Game Outcome: 0-1
Game Length: 33
Game: 72


Training:   1%|          | 73/10000 [02:32<6:12:34,  2.25s/it]

Total Reward for Game 72: -50.000000000000014
Game Outcome: 1-0
Game Length: 32
Game: 73


Training:   1%|          | 74/10000 [02:35<6:23:55,  2.32s/it]

Total Reward for Game 73: -45.0
Game Outcome: 0-1
Game Length: 33
Game: 74


Training:   1%|          | 75/10000 [02:37<6:16:54,  2.28s/it]

Total Reward for Game 74: -37.10000000000001
Game Outcome: 0-1
Game Length: 28
Game: 75


Training:   1%|          | 76/10000 [02:40<6:34:20,  2.38s/it]

Total Reward for Game 75: -60.39999999999999
Game Outcome: 1-0
Game Length: 34
Game: 76


Training:   1%|          | 77/10000 [02:42<6:36:54,  2.40s/it]

Total Reward for Game 76: -54.3
Game Outcome: 0-1
Game Length: 33
Game: 77


Training:   1%|          | 78/10000 [02:44<6:10:08,  2.24s/it]

Total Reward for Game 77: -34.4
Game Outcome: 0-1
Game Length: 26
Game: 78


Training:   1%|          | 79/10000 [02:46<5:54:21,  2.14s/it]

Total Reward for Game 78: -33.300000000000004
Game Outcome: 0-1
Game Length: 27
Game: 79


Training:   1%|          | 80/10000 [02:49<6:20:18,  2.30s/it]

Total Reward for Game 79: -50.500000000000014
Game Outcome: 0-1
Game Length: 33
Game: 80


Training:   1%|          | 81/10000 [02:51<6:14:58,  2.27s/it]

Total Reward for Game 80: -37.2
Game Outcome: 1-0
Game Length: 26
Game: 81


Training:   1%|          | 82/10000 [02:53<5:56:19,  2.16s/it]

Total Reward for Game 81: -21.2
Game Outcome: 0-1
Game Length: 25
Game: 82


Training:   1%|          | 83/10000 [02:56<6:53:08,  2.50s/it]

Total Reward for Game 82: -89.20000000000006
Game Outcome: 1/2-1/2
Game Length: 43
Game: 83


Training:   1%|          | 84/10000 [02:59<7:42:13,  2.80s/it]

Total Reward for Game 83: -87.10000000000004
Game Outcome: 1-0
Game Length: 44
Game: 84


Training:   1%|          | 85/10000 [03:01<6:55:36,  2.52s/it]

Total Reward for Game 84: -32.300000000000004
Game Outcome: 1-0
Game Length: 24
Game: 85


Training:   1%|          | 86/10000 [03:03<6:36:27,  2.40s/it]

Total Reward for Game 85: -35.9
Game Outcome: 0-1
Game Length: 26
Game: 86


Training:   1%|          | 87/10000 [03:05<6:04:28,  2.21s/it]

Total Reward for Game 86: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 87


Training:   1%|          | 88/10000 [03:07<5:50:45,  2.12s/it]

Total Reward for Game 87: -27.9
Game Outcome: 0-1
Game Length: 25
Game: 88


Training:   1%|          | 89/10000 [03:09<6:02:58,  2.20s/it]

Total Reward for Game 88: -37.1
Game Outcome: 0-1
Game Length: 29
Game: 89


Training:   1%|          | 90/10000 [03:11<5:48:14,  2.11s/it]

Total Reward for Game 89: -30.300000000000004
Game Outcome: 0-1
Game Length: 26
Game: 90


Training:   1%|          | 91/10000 [03:15<6:44:05,  2.45s/it]

Total Reward for Game 90: -80.30000000000003
Game Outcome: 0-1
Game Length: 43
Game: 91


Training:   1%|          | 92/10000 [03:17<6:29:24,  2.36s/it]

Total Reward for Game 91: -36.800000000000004
Game Outcome: 0-1
Game Length: 28
Game: 92


Training:   1%|          | 93/10000 [03:20<7:01:36,  2.55s/it]

Total Reward for Game 92: -70.50000000000001
Game Outcome: 0-1
Game Length: 39
Game: 93


Training:   1%|          | 94/10000 [03:22<6:52:19,  2.50s/it]

Total Reward for Game 93: -46.50000000000001
Game Outcome: 0-1
Game Length: 31
Game: 94


Training:   1%|          | 95/10000 [03:24<6:42:52,  2.44s/it]

Total Reward for Game 94: -43.4
Game Outcome: 0-1
Game Length: 31
Game: 95


Training:   1%|          | 96/10000 [03:27<6:40:53,  2.43s/it]

Total Reward for Game 95: -32.6
Game Outcome: 0-1
Game Length: 26
Game: 96


Training:   1%|          | 97/10000 [03:29<6:05:01,  2.21s/it]

Total Reward for Game 96: -30.000000000000007
Game Outcome: 1-0
Game Length: 21
Game: 97


Training:   1%|          | 98/10000 [03:31<6:02:42,  2.20s/it]

Total Reward for Game 97: -45.000000000000014
Game Outcome: 1-0
Game Length: 27
Game: 98


Training:   1%|          | 99/10000 [03:33<6:04:44,  2.21s/it]

Total Reward for Game 98: -38.0
Game Outcome: 0-1
Game Length: 29
Game: 99


Training:   1%|          | 100/10000 [03:35<6:12:48,  2.26s/it]

Total Reward for Game 99: -55.30000000000001
Game Outcome: 1-0
Game Length: 31
Game: 100


Training:   1%|          | 101/10000 [03:37<5:39:16,  2.06s/it]

Total Reward for Game 100: -23.200000000000003
Game Outcome: 0-1
Game Length: 21
Game: 101


Training:   1%|          | 102/10000 [03:39<5:29:02,  1.99s/it]

Total Reward for Game 101: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 102


Training:   1%|          | 103/10000 [03:42<6:12:28,  2.26s/it]

Total Reward for Game 102: -60.5
Game Outcome: 0-1
Game Length: 36
Game: 103


Training:   1%|          | 104/10000 [03:43<5:51:14,  2.13s/it]

Total Reward for Game 103: -30.1
Game Outcome: 0-1
Game Length: 25
Game: 104


Training:   1%|          | 105/10000 [03:46<6:05:10,  2.21s/it]

Total Reward for Game 104: -50.2
Game Outcome: 1-0
Game Length: 31
Game: 105


Training:   1%|          | 106/10000 [03:48<5:59:56,  2.18s/it]

Total Reward for Game 105: -43.400000000000006
Game Outcome: 0-1
Game Length: 28
Game: 106


Training:   1%|          | 107/10000 [03:50<5:27:31,  1.99s/it]

Total Reward for Game 106: -20.1
Game Outcome: 0-1
Game Length: 21
Game: 107


Training:   1%|          | 108/10000 [03:51<5:14:25,  1.91s/it]

Total Reward for Game 107: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 108


Training:   1%|          | 109/10000 [03:54<5:52:00,  2.14s/it]

Total Reward for Game 108: -56.7
Game Outcome: 0-1
Game Length: 34
Game: 109


Training:   1%|          | 110/10000 [03:57<6:15:18,  2.28s/it]

Total Reward for Game 109: -55.70000000000002
Game Outcome: 0-1
Game Length: 34
Game: 110


Training:   1%|          | 111/10000 [03:58<5:59:18,  2.18s/it]

Total Reward for Game 110: -39.9
Game Outcome: 1-0
Game Length: 25
Game: 111


Training:   1%|          | 112/10000 [04:01<6:18:10,  2.29s/it]

Total Reward for Game 111: -59.00000000000001
Game Outcome: 1-0
Game Length: 33
Game: 112


Training:   1%|          | 113/10000 [04:03<6:02:58,  2.20s/it]

Total Reward for Game 112: -27.700000000000003
Game Outcome: 0-1
Game Length: 24
Game: 113


Training:   1%|          | 114/10000 [04:05<5:44:08,  2.09s/it]

Total Reward for Game 113: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 114


Training:   1%|          | 115/10000 [04:07<6:00:55,  2.19s/it]

Total Reward for Game 114: -53.099999999999994
Game Outcome: 1-0
Game Length: 32
Game: 115


Training:   1%|          | 116/10000 [04:10<6:32:54,  2.39s/it]

Total Reward for Game 115: -64.1
Game Outcome: 1-0
Game Length: 37
Game: 116


Training:   1%|          | 117/10000 [04:13<6:55:19,  2.52s/it]

Total Reward for Game 116: -65.59999999999998
Game Outcome: 1/2-1/2
Game Length: 38
Game: 117


Training:   1%|          | 118/10000 [04:15<6:12:56,  2.26s/it]

Total Reward for Game 117: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 118


Training:   1%|          | 119/10000 [04:17<5:56:22,  2.16s/it]

Total Reward for Game 118: -33.800000000000004
Game Outcome: 0-1
Game Length: 26
Game: 119


Training:   1%|          | 120/10000 [04:19<6:23:21,  2.33s/it]

Total Reward for Game 119: -53.9
Game Outcome: 0-1
Game Length: 35
Game: 120


Training:   1%|          | 121/10000 [04:22<6:20:59,  2.31s/it]

Total Reward for Game 120: -38.300000000000004
Game Outcome: 0-1
Game Length: 29
Game: 121


Training:   1%|          | 122/10000 [04:24<6:21:34,  2.32s/it]

Total Reward for Game 121: -51.599999999999994
Game Outcome: 1-0
Game Length: 30
Game: 122


Training:   1%|          | 123/10000 [04:26<6:12:10,  2.26s/it]

Total Reward for Game 122: -39.6
Game Outcome: 1-0
Game Length: 27
Game: 123


Training:   1%|          | 124/10000 [04:28<5:55:50,  2.16s/it]

Total Reward for Game 123: -39.60000000000001
Game Outcome: 1-0
Game Length: 25
Game: 124


Training:   1%|▏         | 125/10000 [04:30<5:37:32,  2.05s/it]

Total Reward for Game 124: -32.0
Game Outcome: 0-1
Game Length: 24
Game: 125


Training:   1%|▏         | 126/10000 [04:32<5:40:40,  2.07s/it]

Total Reward for Game 125: -39.0
Game Outcome: 0-1
Game Length: 28
Game: 126


Training:   1%|▏         | 127/10000 [04:34<5:24:35,  1.97s/it]

Total Reward for Game 126: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 127


Training:   1%|▏         | 128/10000 [04:36<5:54:36,  2.16s/it]

Total Reward for Game 127: -54.4
Game Outcome: 0-1
Game Length: 33
Game: 128


Training:   1%|▏         | 129/10000 [04:38<5:33:58,  2.03s/it]

Total Reward for Game 128: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 129


Training:   1%|▏         | 130/10000 [04:41<6:01:18,  2.20s/it]

Total Reward for Game 129: -46.000000000000014
Game Outcome: 1-0
Game Length: 29
Game: 130


Training:   1%|▏         | 131/10000 [04:43<6:16:48,  2.29s/it]

Total Reward for Game 130: -49.100000000000016
Game Outcome: 0-1
Game Length: 33
Game: 131


Training:   1%|▏         | 132/10000 [04:45<5:50:37,  2.13s/it]

Total Reward for Game 131: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 132


Training:   1%|▏         | 133/10000 [04:48<7:02:56,  2.57s/it]

Total Reward for Game 132: -87.30000000000005
Game Outcome: 1-0
Game Length: 45
Game: 133


Training:   1%|▏         | 134/10000 [04:50<6:35:44,  2.41s/it]

Total Reward for Game 133: -25.800000000000008
Game Outcome: 0-1
Game Length: 26
Game: 134


Training:   1%|▏         | 135/10000 [04:53<6:41:40,  2.44s/it]

Total Reward for Game 134: -47.00000000000001
Game Outcome: 0-1
Game Length: 32
Game: 135


Training:   1%|▏         | 136/10000 [04:55<6:28:32,  2.36s/it]

Total Reward for Game 135: -40.6
Game Outcome: 0-1
Game Length: 28
Game: 136


Training:   1%|▏         | 137/10000 [04:58<6:30:37,  2.38s/it]

Total Reward for Game 136: -44.30000000000001
Game Outcome: 0-1
Game Length: 31
Game: 137


Training:   1%|▏         | 138/10000 [04:59<6:05:18,  2.22s/it]

Total Reward for Game 137: -37.60000000000001
Game Outcome: 1-0
Game Length: 24
Game: 138


Training:   1%|▏         | 139/10000 [05:01<5:53:31,  2.15s/it]

Total Reward for Game 138: -34.00000000000001
Game Outcome: 0-1
Game Length: 27
Game: 139


Training:   1%|▏         | 140/10000 [05:05<6:43:08,  2.45s/it]

Total Reward for Game 139: -76.69999999999997
Game Outcome: 1/2-1/2
Game Length: 40
Game: 140


Training:   1%|▏         | 141/10000 [05:06<6:10:51,  2.26s/it]

Total Reward for Game 140: -27.0
Game Outcome: 0-1
Game Length: 24
Game: 141


Training:   1%|▏         | 142/10000 [05:08<5:48:40,  2.12s/it]

Total Reward for Game 141: -28.9
Game Outcome: 0-1
Game Length: 24
Game: 142


Training:   1%|▏         | 143/10000 [05:11<6:04:47,  2.22s/it]

Total Reward for Game 142: -47.9
Game Outcome: 0-1
Game Length: 32
Game: 143


Training:   1%|▏         | 144/10000 [05:13<5:50:41,  2.13s/it]

Total Reward for Game 143: -32.300000000000004
Game Outcome: 0-1
Game Length: 26
Game: 144


Training:   1%|▏         | 145/10000 [05:16<7:07:44,  2.60s/it]

Total Reward for Game 144: -83.69999999999999
Game Outcome: 0-1
Game Length: 44
Game: 145


Training:   1%|▏         | 146/10000 [05:18<6:21:01,  2.32s/it]

Total Reward for Game 145: -33.300000000000004
Game Outcome: 1-0
Game Length: 22
Game: 146


Training:   1%|▏         | 147/10000 [05:20<6:35:19,  2.41s/it]

Total Reward for Game 146: -51.300000000000004
Game Outcome: 0-1
Game Length: 34
Game: 147


Training:   1%|▏         | 148/10000 [05:24<7:11:48,  2.63s/it]

Total Reward for Game 147: -67.40000000000002
Game Outcome: 0-1
Game Length: 41
Game: 148


Training:   1%|▏         | 149/10000 [05:25<6:29:13,  2.37s/it]

Total Reward for Game 148: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 149


Training:   2%|▏         | 150/10000 [05:28<6:34:50,  2.41s/it]

Total Reward for Game 149: -50.6
Game Outcome: 0-1
Game Length: 32
Game: 150


Training:   2%|▏         | 151/10000 [05:30<6:17:17,  2.30s/it]

Total Reward for Game 150: -36.6
Game Outcome: 0-1
Game Length: 27
Game: 151


Training:   2%|▏         | 152/10000 [05:32<5:50:52,  2.14s/it]

Total Reward for Game 151: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 152


Training:   2%|▏         | 153/10000 [05:33<5:20:37,  1.95s/it]

Total Reward for Game 152: -18.6
Game Outcome: 0-1
Game Length: 20
Game: 153


Training:   2%|▏         | 154/10000 [05:35<5:36:32,  2.05s/it]

Total Reward for Game 153: -55.49999999999999
Game Outcome: 0-1
Game Length: 30
Game: 154


Training:   2%|▏         | 155/10000 [05:38<6:16:10,  2.29s/it]

Total Reward for Game 154: -60.5
Game Outcome: 0-1
Game Length: 36
Game: 155


Training:   2%|▏         | 156/10000 [05:40<5:40:18,  2.07s/it]

Total Reward for Game 155: -27.500000000000007
Game Outcome: 1-0
Game Length: 20
Game: 156


Training:   2%|▏         | 157/10000 [05:41<5:11:04,  1.90s/it]

Total Reward for Game 156: -23.900000000000006
Game Outcome: 1-0
Game Length: 19
Game: 157


Training:   2%|▏         | 158/10000 [05:44<5:36:43,  2.05s/it]

Total Reward for Game 157: -45.20000000000001
Game Outcome: 0-1
Game Length: 31
Game: 158


Training:   2%|▏         | 159/10000 [05:46<5:24:40,  1.98s/it]

Total Reward for Game 158: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 159


Training:   2%|▏         | 160/10000 [05:48<5:38:03,  2.06s/it]

Total Reward for Game 159: -46.0
Game Outcome: 1-0
Game Length: 28
Game: 160


Training:   2%|▏         | 161/10000 [05:50<5:50:42,  2.14s/it]

Total Reward for Game 160: -43.60000000000001
Game Outcome: 1-0
Game Length: 27
Game: 161


Training:   2%|▏         | 162/10000 [05:52<5:45:56,  2.11s/it]

Total Reward for Game 161: -31.800000000000004
Game Outcome: 0-1
Game Length: 25
Game: 162


Training:   2%|▏         | 163/10000 [05:55<5:56:49,  2.18s/it]

Total Reward for Game 162: -40.20000000000001
Game Outcome: 0-1
Game Length: 29
Game: 163


Training:   2%|▏         | 164/10000 [05:56<5:35:21,  2.05s/it]

Total Reward for Game 163: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 164


Training:   2%|▏         | 165/10000 [05:58<5:23:32,  1.97s/it]

Total Reward for Game 164: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 165


Training:   2%|▏         | 166/10000 [06:00<5:17:42,  1.94s/it]

Total Reward for Game 165: -29.799999999999997
Game Outcome: 0-1
Game Length: 24
Game: 166


Training:   2%|▏         | 167/10000 [06:02<5:18:26,  1.94s/it]

Total Reward for Game 166: -33.10000000000001
Game Outcome: 0-1
Game Length: 25
Game: 167


Training:   2%|▏         | 168/10000 [06:04<5:46:21,  2.11s/it]

Total Reward for Game 167: -49.00000000000001
Game Outcome: 1-0
Game Length: 31
Game: 168


Training:   2%|▏         | 169/10000 [06:07<6:14:49,  2.29s/it]

Total Reward for Game 168: -60.10000000000001
Game Outcome: 1-0
Game Length: 33
Game: 169


Training:   2%|▏         | 170/10000 [06:09<6:01:18,  2.21s/it]

Total Reward for Game 169: -39.900000000000006
Game Outcome: 1-0
Game Length: 25
Game: 170


Training:   2%|▏         | 171/10000 [06:11<5:35:06,  2.05s/it]

Total Reward for Game 170: -23.1
Game Outcome: 0-1
Game Length: 22
Game: 171


Training:   2%|▏         | 172/10000 [06:13<5:54:54,  2.17s/it]

Total Reward for Game 171: -54.9
Game Outcome: 1-0
Game Length: 31
Game: 172


Training:   2%|▏         | 173/10000 [06:15<5:48:58,  2.13s/it]

Total Reward for Game 172: -32.5
Game Outcome: 1-0
Game Length: 26
Game: 173


Training:   2%|▏         | 174/10000 [06:17<5:43:26,  2.10s/it]

Total Reward for Game 173: -32.7
Game Outcome: 0-1
Game Length: 26
Game: 174


Training:   2%|▏         | 175/10000 [06:19<5:29:01,  2.01s/it]

Total Reward for Game 174: -32.4
Game Outcome: 1-0
Game Length: 23
Game: 175


Training:   2%|▏         | 176/10000 [06:21<5:14:45,  1.92s/it]

Total Reward for Game 175: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 176


Training:   2%|▏         | 177/10000 [06:23<5:07:06,  1.88s/it]

Total Reward for Game 176: -23.200000000000003
Game Outcome: 0-1
Game Length: 23
Game: 177


Training:   2%|▏         | 178/10000 [06:27<6:45:55,  2.48s/it]

Total Reward for Game 177: -93.10000000000007
Game Outcome: 1/2-1/2
Game Length: 47
Game: 178


Training:   2%|▏         | 179/10000 [06:29<6:57:52,  2.55s/it]

Total Reward for Game 178: -53.900000000000006
Game Outcome: 1-0
Game Length: 33
Game: 179


Training:   2%|▏         | 180/10000 [06:32<6:59:20,  2.56s/it]

Total Reward for Game 179: -60.800000000000026
Game Outcome: 1-0
Game Length: 33
Game: 180


Training:   2%|▏         | 181/10000 [06:37<9:27:15,  3.47s/it]

Total Reward for Game 180: -152.2
Game Outcome: 1/2-1/2
Game Length: 68
Game: 181


Training:   2%|▏         | 182/10000 [06:40<8:39:45,  3.18s/it]

Total Reward for Game 181: -54.00000000000001
Game Outcome: 1-0
Game Length: 31
Game: 182


Training:   2%|▏         | 183/10000 [06:44<9:01:18,  3.31s/it]

Total Reward for Game 182: -89.50000000000003
Game Outcome: 1/2-1/2
Game Length: 44
Game: 183


Training:   2%|▏         | 184/10000 [06:45<7:45:33,  2.85s/it]

Total Reward for Game 183: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 184


Training:   2%|▏         | 185/10000 [06:47<7:12:45,  2.65s/it]

Total Reward for Game 184: -34.900000000000006
Game Outcome: 0-1
Game Length: 27
Game: 185


Training:   2%|▏         | 186/10000 [06:50<7:12:42,  2.65s/it]

Total Reward for Game 185: -57.90000000000001
Game Outcome: 0-1
Game Length: 34
Game: 186


Training:   2%|▏         | 187/10000 [06:52<6:26:58,  2.37s/it]

Total Reward for Game 186: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 187


Training:   2%|▏         | 188/10000 [06:54<5:59:05,  2.20s/it]

Total Reward for Game 187: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 188


Training:   2%|▏         | 189/10000 [06:55<5:39:14,  2.07s/it]

Total Reward for Game 188: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 189


Training:   2%|▏         | 190/10000 [06:57<5:27:28,  2.00s/it]

Total Reward for Game 189: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 190


Training:   2%|▏         | 191/10000 [06:59<5:28:06,  2.01s/it]

Total Reward for Game 190: -30.5
Game Outcome: 0-1
Game Length: 25
Game: 191


Training:   2%|▏         | 192/10000 [07:02<5:56:30,  2.18s/it]

Total Reward for Game 191: -43.20000000000001
Game Outcome: 0-1
Game Length: 29
Game: 192


Training:   2%|▏         | 193/10000 [07:04<5:45:05,  2.11s/it]

Total Reward for Game 192: -35.10000000000001
Game Outcome: 1-0
Game Length: 23
Game: 193


Training:   2%|▏         | 194/10000 [07:05<5:25:04,  1.99s/it]

Total Reward for Game 193: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 194


Training:   2%|▏         | 195/10000 [07:08<5:48:41,  2.13s/it]

Total Reward for Game 194: -49.400000000000006
Game Outcome: 0-1
Game Length: 32
Game: 195


Training:   2%|▏         | 196/10000 [07:09<5:18:21,  1.95s/it]

Total Reward for Game 195: -20.1
Game Outcome: 0-1
Game Length: 20
Game: 196


Training:   2%|▏         | 197/10000 [07:11<5:08:56,  1.89s/it]

Total Reward for Game 196: -32.400000000000006
Game Outcome: 1-0
Game Length: 22
Game: 197


Training:   2%|▏         | 197/10000 [07:11<5:58:07,  2.19s/it]


KeyboardInterrupt: 