In [5]:
import tensorflow as tf
import numpy as np
import chess
import chess.pgn
import random
from collections import deque
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense
from tensorflow.keras.models import Model
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import TensorBoard
from tqdm import tqdm

# Load PGN data and other utility functions
def load_pgn_data(pgn_file_path):
    print("Loading PGN data from {}...".format(pgn_file_path))
    pgn_data = []
    with open(pgn_file_path) as pgn:
        while True:
            game = chess.pgn.read_game(pgn)
            if game is None:
                break
            board = game.board()
            for move in game.mainline_moves():
                input_array = board_to_input_array(board)
                output_array = move_to_output_array(move, board.legal_moves)
                pgn_data.append((input_array, output_array))
                board.push(move)
    return pgn_data

def board_to_input_array(board):
    board_array = np.zeros((8, 8, 12), dtype=np.uint8)
    piece_mapping = {'r': 0, 'n': 1, 'b': 2, 'q': 3, 'k': 4, 'p': 5, 'R': 6, 'N': 7, 'B': 8, 'Q': 9, 'K': 10, 'P': 11}
    for square, piece in board.piece_map().items():
        piece_type = piece_mapping[piece.symbol()]
        color = int(piece.color)
        board_array[square // 8, square % 8, piece_type] = color + 1
    return board_array

def move_to_output_array(move, legal_moves):
    output_array = np.zeros(action_space_size)
    move_index = list(legal_moves).index(move)
    output_array[move_index] = 1
    return output_array

def state_to_index(board):
    board_array = np.array(board_to_input_array(board))
    return hash(board_array.tostring()) % state_space_size[0]

def choose_action(board, model):
    if np.random.rand() < exploration_prob:
        return np.random.choice(list(board.legal_moves))
    else:
        state_index = state_to_index(board)
        legal_moves_list = list(board.legal_moves)
        if not legal_moves_list:
            return chess.Move.null()
        q_values = model.predict(np.array([board_to_input_array(board)]))[0]
        best_move_index = np.argmax(q_values)
        best_move_uci = legal_moves_list[min(best_move_index, len(legal_moves_list)-1)].uci()
        return chess.Move.from_uci(best_move_uci)

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.9
exploration_prob = 0.2
state_space_size = (8, 8, 12)
action_space_size = 4096

# Initialize a deque for experience replay
experience_replay_buffer = deque(maxlen=1000)

# Create a policy network
def create_policy_network():
    input_layer = Input(shape=state_space_size)
    conv1 = Conv2D(128, (3, 3), activation='relu', padding='same')(input_layer)
    conv2 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv1)
    flatten_layer = Flatten()(conv2)
    dense1 = Dense(128, activation='relu')(flatten_layer)
    dense2 = Dense(64, activation='relu')(dense1)
    output_probs = Dense(action_space_size, activation='softmax')(dense2)  # Output size matches action_space_size
    model = Model(inputs=input_layer, outputs=output_probs)
    return model


policy_model = create_policy_network()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Function to choose an action using the policy network
def choose_policy_action(board, model):
    state_array = board_to_input_array(board)
    state_array = np.expand_dims(state_array, axis=0)
    action_probs = model.predict(state_array)[0]
    legal_moves_list = list(board.legal_moves)
    action_index = np.random.choice(range(len(legal_moves_list)), p=action_probs)
    return legal_moves_list[action_index]

# Define a training loop with REINFORCE (Policy Gradient)
def train_policy_network(policy_model, episodes):
    for episode in range(episodes):
        game_states, _, total_reward = play_game()  # Play a game and collect states and rewards
        for t, state in enumerate(game_states):
            # Compute discounted rewards
            discounted_rewards = [total_reward - sum(game_states[i + t:])[2] for i, _ in enumerate(game_states[t:])]
            state_array = board_to_input_array(state)
            state_array = np.expand_dims(state_array, axis=0)
            action_probs = policy_model.predict(state_array)[0]
            chosen_action = list(state.legal_moves).index(game_states[t + 1].pop())
            log_prob = tf.math.log(action_probs[chosen_action])
            loss = -log_prob * discounted_rewards[t]  # REINFORCE loss
            gradients = tape.gradient(loss, policy_model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, policy_model.trainable_variables))

# Function to play a game
def play_game():
    board = chess.Board()
    game_states = []
    total_reward = 0  # Initialize total_reward

    while not board.is_game_over():
        state = board.copy()
        game_states.append(state.copy())

        # Choose an action using the policy network
        policy_move = choose_policy_action(board, policy_model)

        if policy_move in board.legal_moves:
            board.push(policy_move)
        else:
            print("Invalid move. Try again.")
            continue

        reward = 0

        if board.is_game_over():
            break

        stockfish.set_fen_position(board.fen())
        stockfish_move_uci = stockfish.get_best_move()
        stockfish_move = chess.Move.from_uci(stockfish_move_uci)
        next_state = board.copy()
        board.push(stockfish_move)

        if next_state.is_check():
            reward = 0.1

        if board.result() == "1-0":
            reward += 100  # Win
        elif board.result() == "0-1":
            move_number = len(game_states)
            reward -= 100  # Loss
        elif board.result() == "1/2-1/2":
            reward -= 100  # Draw

        # Capture reward
        if board.is_capture(policy_move):
            reward += piece_value(board.piece_at(policy_move.to_square))

        if board.is_capture(stockfish_move):
            reward -= piece_value(board.piece_at(stockfish_move.to_square))

        # Update policy network with rewards
        discounted_rewards = [total_reward - sum(game_states[i + 1:])[2] for i, _ in enumerate(game_states)]
        state_array = board_to_input_array(state)
        state_array = np.expand_dims(state_array, axis=0)
        action_probs = policy_model.predict(state_array)[0]
        chosen_action = list(state.legal_moves).index(policy_move)
        log_prob = tf.math.log(action_probs[chosen_action])
        loss = -log_prob * discounted_rewards[len(game_states) - 1]  # REINFORCE loss
        gradients = tape.gradient(loss, policy_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, policy_model.trainable_variables))

        # Accumulate the reward
        total_reward += reward

    game_states.append(board.copy())
    return game_states, board.result(), total_reward


log_dir = "logs/"
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
num_games = 1
total_rewards = []
results = {"white_wins": 0, "black_wins": 0, "draws": 0}
outcomes = []


reward_trend = []

for episode in tqdm(range(num_games), desc="Training"):
    exploration_prob *= 0.99  # Decay exploration probability
    print("Game:", episode)
    game_states, result, total_reward = play_game()

    # Append the total reward to the reward trend list
    reward_trend.append(total_reward)

    # Update results based on the game outcome
    if result == "1-0":
        results["white_wins"] += 1
        outcomes.append(1)
    elif result == "0-1":
        results["black_wins"] += 1
        outcomes.append(0)
    elif result == "1/2-1/2":
        results["draws"] += 1
        outcomes.append(0.5)  # Fix here: Append 0.5 for draws

    total_rewards.append(total_reward)
    
    # Display the total reward for each game
    print("Total Reward for Game {}: {}".format(episode, total_reward))
    print("Game Outcome:", result)
    #game length
    print("Game Length:", len(game_states))

# Display statistics
average_reward = sum(total_rewards) / num_games
print("Average Total Reward:", average_reward)

# Extract the FEN of the final position
final_position_fen = game_states[-1].fen()
print("Final Position FEN:", final_position_fen)

# Display the last game
for state in game_states:
    display_chess_board(state)

# Display results
print("\nResults Summary:")
print("White Wins:", results["white_wins"])
print("Black Wins:", results["black_wins"])
print("Draws:", results["draws"])

# Plot trend lines
plt.plot(outcomes, label="Game Outcomes")
plt.xlabel("Games")
plt.ylabel("Outcome (1 for White Win, 0 for Draw, 0.5 for Loss)")
plt.legend()
plt.title("Game Outcomes Trend")

# Plot rewards
plt.figure()
plt.plot(reward_trend, label="Rewards")
plt.xlabel("Games")
plt.ylabel("Total Reward")
plt.legend()
plt.title("Reward Trend")

plt.show()


Training:   0%|          | 0/1 [00:00<?, ?it/s]

Game: 0


Training:   0%|          | 0/1 [00:00<?, ?it/s]


ValueError: 'a' and 'p' must have same size