In [27]:
from stockfish import Stockfish
import chess

stockfish_path = "/Users/benitorusconi/Documents/CDS/05_HS23/Reinforcement Learning (cds-117)/engine/stockfish"
stockfish = Stockfish(path=stockfish_path)

def print_board(board):
    print(board)

def play_game():
    board = chess.Board()

    print("Chess game against Stockfish\n")

    while not board.is_game_over():
        print_board(board)

        # Player's move
        player_move = input("Your move (in algebraic notation): ")
        if chess.Move.from_uci(player_move) in board.legal_moves:
            board.push_uci(player_move)
        else:
            print("Invalid move. Try again.")
            continue

        if board.is_game_over():
            break

        # Stockfish's move
        stockfish.set_fen_position(board.fen())
        stockfish_move = stockfish.get_best_move()
        print("Stockfish's move:", stockfish_move)
        board.push_uci(stockfish_move)

    print("\nGame Over")
    print("Result:", board.result())

# Play the game
#play_game()


In [28]:
from stockfish import Stockfish
import chess
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.compat.v1.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from IPython.display import display, HTML
import chess.svg

stockfish_path = "/Users/benitorusconi/Documents/CDS/05_HS23/Reinforcement Learning (cds-117)/engine/stockfish"
stockfish = Stockfish(path=stockfish_path)

learning_rate = 0.001
discount_factor = 0.9
exploration_prob = 0.8

state_space_size = 64
action_space_size = 4096
q_table = np.zeros((state_space_size, action_space_size))

model = Sequential([
    Dense(64, input_shape=(state_space_size,), activation='relu'),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(1028, activation='relu'),
    Dense(2048, activation='relu'),
    Dense(action_space_size, activation='linear')
])

model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

def state_to_index(board):
    return hash(board.fen()) % state_space_size

def choose_action(board):
    if np.random.rand() < exploration_prob:
        return np.random.choice(list(board.legal_moves))
    else:
        state_index = state_to_index(board)
        legal_moves_list = list(board.legal_moves)
        if not legal_moves_list:
            return chess.Move.null()
        best_move_index = np.argmax(q_table[state_index])
        best_move_uci = legal_moves_list[min(best_move_index, len(legal_moves_list)-1)].uci()
        return chess.Move.from_uci(best_move_uci)

def update_q_table(state, action, reward, next_state):
    state_index = state_to_index(state)
    next_state_index = state_to_index(next_state)
    action_index = list(state.legal_moves).index(action)
    best_next_action = np.argmax(q_table[next_state_index])
    q_table[state_index, action_index] += learning_rate * (
        reward + discount_factor * q_table[next_state_index, best_next_action] - q_table[state_index, action_index]
    )

def display_chess_board(board):
    return display(HTML(chess.svg.board(board=board, size=400)))

def play_game():
    board = chess.Board()
    game_states = []

    while not board.is_game_over():
        state = board.copy()
        game_states.append(state.copy())

        rl_move = choose_action(board)
        if rl_move in board.legal_moves:
            board.push(rl_move)
        else:
            print("Invalid move. Try again.")
            continue

        reward = 0

        if board.is_game_over():
            break

        stockfish.set_fen_position(board.fen())
        stockfish_move_uci = stockfish.get_best_move()
        stockfish_move = chess.Move.from_uci(stockfish_move_uci)
        next_state = board.copy()
        board.push(stockfish_move)

        if next_state.is_check():
            reward = 0.5

        if board.result() == "1-0":
            reward = 1000 # Win
        elif board.result() == "0-1":
            reward = -1000  # Loss
        elif board.result() == "1/2-1/2":
            reward = 100  # Draw

        # Capture rewards based on piece values
        if board.is_capture(rl_move):
            captured_piece_value = piece_value(board.piece_at(rl_move.to_square))
            reward += captured_piece_value

        if board.is_capture(stockfish_move):
            captured_piece_value = piece_value(board.piece_at(stockfish_move.to_square))
            reward -= (captured_piece_value *1)

        update_q_table(state, rl_move, reward, next_state)

    game_states.append(board.copy())
    return game_states, board.result()

def piece_value(piece):
    # Assign values to pieces
    if piece is None:
        return 0
    elif piece.piece_type == chess.PAWN:
        return 1
    elif piece.piece_type == chess.KNIGHT:
        return 3
    elif piece.piece_type == chess.BISHOP:
        return 3
    elif piece.piece_type == chess.ROOK:
        return 5
    elif piece.piece_type == chess.QUEEN:
        return 9
    elif piece.piece_type == chess.KING:
        return 100


log_dir = "logs/"
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

num_games = 1000
for episode in range(num_games):
    print("Game:", episode, "number of moves:", len(game_states))
    total_reward = 0
    steps = 0
    game_states, result = play_game()

    with tf.summary.create_file_writer(log_dir).as_default():
        tf.summary.scalar('Total Reward', total_reward, step=episode)
        tf.summary.scalar('Steps', steps, step=episode)
        tf.summary.flush()

        for layer in model.layers:
            for weight in layer.weights:
                tf.summary.histogram(weight.name, weight, step=episode)

    if episode == num_games - 1:
        # Display the last game
        for state in game_states:
            display_chess_board(state)


Game: 0 number of moves: 14
Game: 1 number of moves: 19
Game: 2 number of moves: 16
Game: 3 number of moves: 11
Game: 4 number of moves: 18
Game: 5 number of moves: 13
Game: 6 number of moves: 18
Game: 7 number of moves: 12
Game: 8 number of moves: 12
Game: 9 number of moves: 7
Game: 10 number of moves: 11
Game: 11 number of moves: 15
Game: 12 number of moves: 4
Game: 13 number of moves: 14
Game: 14 number of moves: 14
Game: 15 number of moves: 14
Game: 16 number of moves: 8
Game: 17 number of moves: 16
Game: 18 number of moves: 10
Game: 19 number of moves: 12
Game: 20 number of moves: 21
Game: 21 number of moves: 15
Game: 22 number of moves: 17
Game: 23 number of moves: 8
Game: 24 number of moves: 12
Game: 25 number of moves: 14
Game: 26 number of moves: 9
Game: 27 number of moves: 13
Game: 28 number of moves: 16
Game: 29 number of moves: 12
Game: 30 number of moves: 12
Game: 31 number of moves: 12
Game: 32 number of moves: 10
Game: 33 number of moves: 6
Game: 34 number of moves: 14
G

CNN Version

In [None]:
from collections import deque
import random
from stockfish import Stockfish
import chess
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.compat.v1.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from IPython.display import display, HTML
import chess.svg

# Path to Stockfish engine
stockfish_path = "/Users/benitorusconi/Documents/CDS/05_HS23/Reinforcement Learning (cds-117)/engine/stockfish"
stockfish = Stockfish(path=stockfish_path)

# Hyperparameters
learning_rate = 0.01
discount_factor = 0.95
exploration_prob = 0.5

# Neural Network Architecture
state_space_size = (8, 8, 12)  # 8x8 board with 12 channels (one for each piece type and color)
action_space_size = 4096

# Initialize a deque for experience replay
experience_replay_buffer = deque(maxlen=10000)

# Neural Network Model
model = Sequential([
    Conv2D(64, (3, 3), input_shape=state_space_size, activation='relu'),
    Conv2D(128, (3, 3), activation='relu'),
    Conv2D(256, (3, 3), activation='relu'),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(1028, activation='relu'),
    Dense(2048, activation='relu'),

    Dense(action_space_size, activation='linear')
])

model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

def state_to_index(board):
    board_array = np.array(board_to_input_array(board))
    return hash(board_array.tostring()) % state_space_size[0]

def board_to_input_array(board):
    board_array = np.zeros((8, 8, 12), dtype=np.uint8)
    piece_mapping = {'r': 0, 'n': 1, 'b': 2, 'q': 3, 'k': 4, 'p': 5, 'R': 6, 'N': 7, 'B': 8, 'Q': 9, 'K': 10, 'P': 11}
    
    for square, piece in board.piece_map().items():
        piece_type = piece_mapping[piece.symbol()]
        color = int(piece.color)
        board_array[square // 8, square % 8, piece_type] = color + 1  # Use 0 for empty squares

    return board_array

def choose_action(board):
    if np.random.rand() < exploration_prob:
        return np.random.choice(list(board.legal_moves))
    else:
        state_index = state_to_index(board)
        legal_moves_list = list(board.legal_moves)
        if not legal_moves_list:
            return chess.Move.null()
        q_values = model.predict(np.array([board_to_input_array(board)]))[0]
        best_move_index = np.argmax(q_values)
        best_move_uci = legal_moves_list[min(best_move_index, len(legal_moves_list)-1)].uci()
        return chess.Move.from_uci(best_move_uci)
    



def piece_coordination_reward(board, current_move):
    # Evaluate piece coordination as the average number of legal moves for all pieces before and after the move
    total_moves_before = 0
    total_moves_after = 0
    total_pieces = 0

    for square, piece in board.piece_map().items():
        if piece.color == board.turn:
            legal_moves_before = list(board.legal_moves)
            total_moves_before += len(legal_moves_before)

            # Make a hypothetical move and evaluate legal moves after the move
            board_copy = board.copy()
            board_copy.push(current_move)
            legal_moves_after = list(board_copy.legal_moves)
            total_moves_after += len(legal_moves_after)

            total_pieces += 1

    # Calculate the change in total moves
    moves_change = total_moves_after - total_moves_before

    # Normalize the reward to be between -1 and 1
    max_moves = max(total_moves_before / total_pieces, 1)
    normalized_reward = (total_moves_before / total_pieces - 1) / max_moves

    # Adjust the reward based on the change in total moves
    reward_adjustment = 0.1
    normalized_reward += reward_adjustment * moves_change / max_moves

    # Scale the reward between -1 and 1
    normalized_reward = max(min(normalized_reward, 1), -1)

    #print("Piece coordination reward:", normalized_reward)
    #print("Total moves before:", total_moves_before)
    #print("Total moves after:", total_moves_after)

    return normalized_reward



def update_q_table(state, action, reward, next_state):
    state_index = state_to_index(state)
    next_state_index = state_to_index(next_state)
    action_index = list(state.legal_moves).index(action)
    
    # Calculate the additional rewards
    piece_coordination_reward_value = piece_coordination_reward(state, action)

    # Combine the rewards with weights (you can adjust the weights as needed)
    #total_reward = reward + 0.01 * piece_coordination_reward_value
    total_reward = reward

    # Store the experience in the replay buffer
    experience_replay_buffer.append((state_index, action_index, total_reward, next_state_index))

    # Sample a batch from the replay buffer for training
    batch_size = min(len(experience_replay_buffer), 8)
    if batch_size > 0:
        batch = np.array(random.sample(experience_replay_buffer, batch_size))
        states = np.array([board_to_input_array(chess.Board(fen=chess.STARTING_FEN)) for _ in batch[:, 0]])
        next_states = np.array([board_to_input_array(chess.Board(fen=chess.STARTING_FEN)) for _ in batch[:, 3]])
        q_values = model.predict(states)
        next_q_values = model.predict(next_states)
        
        for i in range(batch_size):
            action_idx = int(batch[i, 1])  # Cast to integer
            q_values[i, action_idx] += learning_rate * (
                batch[i, 2] + discount_factor * np.max(next_q_values[i]) - q_values[i, action_idx]
            )
        
        # Train the model on the batch
        model.train_on_batch(states, q_values)


def display_chess_board(board):
    return display(HTML(chess.svg.board(board=board, size=400)))

def play_game():
    board = chess.Board()
    game_states = []

    while not board.is_game_over():
        state = board.copy()
        game_states.append(state.copy())

        rl_move = choose_action(board)
        if rl_move in board.legal_moves:
            board.push(rl_move)
        else:
            print("Invalid move. Try again.")
            continue

        reward = 0

        if board.is_game_over():
            break

        stockfish.set_fen_position(board.fen())
        stockfish_move_uci = stockfish.get_best_move()
        stockfish_move = chess.Move.from_uci(stockfish_move_uci)
        next_state = board.copy()
        board.push(stockfish_move)

        if next_state.is_check():
            reward = 0.5

        if board.result() == "1-0":
            reward = 10000  # Win
        elif board.result() == "0-1":
            reward = -100000  # Loss
        elif board.result() == "1/2-1/2":
            reward = 100  # Draw

        # Capture rewards based on piece values
        if board.is_capture(rl_move):
            captured_piece_value = piece_value(board.piece_at(rl_move.to_square))
            reward += captured_piece_value

        if board.is_capture(stockfish_move):
            captured_piece_value = piece_value(board.piece_at(stockfish_move.to_square))
            reward -= captured_piece_value

        update_q_table(state, rl_move, reward, next_state)

    game_states.append(board.copy())
    return game_states, board.result()

def piece_value(piece):
    if piece is None:
        return 0
    elif piece.piece_type == chess.PAWN:
        return 1
    elif piece.piece_type == chess.KNIGHT:
        return 3
    elif piece.piece_type == chess.BISHOP:
        return 3
    elif piece.piece_type == chess.ROOK:
        return 5
    elif piece.piece_type == chess.QUEEN:
        return 9
    elif piece.piece_type == chess.KING:
        return 100

log_dir = "logs/"
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

num_games = 10000
for episode in range(num_games):
    print("Game:", episode,"nummber of moves:", len(game_states))
    total_reward = 0
    steps = 0
    game_states, result = play_game()

    with tf.summary.create_file_writer(log_dir).as_default():
        tf.summary.scalar('Total Reward', total_reward, step=episode)
        tf.summary.scalar('Steps', steps, step=episode)
        tf.summary.flush()

        for layer in model.layers:
            for weight in layer.weights:
                tf.summary.histogram(weight.name, weight, step=episode)

    if episode == num_games - 1:
        # Display the last game
        for state in game_states:
            display_chess_board(state)


Game: 0 nummber of moves: 8
Piece coordination reward: 0.95
Total moves before: 320
Total moves after: 320


  return hash(board_array.tostring()) % state_space_size[0]


Piece coordination reward: 0.54
Total moves before: 480
Total moves after: 352
Piece coordination reward: 0.7272727272727273
Total moves before: 528
Total moves after: 448
Piece coordination reward: 0.8294117647058823
Total moves before: 544
Total moves after: 496
Piece coordination reward: 1
Total moves before: 510
Total moves after: 555
Piece coordination reward: 1
Total moves before: 495
Total moves after: 540
Piece coordination reward: 1
Total moves before: 360
Total moves after: 585
Piece coordination reward: 1
Total moves before: 336
Total moves after: 602
Piece coordination reward: 1
Total moves before: 294
Total moves after: 588
Piece coordination reward: 1
Total moves before: 247
Total moves after: 507
Piece coordination reward: 1
Total moves before: 252
Total moves after: 540
Piece coordination reward: 1
Total moves before: 252
Total moves after: 600
Game: 1 nummber of moves: 13
Piece coordination reward: 0.95
Total moves before: 320
Total moves after: 320
Piece coordination 

KeyboardInterrupt: 