In [1]:
from stockfish import Stockfish
import chess

stockfish_path = "/Users/benitorusconi/Documents/CDS/05_HS23/Reinforcement Learning (cds-117)/engine/stockfish"
stockfish = Stockfish(path=stockfish_path)

def print_board(board):
    print(board)

def play_game():
    board = chess.Board()

    print("Chess game against Stockfish\n")

    while not board.is_game_over():
        print_board(board)

        # Player's move
        player_move = input("Your move (in algebraic notation): ")
        if chess.Move.from_uci(player_move) in board.legal_moves:
            board.push_uci(player_move)
        else:
            print("Invalid move. Try again.")
            continue

        if board.is_game_over():
            break

        # Stockfish's move
        stockfish.set_fen_position(board.fen())
        stockfish_move = stockfish.get_best_move()
        print("Stockfish's move:", stockfish_move)
        board.push_uci(stockfish_move)

    print("\nGame Over")
    print("Result:", board.result())

# Play the game
#play_game()


CNN Version

In [21]:
from collections import deque
import random
from stockfish import Stockfish
import chess
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.compat.v1.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from IPython.display import display, HTML
import chess.svg
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Concatenate
from tqdm import tqdm
from tensorflow.keras.models import Model



# Path to Stockfish engine
stockfish_path = "/Users/benitorusconi/Documents/CDS/05_HS23/Reinforcement Learning (cds-117)/engine/stockfish"
stockfish = Stockfish(path=stockfish_path)

#stockfish.set_skill_level(1)
# Hyperparameters
learning_rate = 0.01
discount_factor = 0.85
exploration_prob = 0.2

# Neural Network Architecture
state_space_size = (8, 8, 12)  # 8x8 board with 12 channels (one for each piece type and color)
action_space_size = 4096

# Initialize a deque for experience replay
experience_replay_buffer = deque(maxlen=10000)

# Neural Network Model alpha zero


# Input layer
input_layer = Input(shape=state_space_size)

# Contracting path
# Convolutional layers
conv1 = Conv2D(32, (3, 3), activation='selu', padding='same')(input_layer)
conv2 = Conv2D(64, (3, 3), activation='selu', padding='same')(conv1)


# Flatten layer
flatten_layer = Flatten()(conv2)

# Dense layers
dense1 = Dense(64, activation='selu')(flatten_layer)
dense2 = Dense(32, activation='selu')(dense1)


# Output layer
output_layer = Dense(action_space_size, activation='softmax')(dense2)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001), loss='mse')

# Display the model summary
model.summary()



def state_to_index(board):
    board_array = np.array(board_to_input_array(board))
    return hash(board_array.tostring()) % state_space_size[0]

def board_to_input_array(board):
    board_array = np.zeros((8, 8, 12), dtype=np.uint8)
    piece_mapping = {'r': 0, 'n': 1, 'b': 2, 'q': 3, 'k': 4, 'p': 5, 'R': 6, 'N': 7, 'B': 8, 'Q': 9, 'K': 10, 'P': 11}
    #normalize piece values

    
    for square, piece in board.piece_map().items():
        piece_type = piece_mapping[piece.symbol()]
        color = int(piece.color)
        board_array[square // 8, square % 8, piece_type] = color + 1  # Use 0 for empty squares

    return board_array

def choose_action(board):
    if np.random.rand() < exploration_prob:
        return np.random.choice(list(board.legal_moves))
    else:
        state_index = state_to_index(board)
        legal_moves_list = list(board.legal_moves)
        if not legal_moves_list:
            return chess.Move.null()
        q_values = model.predict(np.array([board_to_input_array(board)]))[0]
        best_move_index = np.argmax(q_values)
        best_move_uci = legal_moves_list[min(best_move_index, len(legal_moves_list)-1)].uci()
        return chess.Move.from_uci(best_move_uci)
    



def piece_coordination_reward(board, current_move):
    # Evaluate piece coordination as the average number of legal moves for all pieces before and after the move
    total_moves_before = 0
    total_moves_after = 0
    total_pieces = 0

    for square, piece in board.piece_map().items():
        if piece.color == board.turn:
            legal_moves_before = list(board.legal_moves)
            total_moves_before += len(legal_moves_before)

            # Make a hypothetical move and evaluate legal moves after the move
            board_copy = board.copy()
            board_copy.push(current_move)
            legal_moves_after = list(board_copy.legal_moves)
            total_moves_after += len(legal_moves_after)

            total_pieces += 1

    # Calculate the change in total moves
    moves_change = total_moves_after - total_moves_before

    # Normalize the reward to be between -1 and 1
    max_moves = max(total_moves_before / total_pieces, 1)
    normalized_reward = (total_moves_before / total_pieces - 1) / max_moves

    # Adjust the reward based on the change in total moves
    reward_adjustment = 0.1
    normalized_reward += reward_adjustment * moves_change / max_moves

    # Scale the reward between -1 and 1
    normalized_reward = max(min(normalized_reward, 1), -1)

    #print("Piece coordination reward:", normalized_reward)
    #print("Total moves before:", total_moves_before)
    #print("Total moves after:", total_moves_after)

    return normalized_reward

def normalize_input(board):
    board_array = np.array(board_to_input_array(board), dtype=np.float16)
    board_array /= 12.0  # Assuming the maximum piece type value is 12
    return board_array



def update_q_table(state, action, reward, next_state):
    state_index = state_to_index(state)
    next_state_index = state_to_index(next_state)
    action_index = list(state.legal_moves).index(action)
    
    # Calculate the additional rewards
    piece_coordination_reward_value = piece_coordination_reward(state, action)

    # Combine the rewards with weights (you can adjust the weights as needed)
    #total_reward = reward + 0.01 * piece_coordination_reward_value
    total_reward = reward

    # Store the experience in the replay buffer
    experience_replay_buffer.append((state_index, action_index, total_reward, next_state_index))

    # Sample a batch from the replay buffer for training
    batch_size = min(len(experience_replay_buffer), 8)
    if batch_size > 0:
        batch = np.array(random.sample(experience_replay_buffer, batch_size))
        states = np.array([board_to_input_array(chess.Board(fen=chess.STARTING_FEN)) for _ in batch[:, 0]])
        next_states = np.array([board_to_input_array(chess.Board(fen=chess.STARTING_FEN)) for _ in batch[:, 3]])
        q_values = model.predict(states)
        next_q_values = model.predict(next_states)
        
        for i in range(batch_size):
            action_idx = int(batch[i, 1])  # Cast to integer
            q_values[i, action_idx] += learning_rate * (
                batch[i, 2] + discount_factor * np.max(next_q_values[i]) - q_values[i, action_idx]
            )
        
        # Train the model on the batch
        model.train_on_batch(states, q_values)


def display_chess_board(board):
    return display(HTML(chess.svg.board(board=board, size=400)))

def play_game():
    
    
    board = chess.Board()
    game_states = []
    total_reward = 0  # Initialize total_reward

    while not board.is_game_over():
        state = board.copy()
        game_states.append(state.copy())

        rl_move = choose_action(board)
        if rl_move in board.legal_moves:
            board.push(rl_move)
        else:
            print("Invalid move. Try again.")
            continue

        reward = 0

        if board.is_game_over():
            break

        stockfish.set_fen_position(board.fen())
        stockfish_move_uci = stockfish.get_best_move()
        stockfish_move = chess.Move.from_uci(stockfish_move_uci)
        next_state = board.copy()
        board.push(stockfish_move)

        if next_state.is_check():
            reward = 0.1

        if board.result() == "1-0":
            reward = 1  # Win
        elif board.result() == "0-1":
            move_number = len(game_states)
            reward -=(1+100/move_number)  # Loss 
        elif board.result() == "1/2-1/2":
            reward += 0.01  # Draw

        # Capture rewards based on piece values
        if board.is_capture(rl_move):
            captured_piece_value = piece_value(board.piece_at(rl_move.to_square))
            reward += captured_piece_value

        if board.is_capture(stockfish_move):
            captured_piece_value = piece_value(board.piece_at(stockfish_move.to_square))
            reward -= captured_piece_value

        # Calculate the change in the number of legal moves
        legal_moves_before = len(list(board.legal_moves))
        legal_moves_after = len(list(next_state.legal_moves))
        moves_change = legal_moves_after - legal_moves_before

        # Give a small reward for more possible moves in the future
        future_moves_reward = 0.05
        reward += future_moves_reward
        
        if rl_move.uci() == stockfish_move.uci():
            move_match_reward = 0.25
            reward += move_match_reward


        update_q_table(state, rl_move, reward, next_state)

        # Accumulate the reward
        total_reward += reward



    game_states.append(board.copy())
    return game_states, board.result(), total_reward  # Return total_reward




def piece_value(piece):
    if piece is None:
        return 0
    elif piece.piece_type == chess.PAWN:
        return 0.1
    elif piece.piece_type == chess.KNIGHT:
        return 0.3
    elif piece.piece_type == chess.BISHOP:
        return 0.3
    elif piece.piece_type == chess.ROOK:
        return 0.5
    elif piece.piece_type == chess.QUEEN:
        return 0.9
    elif piece.piece_type == chess.KING:
        return 0






log_dir = "logs/"
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)








num_games = 1000
total_rewards = []
results = {"white_wins": 0, "black_wins": 0, "draws": 0}
outcomes = []


reward_trend = []

for episode in tqdm(range(num_games), desc="Training"):
    print("Game:", episode)
    game_states, result, total_reward = play_game()

    # Append the total reward to the reward trend list
    reward_trend.append(total_reward)

    # Update results based on the game outcome
    if result == "1-0":
        results["white_wins"] += 1
        outcomes.append(1)
    elif result == "0-1":
        results["black_wins"] += 1
        outcomes.append(0)
    elif result == "1/2-1/2":
        results["draws"] += 1
        outcomes.append(0.5)  # Fix here: Append 0.5 for draws

    total_rewards.append(total_reward)
    
    # Display the total reward for each game
    print("Total Reward for Game {}: {}".format(episode, total_reward))
    print("Game Outcome:", result)
    #game length
    print("Game Length:", len(game_states))

# Display statistics
average_reward = sum(total_rewards) / num_games
print("Average Total Reward:", average_reward)

# Extract the FEN of the final position
final_position_fen = game_states[-1].fen()
print("Final Position FEN:", final_position_fen)

# Display the last game
for state in game_states:
    display_chess_board(state)

# Display results
print("\nResults Summary:")
print("White Wins:", results["white_wins"])
print("Black Wins:", results["black_wins"])
print("Draws:", results["draws"])

# Plot trend lines
plt.plot(outcomes, label="Game Outcomes")
plt.xlabel("Games")
plt.ylabel("Outcome (1 for White Win, 0 for Draw, 0.5 for Loss)")
plt.legend()
plt.title("Game Outcomes Trend")

# Plot rewards
plt.figure()
plt.plot(reward_trend, label="Rewards")
plt.xlabel("Games")
plt.ylabel("Total Reward")
plt.legend()
plt.title("Reward Trend")

plt.show()











Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 8, 8, 12)]        0         
                                                                 
 conv2d_36 (Conv2D)          (None, 8, 8, 32)          3488      
                                                                 
 conv2d_37 (Conv2D)          (None, 8, 8, 64)          18496     
                                                                 
 flatten_18 (Flatten)        (None, 4096)              0         
                                                                 
 dense_54 (Dense)            (None, 64)                262208    
                                                                 
 dense_55 (Dense)            (None, 32)                2080      
                                                                 
 dense_56 (Dense)            (None, 4096)              135

Training:   0%|          | 0/1000 [00:00<?, ?it/s]

Game: 0


  return hash(board_array.tostring()) % state_space_size[0]




Training:   0%|          | 1/1000 [00:01<32:56,  1.98s/it]

Total Reward for Game 0: -6.1
Game Outcome: 0-1
Game Length: 15
Game: 1


Training:   0%|          | 2/1000 [00:03<28:32,  1.72s/it]

Total Reward for Game 1: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 2


Training:   0%|          | 3/1000 [00:04<25:54,  1.56s/it]

Total Reward for Game 2: -3.55
Game Outcome: 0-1
Game Length: 10
Game: 3


Training:   0%|          | 4/1000 [00:06<23:33,  1.42s/it]

Total Reward for Game 3: -2.0999999999999996
Game Outcome: 0-1
Game Length: 9
Game: 4


Training:   0%|          | 5/1000 [00:07<23:54,  1.44s/it]

Total Reward for Game 4: -3.0999999999999996
Game Outcome: 0-1
Game Length: 11
Game: 5


Training:   1%|          | 6/1000 [00:09<24:42,  1.49s/it]

Total Reward for Game 5: -4.699999999999999
Game Outcome: 0-1
Game Length: 15
Game: 6


Training:   1%|          | 7/1000 [00:10<24:45,  1.50s/it]

Total Reward for Game 6: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 7


Training:   1%|          | 8/1000 [00:12<27:01,  1.63s/it]

Total Reward for Game 7: -7.949999999999999
Game Outcome: 0-1
Game Length: 16
Game: 8


Training:   1%|          | 9/1000 [00:14<26:14,  1.59s/it]

Total Reward for Game 8: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 9


Training:   1%|          | 10/1000 [00:15<25:09,  1.53s/it]

Total Reward for Game 9: -4.6
Game Outcome: 0-1
Game Length: 11
Game: 10


Training:   1%|          | 11/1000 [00:17<25:52,  1.57s/it]

Total Reward for Game 10: -4.05
Game Outcome: 0-1
Game Length: 16
Game: 11


Training:   1%|          | 12/1000 [00:18<25:20,  1.54s/it]

Total Reward for Game 11: -3.55
Game Outcome: 0-1
Game Length: 10
Game: 12


Training:   1%|▏         | 13/1000 [00:20<26:42,  1.62s/it]

Total Reward for Game 12: -5.0
Game Outcome: 0-1
Game Length: 15
Game: 13


Training:   1%|▏         | 14/1000 [00:22<27:47,  1.69s/it]

Total Reward for Game 13: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 14


Training:   2%|▏         | 15/1000 [00:24<31:19,  1.91s/it]

Total Reward for Game 14: -6.449999999999999
Game Outcome: 0-1
Game Length: 16
Game: 15


Training:   2%|▏         | 16/1000 [00:26<29:35,  1.80s/it]

Total Reward for Game 15: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 16


Training:   2%|▏         | 17/1000 [00:28<31:53,  1.95s/it]

Total Reward for Game 16: -4.699999999999999
Game Outcome: 0-1
Game Length: 17
Game: 17


Training:   2%|▏         | 18/1000 [00:29<25:28,  1.56s/it]

Total Reward for Game 17: -2.1999999999999997
Game Outcome: 0-1
Game Length: 5
Game: 18


Training:   2%|▏         | 19/1000 [00:30<24:49,  1.52s/it]

Total Reward for Game 18: -4.95
Game Outcome: 0-1
Game Length: 14
Game: 19


Training:   2%|▏         | 20/1000 [00:32<25:09,  1.54s/it]

Total Reward for Game 19: -3.3999999999999995
Game Outcome: 0-1
Game Length: 11
Game: 20


Training:   2%|▏         | 21/1000 [00:33<26:19,  1.61s/it]

Total Reward for Game 20: -6.8
Game Outcome: 0-1
Game Length: 15
Game: 21


Training:   2%|▏         | 22/1000 [00:36<28:30,  1.75s/it]

Total Reward for Game 21: -7.499999999999999
Game Outcome: 0-1
Game Length: 17
Game: 22


Training:   2%|▏         | 23/1000 [00:37<27:23,  1.68s/it]

Total Reward for Game 22: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 23


Training:   2%|▏         | 24/1000 [00:39<29:52,  1.84s/it]

Total Reward for Game 23: -4.75
Game Outcome: 0-1
Game Length: 18
Game: 24


Training:   2%|▎         | 25/1000 [00:41<27:49,  1.71s/it]

Total Reward for Game 24: -3.8499999999999996
Game Outcome: 0-1
Game Length: 12
Game: 25


Training:   3%|▎         | 26/1000 [00:43<28:21,  1.75s/it]

Total Reward for Game 25: -7.849999999999999
Game Outcome: 0-1
Game Length: 16
Game: 26


Training:   3%|▎         | 27/1000 [00:44<29:18,  1.81s/it]

Total Reward for Game 26: -4.75
Game Outcome: 0-1
Game Length: 18
Game: 27


Training:   3%|▎         | 28/1000 [00:46<27:07,  1.67s/it]

Total Reward for Game 27: -4.95
Game Outcome: 0-1
Game Length: 14
Game: 28


Training:   3%|▎         | 29/1000 [00:48<27:13,  1.68s/it]

Total Reward for Game 28: -4.45
Game Outcome: 0-1
Game Length: 14
Game: 29


Training:   3%|▎         | 30/1000 [00:49<24:29,  1.52s/it]

Total Reward for Game 29: -3.3
Game Outcome: 0-1
Game Length: 11
Game: 30


Training:   3%|▎         | 31/1000 [00:50<25:45,  1.60s/it]

Total Reward for Game 30: -5.3999999999999995
Game Outcome: 0-1
Game Length: 13
Game: 31


Training:   3%|▎         | 32/1000 [00:52<25:02,  1.55s/it]

Total Reward for Game 31: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 32


Training:   3%|▎         | 33/1000 [00:53<24:00,  1.49s/it]

Total Reward for Game 32: -3.8499999999999996
Game Outcome: 0-1
Game Length: 12
Game: 33


Training:   3%|▎         | 34/1000 [00:55<23:21,  1.45s/it]

Total Reward for Game 33: -2.25
Game Outcome: 0-1
Game Length: 10
Game: 34


Training:   4%|▎         | 35/1000 [00:56<22:47,  1.42s/it]

Total Reward for Game 34: -3.8499999999999996
Game Outcome: 0-1
Game Length: 12
Game: 35


Training:   4%|▎         | 36/1000 [00:57<23:05,  1.44s/it]

Total Reward for Game 35: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 36


Training:   4%|▎         | 37/1000 [00:59<21:57,  1.37s/it]

Total Reward for Game 36: -3.3499999999999996
Game Outcome: 0-1
Game Length: 10
Game: 37


Training:   4%|▍         | 38/1000 [01:00<21:41,  1.35s/it]

Total Reward for Game 37: -3.8499999999999996
Game Outcome: 0-1
Game Length: 12
Game: 38


Training:   4%|▍         | 39/1000 [01:01<20:34,  1.29s/it]

Total Reward for Game 38: -2.3
Game Outcome: 0-1
Game Length: 9
Game: 39


Training:   4%|▍         | 40/1000 [01:03<25:56,  1.62s/it]

Total Reward for Game 39: -3.8499999999999996
Game Outcome: 0-1
Game Length: 20
Game: 40


Training:   4%|▍         | 41/1000 [01:05<26:59,  1.69s/it]

Total Reward for Game 40: -2.95
Game Outcome: 0-1
Game Length: 14
Game: 41


Training:   4%|▍         | 42/1000 [01:07<25:59,  1.63s/it]

Total Reward for Game 41: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 42


Training:   4%|▍         | 43/1000 [01:08<25:25,  1.59s/it]

Total Reward for Game 42: -3.85
Game Outcome: 0-1
Game Length: 14
Game: 43


Training:   4%|▍         | 44/1000 [01:10<26:41,  1.68s/it]

Total Reward for Game 43: -4.699999999999999
Game Outcome: 0-1
Game Length: 15
Game: 44


Training:   4%|▍         | 45/1000 [01:12<27:56,  1.76s/it]

Total Reward for Game 44: -4.3
Game Outcome: 0-1
Game Length: 15
Game: 45


Training:   5%|▍         | 46/1000 [01:14<28:51,  1.81s/it]

Total Reward for Game 45: -6.4
Game Outcome: 0-1
Game Length: 17
Game: 46


Training:   5%|▍         | 47/1000 [01:15<26:12,  1.65s/it]

Total Reward for Game 46: -3.15
Game Outcome: 0-1
Game Length: 10
Game: 47


Training:   5%|▍         | 48/1000 [01:18<32:05,  2.02s/it]

Total Reward for Game 47: -3.1
Game Outcome: 0-1
Game Length: 15
Game: 48


Training:   5%|▍         | 49/1000 [01:20<28:41,  1.81s/it]

Total Reward for Game 48: -2.55
Game Outcome: 0-1
Game Length: 10
Game: 49


Training:   5%|▌         | 50/1000 [01:22<30:14,  1.91s/it]

Total Reward for Game 49: -8.35
Game Outcome: 0-1
Game Length: 20
Game: 50


Training:   5%|▌         | 51/1000 [01:23<28:19,  1.79s/it]

Total Reward for Game 50: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 51


Training:   5%|▌         | 52/1000 [01:25<26:54,  1.70s/it]

Total Reward for Game 51: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 52


Training:   5%|▌         | 53/1000 [01:27<28:06,  1.78s/it]

Total Reward for Game 52: -4.2
Game Outcome: 0-1
Game Length: 17
Game: 53


Training:   5%|▌         | 54/1000 [01:28<26:41,  1.69s/it]

Total Reward for Game 53: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 54


Training:   6%|▌         | 55/1000 [01:31<32:41,  2.08s/it]

Total Reward for Game 54: -7.299999999999999
Game Outcome: 0-1
Game Length: 23
Game: 55


Training:   6%|▌         | 56/1000 [01:33<30:01,  1.91s/it]

Total Reward for Game 55: -4.6499999999999995
Game Outcome: 0-1
Game Length: 14
Game: 56


Training:   6%|▌         | 56/1000 [01:34<26:25,  1.68s/it]


KeyboardInterrupt: 