In [1]:
!pip install chess
import re
import chess
import pandas as pd
from tqdm.auto import tqdm



In [2]:
# Load the chess games CSV, reading only the columns used below:
# the three filter columns (Result, Elo ratings, Termination) and the
# move text (AN).
dataset = pd.read_csv(
    '/kaggle/input/chess-games/chess_games.csv',
    usecols=['Result', 'WhiteElo', 'BlackElo', 'Termination', 'AN'],
)

In [3]:
# Number of games for each Termination value (the next cell filters on this column)
dataset.groupby('Termination').size()

Termination
Abandoned             14630
Normal              4230089
Rules infraction        128
Time forfeit        2011336
Unterminated              1
dtype: int64

In [4]:
# Keep only the games whose Termination is 'Normal'; games that were abandoned,
# lost on time, ended by a rules infraction, or left unterminated are removed.
dataset.drop(
    index=dataset.loc[dataset['Termination'] != 'Normal'].index,
    inplace=True,
)
# Every remaining row now has the same Termination value, so drop the column.
dataset.drop(columns='Termination', inplace=True)
dataset.head()

Unnamed: 0,Result,WhiteElo,BlackElo,AN
1,0-1,1641,1627,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
3,1-0,1706,1317,1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...
5,0-1,1773,1809,1. e4 e5 2. Nc3 d6 3. Nf3 h6 4. Bc4 c6 5. b3 Q...
7,1-0,2155,2356,1. d4 d5 2. Nf3 Nf6 3. Bf4 c6 4. e3 Bg4 5. Be2...
8,0-1,2010,2111,1. d4 Nf6 2. Bf4 e6 3. e3 d5 4. Nf3 h6 5. Bd3 ...


In [5]:
# Remove games in which BOTH players are rated below 2000, i.e. keep every game
# where at least one player is rated 2000 or higher.
# NOTE(review): if the intent is that both players must be 2000+, the mask
# needs `|` instead of `&` - confirm.
dataset.drop(
    index=dataset.loc[(dataset['WhiteElo'] < 2000) & (dataset['BlackElo'] < 2000)].index,
    inplace=True,
)
# The ratings are only needed for this filter.
dataset.drop(columns=['WhiteElo', 'BlackElo'], inplace=True)
len(dataset)

922525

In [6]:
# Number of remaining games for each Result value
dataset.groupby('Result').size()

Result
0-1        417987
1-0        455205
1/2-1/2     49333
dtype: int64

In [7]:
# Exclude drawn games ('1/2-1/2') so that every remaining game has a winner.
dataset.drop(
    index=dataset.loc[dataset['Result'] == '1/2-1/2'].index,
    inplace=True,
)
# The Result column is dropped afterwards; the move-extraction loop further down
# takes the result from the last token of the AN string instead.
dataset.drop(columns=['Result'], inplace=True)
len(dataset)

873192

In [8]:
def filter_fn(x):
    """Strip annotations from a move string, leaving space-separated tokens.

    The substitutions are applied in order:
      1. delete every ``{...}`` comment, braces included;
      2. delete every run of digits followed by one or more dots
         (move numbers such as ``12.`` or ``12...``);
      3. delete all ``!`` and ``?`` characters;
      4. collapse each run of whitespace into a single space.
    Leading and trailing whitespace is removed from the result.

    Tokens without a dot after their digits (for example a trailing ``1-0``)
    are left untouched.

    Args:
        x: The move text to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; the order matters because comments must be
    # removed before the move-number pattern runs over the remaining text.
    substitutions = (
        (r'\{[^\}]*\}', ''),
        (r'\d+\.{1,}', ''),
        (r'[!?]+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        x = re.sub(pattern, replacement, x)

    return x.strip()

In [9]:
# Clean every game's move text with filter_fn (removes {...} comments,
# move numbers and !/? marks, and normalises whitespace)
dataset['AN'] = dataset['AN'].apply(filter_fn)

In [10]:
# Collect one sample per move played by the winner of each game:
#   'states' - FEN of the position before the move,
#   'moves'  - the move in UCI notation,
#   'player' - 0 when White made the move, 1 when Black did.
moves_db = {'states': [], 'moves': [], 'player': []}

# Ply parity of the winning side, keyed by the result token that ends each
# cleaned AN string: White plays the even plies (0, 2, ...), Black the odd ones.
winner_parity = {'1-0': 0, '0-1': 1}

# Iterate through each game in the dataset, using tqdm for progress visualization
for game in tqdm(dataset['AN']):
    # The last space-separated token is the result; everything before it is a SAN move
    tokens = game.split(' ')
    moves, result = tokens[:-1], tokens[-1]

    # None for any other result token, in which case no ply of the game is recorded
    winner = winner_parity.get(result)

    # Replay the game from the standard starting position
    board = chess.Board()

    for ply, san in enumerate(moves):
        # Parse the SAN exactly once. Doing this before anything is appended also
        # keeps the three lists the same length if a move turns out to be invalid
        # (parse_san raises in that case).
        move = board.parse_san(san)

        side = ply % 2
        if side == winner:
            # Record the position *before* the move and the move itself;
            # board.uci() is evaluated on that same pre-move position.
            moves_db['states'].append(board.fen())
            moves_db['moves'].append(board.uci(move))
            moves_db['player'].append(side)

        # Advance the board for the next ply
        board.push(move)

  0%|          | 0/873192 [00:00<?, ?it/s]

In [11]:
# Build the final table: one row per recorded move, with the columns
# 'states', 'moves' and 'player' taken from the moves_db lists.
processed_df = pd.DataFrame(moves_db)
# Use the notebook's rich display (provided by the IPython kernel) rather than
# print(), so the preview renders as a table; the row count is the cell's value.
display(processed_df.head())
len(processed_df)

                                              states moves  player
0  rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...  d2d4       0
1  rnbqkbnr/ppp1pppp/8/3p4/3P4/8/PPP1PPPP/RNBQKBN...  g1f3       0
2  rnbqkb1r/ppp1pppp/5n2/3p4/3P4/5N2/PPP1PPPP/RNB...  c1f4       0
3  rnbqkb1r/pp2pppp/2p2n2/3p4/3P1B2/5N2/PPP1PPPP/...  e2e3       0
4  rn1qkb1r/pp2pppp/2p2n2/3p4/3P1Bb1/4PN2/PPP2PPP...  f1e2       0


29051408

In [12]:
# Write the extracted samples to the Kaggle working directory;
# index=False keeps the DataFrame's row index out of the file.
processed_df.to_csv('/kaggle/working/chess_moves.csv', index=False)