In [1]:
import pandas as pd
import re

In [2]:
# Read the chess games CSV, loading only the columns listed here.
columns_needed = ['Result', 'WhiteElo', 'BlackElo', 'Termination', 'AN']
dataset = pd.read_csv('/kaggle/input/chess-games/chess_games.csv', usecols=columns_needed)

In [3]:
# Number of games for each distinct Termination value.
termination_counts = dataset.groupby('Termination').size()
termination_counts

Termination
Abandoned             14630
Normal              4230089
Rules infraction        128
Time forfeit        2011336
Unterminated              1
dtype: int64

In [4]:
# Keep only the games whose Termination is 'Normal'; rows with any other
# termination value (e.g. 'Time forfeit', 'Abandoned') are removed in place.
not_normal = dataset['Termination'] != 'Normal'
dataset.drop(index=dataset.index[not_normal.to_numpy()], inplace=True)
# The Termination column is no longer needed once the filter has been applied.
dataset.drop(columns='Termination', inplace=True)
dataset.head()

Unnamed: 0,Result,WhiteElo,BlackElo,AN
1,0-1,1641,1627,1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
3,1-0,1706,1317,1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...
5,0-1,1773,1809,1. e4 e5 2. Nc3 d6 3. Nf3 h6 4. Bc4 c6 5. b3 Q...
7,1-0,2155,2356,1. d4 d5 2. Nf3 Nf6 3. Bf4 c6 4. e3 Bg4 5. Be2...
8,0-1,2010,2111,1. d4 Nf6 2. Bf4 e6 3. e3 d5 4. Nf3 h6 5. Bd3 ...


In [5]:
# Remove every game in which either player's Elo is below 2000,
# then drop the two rating columns and report how many games remain.
low_rated = (dataset['WhiteElo'] < 2000) | (dataset['BlackElo'] < 2000)
dataset.drop(index=dataset.index[low_rated.to_numpy()], inplace=True)
dataset.drop(columns=['WhiteElo', 'BlackElo'], inplace=True)
len(dataset)

412921

In [6]:
# Number of games for each distinct Result value.
result_counts = dataset.groupby('Result').size()
result_counts

Result
0-1        183211
1-0        201621
1/2-1/2     28089
dtype: int64

In [7]:
# Remove the games recorded as '1/2-1/2' (no conclusive result), then drop
# the Result column and report how many games remain.
inconclusive = dataset['Result'] == '1/2-1/2'
dataset.drop(index=dataset.index[inconclusive.to_numpy()], inplace=True)
dataset.drop(columns=['Result'], inplace=True)
len(dataset)

384832

In [8]:
def filter_fn(x):
    """Clean a move string down to single-space-separated tokens.

    The substitutions run in this order:
      1. every ``{...}`` span is removed, braces included;
      2. every run of digits followed by one or more dots is removed;
      3. every ``!`` and ``?`` character is removed;
      4. every run of whitespace is collapsed to one space.
    Finally, leading and trailing whitespace is stripped.

    Args:
        x: the string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: brace spans go first so that digits, dots and
    # punctuation inside them never reach the later patterns.
    substitutions = (
        (r'\{[^\}]*\}', ''),
        (r'\d+\.{1,}', ''),
        (r'[!?]+', ''),
        (r'\s+', ' '),
    )

    cleaned = x
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [9]:
# Run filter_fn over every entry of the AN column, store the result back,
# then rename the column to 'Moves'.
cleaned_moves = dataset['AN'].apply(filter_fn)
dataset['AN'] = cleaned_moves
dataset.rename(columns={'AN': 'Moves'}, inplace=True)

In [None]:
# Write the filtered games to CSV, omitting the DataFrame index.
output_path = '/kaggle/working/filtered_chess_games.csv'
dataset.to_csv(output_path, index=False)