In [1]:
import os
import time

import chess
import chess.pgn
import pandas as pd

In [2]:
# Location of the Lichess PGN dump to parse. Set the LICHESS_PGN_PATH
# environment variable to point at another copy; the original local path is
# kept as the default so existing runs behave exactly as before.
database_filename = os.environ.get(
    'LICHESS_PGN_PATH',
    'C:/Users/alexpc2red/Documents/lichessgames/lichess_db_standard_rated_2016-03.pgn',
)

In [3]:
# Board coordinates as single-character strings, in a..h / 1..8 order.
files = list('abcdefgh')
ranks = [str(rank_number) for rank_number in range(1, 9)]

# File groups used to bucket pawn moves: a-c, d-e and f-h.
left_wing_files = files[:3]
center_files = files[3:5]
right_wing_files = files[5:]

In [4]:
def get_new_game_data():
    """Return a fresh feature record for one game, every field at its default.

    A new dict is built on each call. Key insertion order is significant: it
    becomes the column order when the records are turned into a DataFrame.
    Counters start at 0, 'time_control' at None, and 'eco' and the two
    '*_castles' fields at the placeholder 'x'.
    """
    sides = ('white', 'black')

    # Game-level metadata.
    record = {'total_moves': 0, 'time_control': None, 'white_win': 0}
    for side in sides:
        record[side + '_elo'] = 0
    record['eco'] = 'x'
    for side in sides:
        record[side + '_castles'] = 'x'

    # Per-side move counters (all of white's, then all of black's).
    move_kinds = ('left_wing_pawn', 'center_pawn', 'right_wing_pawn',
                  'knight', 'bishop', 'rook', 'queen', 'king')
    for side in sides:
        for kind in move_kinds:
            record[f'{side}_total_{kind}_moves'] = 0

    # Pawn-structure counters (white/black interleaved per feature).
    for stem in ('doubled_pawn_files', 'no_pawn_files', 'total_isolated_pawn_files'):
        for side in sides:
            record[f'{side}_{stem}'] = 0

    # Material counters (all of white's, then all of black's).
    for side in sides:
        for piece_plural in ('pawns', 'knights', 'bishops', 'rooks', 'queens'):
            record[f'{side}_total_{piece_plural}'] = 0

    # Activity / safety counters (white/black interleaved per feature).
    for stem in ('total_attacked_squares', 'total_attacked_pieces',
                 'total_defended_pieces', 'furthest_pawn', 'total_king_blockers'):
        for side in sides:
            record[f'{side}_{stem}'] = 0

    # Hanging material (both of white's fields, then both of black's).
    for side in sides:
        for stem in ('total_hanging_pawns', 'total_hanging_pieces'):
            record[f'{side}_{stem}'] = 0

    return record

In [5]:
def get_time_control(initial_seconds, increment):
    """Classify a clock setting into a Lichess-style speed category.

    The estimated game duration is ``initial_seconds + 40*increment``.
    Categories follow Lichess's published limits, which are inclusive:
    <= 29 s is 'ultrabullet', <= 179 s 'bullet', <= 479 s 'blitz',
    <= 1499 s 'rapid', and anything longer is 'classical'.

    The earlier version compared with ``<`` against those same limits, so a
    duration of exactly 29, 179, 479 or 1499 seconds fell into the next
    slower category.

    Args:
        initial_seconds: starting clock time in seconds.
        increment: seconds added per move.

    Returns:
        One of 'ultrabullet', 'bullet', 'blitz', 'rapid', 'classical'.
    """
    estimated_game_duration = initial_seconds + 40*increment
    if estimated_game_duration <= 29:
        return 'ultrabullet'
    if estimated_game_duration <= 179:
        return 'bullet'
    if estimated_game_duration <= 479:
        return 'blitz'
    if estimated_game_duration <= 1499:
        return 'rapid'
    return 'classical'

In [6]:
def parse_game(game,moves,game_data):
    """Fill ``game_data`` in place with features extracted from ``game``.

    The first ``2*moves`` plies of the main line are replayed. While replaying,
    castling and per-piece move counts are recorded; the position reached is
    then scanned for material, pawn-structure, attack/defence and hanging-piece
    counts. Finally, game-level fields are read from the PGN headers.

    Args:
        game: a parsed PGN game (as returned by ``chess.pgn.read_game``).
        moves: number of full moves to replay (``2*moves`` plies).
        game_data: feature dict to update; expected to hold the keys created
            by ``get_new_game_data``.

    Returns:
        None. ``game_data`` is mutated.

    Raises:
        TypeError / ValueError: if the 'WhiteElo' or 'BlackElo' header is
            missing or not an integer (unchanged from the original behaviour).

    Notes:
        * Castling is detected with ``board.is_castling`` instead of comparing
          UCI strings, so a rook or queen moving e1->g1 / e1->c1 / e8->g8 /
          e8->c8 is counted as an ordinary move and no longer recorded as
          castling. A castling move is not counted as a king move.
        * ``*_total_king_blockers`` counts, per side, the pieces standing on a
          square attacked by that side's own king.
        * ``*_furthest_pawn`` is the highest rank holding a white pawn and the
          lowest rank holding a black pawn, or None when the side has no pawns.
        * 'time_control' is only set when the 'TimeControl' header has the
          'initial+increment' form; otherwise it is left as it was.
    """
    # Materialise the main line once; it is needed for replay and for the count.
    mainline = list(game.mainline_moves())

    # Start from the game's own initial position so a FEN header is honoured.
    board = game.board()

    piece_move_keys = {
        chess.KNIGHT: '_total_knight_moves',
        chess.BISHOP: '_total_bishop_moves',
        chess.ROOK: '_total_rook_moves',
        chess.QUEEN: '_total_queen_moves',
        chess.KING: '_total_king_moves',
    }
    piece_count_keys = {
        chess.KNIGHT: '_total_knights',
        chess.BISHOP: '_total_bishops',
        chess.ROOK: '_total_rooks',
        chess.QUEEN: '_total_queens',
    }

    # ---- Replay the opening and count moves by piece type -----------------
    for move in mainline[:int(2*moves)]:
        from_square = move.from_square
        piece = board.piece_at(from_square)
        color = 'black' if piece.color == chess.BLACK else 'white'
        piece_type = piece.piece_type
        file = chess.square_name(from_square)[0]

        # Must be queried before the move is pushed onto the board.
        if board.is_castling(move):
            if board.is_kingside_castling(move):
                game_data[color + '_castles'] = 'kingside'
            else:
                game_data[color + '_castles'] = 'queenside'
        elif piece_type == chess.PAWN:
            # Pawn moves are bucketed by the file the pawn starts from.
            if file in left_wing_files:
                game_data[color + '_total_left_wing_pawn_moves'] += 1
            elif file in center_files:
                game_data[color + '_total_center_pawn_moves'] += 1
            elif file in right_wing_files:
                game_data[color + '_total_right_wing_pawn_moves'] += 1
        else:
            game_data[color + piece_move_keys[piece_type]] += 1

        board.push(move)

    # ---- Scan the resulting position, file by file ------------------------
    pawn_files = {'white': set(), 'black': set()}
    pawn_ranks = {'white': set(), 'black': set()}

    for file in files:
        pawns_on_file = {'white': 0, 'black': 0}
        for rank in ranks:
            square = chess.parse_square(file + rank)
            piece = board.piece_at(square)
            if piece is not None:
                piece_type = piece.piece_type
                if piece.color == chess.BLACK:
                    color, opponent_color = 'black', 'white'
                else:
                    color, opponent_color = 'white', 'black'

                # Material and pawn bookkeeping.
                if piece_type == chess.PAWN:
                    pawns_on_file[color] += 1
                    game_data[color + '_total_pawns'] += 1
                    pawn_files[color].add(file)
                    pawn_ranks[color].add(int(rank))
                elif piece_type in piece_count_keys:
                    game_data[color + piece_count_keys[piece_type]] += 1

                # Defended / attacked / hanging (kings are excluded).
                if piece_type != chess.KING:
                    defended = board.is_attacked_by(piece.color, square)
                    if defended:
                        game_data[color + '_total_defended_pieces'] += 1
                    if board.is_attacked_by(not piece.color, square):
                        # Credited to the attacking side.
                        game_data[opponent_color + '_total_attacked_pieces'] += 1
                        if not defended:
                            if piece_type == chess.PAWN:
                                game_data[color + '_total_hanging_pawns'] += 1
                            else:
                                game_data[color + '_total_hanging_pieces'] += 1

                # A piece whose square is attacked by its own king counts as
                # a king blocker for that side.
                for defender_sq in board.attackers(piece.color, square):
                    defender_piece = board.piece_at(defender_sq)
                    if defender_piece is not None and defender_piece.piece_type == chess.KING:
                        game_data[color + '_total_king_blockers'] += 1

            # Square control is counted for every square, occupied or not.
            if board.is_attacked_by(chess.WHITE, square):
                game_data['white_total_attacked_squares'] += 1
            if board.is_attacked_by(chess.BLACK, square):
                game_data['black_total_attacked_squares'] += 1

        for color in ('white', 'black'):
            if pawns_on_file[color] > 1:
                game_data[color + '_doubled_pawn_files'] += 1
            if pawns_on_file[color] == 0:
                game_data[color + '_no_pawn_files'] += 1

    # ---- Isolated pawn files and furthest pawn ----------------------------
    last_file_index = len(files) - 1
    for color in ('white', 'black'):
        occupied = pawn_files[color]
        for idx, file in enumerate(files):
            if file not in occupied:
                continue
            # Edge files (a and h) only have one neighbouring file.
            has_left_neighbour = idx > 0 and files[idx - 1] in occupied
            has_right_neighbour = idx < last_file_index and files[idx + 1] in occupied
            if not has_left_neighbour and not has_right_neighbour:
                game_data[color + '_total_isolated_pawn_files'] += 1

    game_data['white_furthest_pawn'] = max(pawn_ranks['white']) if pawn_ranks['white'] else None
    game_data['black_furthest_pawn'] = min(pawn_ranks['black']) if pawn_ranks['black'] else None

    # ---- Game-level fields from the PGN headers ---------------------------
    game_data['total_moves'] = len(mainline)
    game_data['white_win'] = int(game.headers.get('Result') == '1-0')
    game_data['white_elo'] = int(game.headers.get('WhiteElo'))
    game_data['black_elo'] = int(game.headers.get('BlackElo'))
    game_data['eco'] = game.headers.get('ECO')

    time_control = game.headers.get('TimeControl')
    # The header may be absent (None) or lack the 'initial+increment' form.
    if time_control and '+' in time_control:
        time_control_split = time_control.split('+')
        initial_seconds = int(time_control_split[0])
        increment = int(time_control_split[1])
        game_data['time_control'] = get_time_control(initial_seconds,increment)

In [7]:
# Stream the PGN dump, keep decisive games that are long enough, and write
# one feature row per kept game to ./parsed_games.csv.
start = time.time()
MOVES = 20  # full moves replayed by parse_game before the position is scanned

# Records are appended as games qualify. The earlier version preallocated
# 4,000,000 dicts up front, which wasted memory and would have raised
# IndexError once more games than that qualified.
game_datas_real = []
i = 0  # games read from the file
j = 0  # games stored

# python-chess recommends opening PGN files with an explicit encoding; the
# context manager closes the handle even if parsing raises.
with open(database_filename, encoding='utf-8') as pgn:
    while True:
        if i % 10_000 == 0:
            print(f'Game: {i}, total stored: {j}, total time: {time.time()-start}',flush=True)

        game = chess.pgn.read_game(pgn)
        if game is None:
            break

        # read_game logs illegal/ambiguous moves and records them in
        # game.errors; such games have a truncated main line, so skip them.
        if not game.errors:
            total_moves = sum(1 for _ in game.mainline_moves())
            decisive_result = game.headers.get('Result') in ('1-0', '0-1')
            # Require at least 5 full moves beyond the replayed opening.
            if total_moves >= 2*(MOVES + 5) and decisive_result:
                game_data = get_new_game_data()
                parse_game(game,MOVES,game_data)
                game_datas_real.append(game_data)
                j += 1
        i += 1

game_data_df = pd.DataFrame(game_datas_real)
game_data_df.to_csv('./parsed_games.csv',index=False)

print(len(game_data_df))

Game: 0, total stored: 0, total time: 9.88084888458252
Game: 10000, total stored: 6747, total time: 28.55450129508972
Game: 20000, total stored: 13541, total time: 47.72839331626892
Game: 30000, total stored: 20245, total time: 66.46971607208252
Game: 40000, total stored: 26833, total time: 84.90008330345154
Game: 50000, total stored: 33429, total time: 103.77382373809814
Game: 60000, total stored: 40034, total time: 122.21271324157715
Game: 70000, total stored: 46658, total time: 140.5443754196167
Game: 80000, total stored: 53283, total time: 158.9488217830658
Game: 90000, total stored: 59816, total time: 177.3215663433075
Game: 100000, total stored: 66424, total time: 196.66622924804688
Game: 110000, total stored: 72940, total time: 215.69544339179993
Game: 120000, total stored: 79516, total time: 234.71521043777466
Game: 130000, total stored: 86141, total time: 253.93635535240173
Game: 140000, total stored: 92665, total time: 272.0328607559204
Game: 150000, total stored: 99091, tota

Game: 1240000, total stored: 818738, total time: 2269.423455476761
Game: 1250000, total stored: 825395, total time: 2287.4788706302643
Game: 1260000, total stored: 831994, total time: 2305.55228638649
Game: 1270000, total stored: 838694, total time: 2323.93270945549
Game: 1280000, total stored: 845332, total time: 2342.103127002716
Game: 1290000, total stored: 851932, total time: 2360.288545370102
Game: 1300000, total stored: 858532, total time: 2378.364961385727
Game: 1310000, total stored: 865139, total time: 2396.3793754577637
Game: 1320000, total stored: 871758, total time: 2414.823799610138
Game: 1330000, total stored: 878444, total time: 2433.143221139908
Game: 1340000, total stored: 884978, total time: 2451.300639152527
Game: 1350000, total stored: 891562, total time: 2469.4790568351746
Game: 1360000, total stored: 898237, total time: 2487.9074807167053
Game: 1370000, total stored: 904835, total time: 2506.232902288437
Game: 1380000, total stored: 911443, total time: 2524.480321

Game: 2450000, total stored: 1619042, total time: 4471.9137370586395
Game: 2460000, total stored: 1625680, total time: 4490.088042497635
Game: 2470000, total stored: 1632346, total time: 4508.3573496341705
Game: 2480000, total stored: 1638985, total time: 4526.7026579380035
Game: 2490000, total stored: 1645562, total time: 4545.120966911316
Game: 2500000, total stored: 1652245, total time: 4563.465275526047
Game: 2510000, total stored: 1658779, total time: 4581.650580883026
Game: 2520000, total stored: 1665367, total time: 4599.880887031555
Game: 2530000, total stored: 1671988, total time: 4618.272196769714
Game: 2540000, total stored: 1678707, total time: 4636.6135041713715
Game: 2550000, total stored: 1685458, total time: 4654.956815004349
Game: 2560000, total stored: 1692110, total time: 4673.296120405197
Game: 2570000, total stored: 1698722, total time: 4691.634428501129
Game: 2580000, total stored: 1705378, total time: 4709.824733972549
Game: 2590000, total stored: 1712014, total 

Game: 3660000, total stored: 2413244, total time: 6672.986257314682
Game: 3670000, total stored: 2419941, total time: 6691.249563932419
Game: 3680000, total stored: 2426561, total time: 6709.339868307114
Game: 3690000, total stored: 2433158, total time: 6727.6071746349335
Game: 3700000, total stored: 2439712, total time: 6745.715478897095
Game: 3710000, total stored: 2446204, total time: 6763.771782398224
Game: 3720000, total stored: 2452805, total time: 6782.1460909843445
Game: 3730000, total stored: 2459392, total time: 6800.351397275925
Game: 3740000, total stored: 2465961, total time: 6818.6647045612335
Game: 3750000, total stored: 2472611, total time: 6837.006012439728
Game: 3760000, total stored: 2479178, total time: 6855.287319660187
Game: 3770000, total stored: 2485717, total time: 6873.673628807068
Game: 3780000, total stored: 2492383, total time: 6892.004937410355
Game: 3790000, total stored: 2498987, total time: 6910.3152441978455
Game: 3800000, total stored: 2505530, total 

error during pgn parsing
Traceback (most recent call last):
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\__init__.py", line 2999, in parse_san
    return next(move for move in self.generate_castling_moves() if self.is_queenside_castling(move))
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\pgn.py", line 1685, in read_game
    move = visitor.parse_san(board_stack[-1], token)
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\pgn.py", line 1059, in parse_san
    return board.parse_san(san)
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\__init__.py", line 3001, in parse_san
    raise IllegalMoveError(f"illegal san: {san!r} in {self.fen()}")
chess.IllegalMoveError: illegal san: 'O-O-O' in r1r1kbn1/

Game: 4180000, total stored: 2756487, total time: 7627.852614879608


error during pgn parsing
Traceback (most recent call last):
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\__init__.py", line 2997, in parse_san
    return next(move for move in self.generate_castling_moves() if self.is_kingside_castling(move))
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\pgn.py", line 1685, in read_game
    move = visitor.parse_san(board_stack[-1], token)
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\pgn.py", line 1059, in parse_san
    return board.parse_san(san)
  File "C:\Users\alexpc2red\AppData\Local\Programs\Python\Python310\lib\site-packages\chess\__init__.py", line 3001, in parse_san
    raise IllegalMoveError(f"illegal san: {san!r} in {self.fen()}")
chess.IllegalMoveError: illegal san: 'O-O' in 2bqk1rr/1ppp

Game: 4190000, total stored: 2763091, total time: 7646.423010587692
Game: 4200000, total stored: 2769650, total time: 7664.871550321579
Game: 4210000, total stored: 2776240, total time: 7683.441432714462
Game: 4220000, total stored: 2782841, total time: 7701.937527656555
Game: 4230000, total stored: 2789426, total time: 7720.507555484772
Game: 4240000, total stored: 2796087, total time: 7739.090037584305
Game: 4250000, total stored: 2802595, total time: 7757.304639101028
Game: 4260000, total stored: 2809157, total time: 7775.461428403854
Game: 4270000, total stored: 2815728, total time: 7793.7913291454315
Game: 4280000, total stored: 2822352, total time: 7812.2303240299225
Game: 4290000, total stored: 2828933, total time: 7830.772490978241
Game: 4300000, total stored: 2835477, total time: 7849.141017436981
Game: 4310000, total stored: 2842109, total time: 7867.7842233181
Game: 4320000, total stored: 2848788, total time: 7886.460289716721
Game: 4330000, total stored: 2855293, total time

Game: 5400000, total stored: 3561019, total time: 9881.881983757019
Game: 5410000, total stored: 3567583, total time: 9900.363229751587
Game: 5420000, total stored: 3574229, total time: 9919.143479585648
Game: 5430000, total stored: 3580915, total time: 9937.753727197647
Game: 5440000, total stored: 3587336, total time: 9955.825967550278
Game: 5450000, total stored: 3593615, total time: 9973.636204481125
Game: 5460000, total stored: 3600115, total time: 9991.81744647026
Game: 5470000, total stored: 3606607, total time: 10010.196690559387
Game: 5480000, total stored: 3613214, total time: 10028.778940916061
Game: 5490000, total stored: 3619763, total time: 10047.44118642807
Game: 5500000, total stored: 3626292, total time: 10065.730428934097
Game: 5510000, total stored: 3632707, total time: 10083.912670850754
Game: 5520000, total stored: 3639297, total time: 10102.658920288086
Game: 5530000, total stored: 3645875, total time: 10121.187166690826
Game: 5540000, total stored: 3652530, total

In [8]:
# Number of rows that repeat an earlier row in every column.
game_data_df.duplicated(keep='first').sum()

0

In [9]:
# Sanity check: number of stored games with fewer than 50 plies.
int((game_data_df['total_moves'] < 50).sum())

0

In [10]:
# Rows that repeat an earlier row once the rating, ECO and game-length
# columns are left out of the comparison.
game_data_df.drop(columns=['white_elo', 'black_elo', 'eco', 'total_moves']).duplicated().sum()

63

In [11]:
# Show the rows flagged as repeats when rating, ECO and game-length columns
# are left out of the comparison.
game_data_df.loc[
    game_data_df.drop(columns=['white_elo', 'black_elo', 'eco', 'total_moves']).duplicated()
]

Unnamed: 0,total_moves,time_control,white_win,white_elo,black_elo,eco,white_castles,black_castles,white_total_left_wing_pawn_moves,white_total_center_pawn_moves,...,white_total_defended_pieces,black_total_defended_pieces,white_furthest_pawn,black_furthest_pawn,white_total_king_blockers,black_total_king_blockers,white_total_hanging_pawns,white_total_hanging_pieces,black_total_hanging_pawns,black_total_hanging_pieces
261116,62,blitz,0,2255,2406,E81,queenside,kingside,2,3,...,10,9,5.0,4.0,3,2,0,0,0,0
283700,147,blitz,1,2502,2170,B33,x,kingside,2,2,...,11,11,4.0,4.0,4,4,0,0,0,0
404462,63,blitz,0,1863,1966,C89,kingside,kingside,1,3,...,11,7,4.0,5.0,2,2,0,0,0,0
513036,54,blitz,0,2442,2343,B33,kingside,kingside,3,2,...,8,9,4.0,4.0,4,4,0,0,1,0
662110,95,blitz,1,1961,1837,B76,queenside,kingside,0,3,...,9,9,5.0,4.0,3,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563492,79,blitz,1,1963,1844,C89,kingside,kingside,2,3,...,12,10,4.0,5.0,2,2,0,0,0,0
3596419,69,blitz,1,2294,2245,A69,kingside,kingside,3,4,...,12,7,5.0,6.0,2,4,0,0,0,0
3672418,111,blitz,1,1851,1874,C02,kingside,kingside,4,3,...,9,10,5.0,5.0,4,2,0,0,0,0
3735029,87,blitz,1,2258,1897,B19,queenside,queenside,1,3,...,10,9,5.0,6.0,2,2,0,0,0,0


In [12]:
# Disabled debugging snippet kept as a bare string literal (it is not
# executed; the cell only echoes the text): reopens the PGN, reads the second
# game, prints it and runs parse_game on it with 20 moves.
"""
pgn.close()
pgn = open(database_filename)
game = chess.pgn.read_game(pgn)
game = chess.pgn.read_game(pgn)
print(game)
game_data_temp = get_new_game_data() 
parse_game(game,20,game_data_temp)
print(game_data_temp)
"""

'\npgn.close()\npgn = open(database_filename)\ngame = chess.pgn.read_game(pgn)\ngame = chess.pgn.read_game(pgn)\nprint(game)\ngame_data_temp = get_new_game_data() \nparse_game(game,20,game_data_temp)\nprint(game_data_temp)\n'

In [13]:
# Disabled debugging snippet kept as a bare string literal (it is not
# executed; the cell only echoes the text): replays the first 40 plies of
# `game` onto a scratch board.
"""
temp_board = game.board()
for move in list(game.mainline_moves())[:40]:
    temp_board.push(move)
"""

'\ntemp_board = game.board()\nfor move in list(game.mainline_moves())[:40]:\n    temp_board.push(move)\n'

In [14]:
# Distinct time-control labels present in the data (missing values included).
pd.unique(game_data_df['time_control'])

array(['blitz', 'bullet', 'rapid', None, 'classical'], dtype=object)

In [15]:
# Games per time-control label, most frequent first; missing labels are excluded.
game_data_df['time_control'].value_counts(sort=True, dropna=True)

blitz        1574034
bullet       1314308
rapid         863599
classical      62133
Name: time_control, dtype: int64

In [16]:
# Games whose time control was left unset by parse_game, i.e. the
# 'TimeControl' header had no 'initial+increment' form.
game_data_df['time_control'].isnull().sum()

11303

In [17]:
# List the feature columns; their order follows the key order of the
# per-game records the frame was built from.
game_data_df.columns

Index(['total_moves', 'time_control', 'white_win', 'white_elo', 'black_elo',
       'eco', 'white_castles', 'black_castles',
       'white_total_left_wing_pawn_moves', 'white_total_center_pawn_moves',
       'white_total_right_wing_pawn_moves', 'white_total_knight_moves',
       'white_total_bishop_moves', 'white_total_rook_moves',
       'white_total_queen_moves', 'white_total_king_moves',
       'black_total_left_wing_pawn_moves', 'black_total_center_pawn_moves',
       'black_total_right_wing_pawn_moves', 'black_total_knight_moves',
       'black_total_bishop_moves', 'black_total_rook_moves',
       'black_total_queen_moves', 'black_total_king_moves',
       'white_doubled_pawn_files', 'black_doubled_pawn_files',
       'white_no_pawn_files', 'black_no_pawn_files',
       'white_total_isolated_pawn_files', 'black_total_isolated_pawn_files',
       'white_total_pawns', 'white_total_knights', 'white_total_bishops',
       'white_total_rooks', 'white_total_queens', 'black_total_pawns',

In [18]:
# Games with no white_furthest_pawn value; parse_game stores None there when
# White has no pawns in the scanned position.
game_data_df['white_furthest_pawn'].isnull().sum()

12

In [19]:
# Games with no black_furthest_pawn value; parse_game stores None there when
# Black has no pawns in the scanned position.
game_data_df['black_furthest_pawn'].isnull().sum()

17