In [1]:
from pgn import PGNParser
import pandas as pd
import numpy as np
from datetime import datetime
import os
import glob
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
def parse_pgn(filename):
    file = open(filename, 'r', encoding='utf-8')
    fsize = os.fstat(file.fileno()).st_size
    parser = PGNParser(file)
    games = []

    while parser.has_next():
        headers, moves, data = parser.next()
        if headers is None:
            break
        headers['Moves'] = moves
        games.append(headers)
        location = file.tell()
    
    file.close()
    return pd.DataFrame(games)

In [3]:
def preprocess(df):
    # extract second word
    df['Event'] = df['Event'].str.split(' ').str[1].astype('string')
    df['Site'] = df['Site'].astype('string')
    df['White'] = df['White'].astype('string')
    df['Black'] = df['Black'].astype('string')

    # Convert result to categorical data
    resultMappings = { '1-0': 'White', '0-1': 'Black', '1/2-1/2': 'Draw', '*': 'Unknown' }
    df['Result'] = df['Result'].map(resultMappings).astype('category')

    # concat UTCDate and UTCTime
    df['Date'] = (df['UTCDate'] + ' ' + df['UTCTime']).astype('datetime64[s]')
    df.drop(['UTCDate', 'UTCTime'], axis=1, inplace=True)

    df['WhiteElo'] = pd.to_numeric(df['WhiteElo'], errors='coerce')
    df['BlackElo'] = pd.to_numeric(df['BlackElo'], errors='coerce')
    df['WhiteRatingDiff'] = pd.to_numeric(df['WhiteRatingDiff'], errors='coerce')
    df['BlackRatingDiff'] = pd.to_numeric(df['BlackRatingDiff'], errors='coerce')
    df.drop('Opening', axis=1, inplace=True)

    df['ECO'] = df['ECO'].astype('category')

    df['Termination'] = df['Termination'].astype('category')
    # Time Control is a weird one
    df['TimeControl'] = df['TimeControl'].astype('string')
    df['Moves'] = df['Moves'].astype('string')

    if 'BlackTitle' in df.columns:
        df['BlackTitle'] = df['BlackTitle'].astype('category')
    if 'WhiteTitle' in df.columns:
        df['WhiteTitle'] = df['WhiteTitle'].astype('category')
    
    if 'LichessId' in df.columns:
        df['LichessId'] = df['LichessId'].astype('string')
    
    if 'Round' in df.columns:
        df.drop('Round', axis=1, inplace=True)
    return df

In [4]:
files = glob.glob('datasets/lichess-sampled/**/*.pgn')

dfs = Parallel(n_jobs=16)(delayed(parse_pgn)(f) for f in tqdm(files))

  0%|          | 0/107 [00:00<?, ?it/s]

In [5]:
dfs = Parallel(n_jobs=16)(delayed(preprocess)(df) for df in tqdm(dfs))

  0%|          | 0/107 [00:00<?, ?it/s]

In [6]:
df = pd.concat(dfs)
# df = preprocess(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4286409 entries, 0 to 26973
Data columns (total 17 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Event            string        
 1   Site             string        
 2   White            string        
 3   Black            string        
 4   Result           object        
 5   WhiteElo         float64       
 6   BlackElo         float64       
 7   WhiteRatingDiff  float64       
 8   BlackRatingDiff  float64       
 9   ECO              object        
 10  TimeControl      string        
 11  Termination      object        
 12  Moves            string        
 13  Date             datetime64[ns]
 14  WhiteTitle       object        
 15  BlackTitle       object        
 16  LichessId        string        
dtypes: datetime64[ns](1), float64(4), object(5), string(7)
memory usage: 588.6+ MB


In [7]:
# df.to_hdf('lichess-test.h5', key='lichess', mode='w')
df.to_parquet('datasets/lichess-sampled.parquet', engine='pyarrow')

In [8]:
df = pd.read_parquet('datasets/lichess-sampled.parquet')

In [9]:
df.head()

Unnamed: 0,Event,Site,White,Black,Result,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,TimeControl,Termination,Moves,Date,WhiteTitle,BlackTitle,LichessId
0,Classical,https://lichess.org/a9tcp02g,Desmond_Wilson,savinka59,White,1654.0,1919.0,19.0,-22.0,D04,480+2,Normal,1. d4 d5 2. Nf3 Nf6 3. e3 Bf5 4. Nh4 Bg6 5. Nx...,2012-12-31 23:04:12,,,
1,Classical,https://lichess.org/iclkx584,Voltvolf,Marzinkus,White,1824.0,1811.0,11.0,-11.0,C02,360+6,Normal,1. e4 e6 2. d4 d5 3. e5 c5 4. c3 Ne7 5. f4 cxd...,2012-12-31 23:10:00,,,
2,Classical,https://lichess.org/ufcqmfxx,6WX,adamsrj,White,1463.0,1504.0,62.0,-12.0,C44,1560+30,Normal,1. e4 e5 2. Nf3 Nc6 3. Bc4 Be7 4. d4 exd4 5. N...,2012-12-31 23:16:04,,,
3,Classical,https://lichess.org/qwuudn2s,sebastian44,jtkjtkful,Black,1347.0,1519.0,-6.0,23.0,B01,300+5,Time forfeit,1. e4 d5 2. e5 d4 3. Nf3 Nc6 4. c3 d3 5. Na3 f...,2012-12-31 23:24:11,,,
4,Classical,https://lichess.org/c9qfp8es,peter2,Killi,Black,1519.0,1572.0,-9.0,9.0,C28,480+0,Normal,1. e4 Nc6 2. Nc3 e5 3. Bc4 Nf6 4. d3 h6 5. f3 ...,2013-01-01 00:51:20,,,


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4286409 entries, 0 to 26973
Data columns (total 17 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Event            string        
 1   Site             string        
 2   White            string        
 3   Black            string        
 4   Result           object        
 5   WhiteElo         float64       
 6   BlackElo         float64       
 7   WhiteRatingDiff  float64       
 8   BlackRatingDiff  float64       
 9   ECO              object        
 10  TimeControl      string        
 11  Termination      object        
 12  Moves            string        
 13  Date             datetime64[ns]
 14  WhiteTitle       object        
 15  BlackTitle       object        
 16  LichessId        string        
dtypes: datetime64[ns](1), float64(4), object(5), string(7)
memory usage: 588.6+ MB
