In [None]:
from tqdm import tqdm
import pandas as pd
import chess
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, log_loss
import os
import joblib
from lichess import extract_features

def process_dataset(df: pd.DataFrame, size: int, desc: str):
    """Shuffle ``df`` with a fixed seed, keep at most ``size`` rows, and featurize them.

    Args:
        df: Frame with a 'FEN' column (each entry is passed to
            ``extract_features``) and a 'value' column used as the target.
        size: Upper bound on the number of shuffled rows that are kept.
        desc: Text appended to "Featurizing " in the progress-bar label.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a float32 array with one row per kept
        entry, ``y`` is the 'value' column of those rows cast to float32.
    """
    # Width of one feature row. NOTE(review): presumably 12 piece planes x 64
    # squares plus 4 extra flags, doubled -- confirm against extract_features.
    n_features = 2 * (12 * 64 + 4)

    # Fixed seed, so the kept rows are always the same prefix of the same permutation.
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    subset = shuffled.iloc[:size]
    n_rows = len(subset)

    features = np.zeros((n_rows, n_features), dtype=np.float32)
    targets = subset['value'].values.astype(np.float32)

    progress = tqdm(subset['FEN'], total=n_rows, desc=f"Featurizing {desc}")
    for row, fen in enumerate(progress):
        features[row] = extract_features(fen)
    return features, targets

def load_split_save(train_csv: str,
                    val_csv: str,
                    train_size: int = 1_000_000,
                    val_size: int = 100_000):
    """Return featurized train/validation arrays, caching them as Parquet files.

    If all four cache files (``X_train.parquet``, ``y_train.parquet``,
    ``X_val.parquet``, ``y_val.parquet``) exist in the working directory they
    are loaded and the CSVs are not read. Otherwise both CSVs are read, passed
    through ``process_dataset`` and the results are written to those files.

    Args:
        train_csv: Path of the training CSV (read only when the cache is missing).
        val_csv: Path of the validation CSV (read only when the cache is missing).
        train_size: Maximum number of training rows to return.
        val_size: Maximum number of validation rows to return.

    Returns:
        ``(X_train, y_train, X_val, y_val)`` as NumPy arrays.

    Note:
        The cache is keyed only by file name. It is not invalidated when the
        CSV paths change, and a cache holding fewer rows than requested is
        returned as is; delete the Parquet files to force a rebuild.
    """
    cache_files = ("X_train.parquet", "y_train.parquet",
                   "X_val.parquet", "y_val.parquet")

    if all(os.path.exists(path) for path in cache_files):
        print("Loading pre-saved Parquet datasets...")
        # process_dataset keeps a prefix of a fixed-seed shuffle, so slicing a
        # larger cache yields the same rows a fresh run with the smaller size
        # would produce from the same CSV.
        X_train = pd.read_parquet("X_train.parquet").values[:train_size]
        y_train = pd.read_parquet("y_train.parquet")["value"].values[:train_size]
        X_val   = pd.read_parquet("X_val.parquet").values[:val_size]
        y_val   = pd.read_parquet("y_val.parquet")["value"].values[:val_size]
        return X_train, y_train, X_val, y_val

    df_train = pd.read_csv(train_csv)
    df_val   = pd.read_csv(val_csv)

    X_train, y_train = process_dataset(df_train, train_size, 'train')
    X_val,   y_val   = process_dataset(df_val,   val_size,   'val')

    def _feature_frame(X):
        # Parquet writers are only guaranteed to accept string column names;
        # the default integer labels raise ValueError on some pandas/engine
        # combinations. The reader above uses .values, so names are irrelevant there.
        return pd.DataFrame(X, columns=[str(col) for col in range(X.shape[1])])

    _feature_frame(X_train).to_parquet("X_train.parquet", index=False)
    pd.DataFrame({"value": y_train}).to_parquet("y_train.parquet", index=False)
    _feature_frame(X_val).to_parquet("X_val.parquet", index=False)
    pd.DataFrame({"value": y_val}).to_parquet("y_val.parquet", index=False)

    return X_train, y_train, X_val, y_val

# Build the datasets, or reload them from the Parquet cache when it exists.
X_train, y_train, X_val, y_val = load_split_save(
    train_csv='train.csv',
    val_csv='val.csv',
    train_size=1_000_000,
    val_size=100_000,
)

Featurizing train: 100%|██████████| 1000000/1000000 [01:22<00:00, 12115.99it/s]
Featurizing val: 100%|██████████| 100000/100000 [00:08<00:00, 11699.15it/s]


In [None]:
def train_logistic_regression(X_train: np.ndarray, y_train: np.ndarray) -> LogisticRegression:
    """Fit a logistic-regression classifier on the given features and labels.

    Args:
        X_train: Feature matrix handed to ``LogisticRegression.fit``.
        y_train: Target labels handed to ``LogisticRegression.fit``.

    Returns:
        The fitted ``LogisticRegression`` (``max_iter=1000``, ``verbose=1``).
    """
    print("Fitting logistic regression...")
    model = LogisticRegression(verbose=1, max_iter=1000)
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return model.fit(X_train, y_train)

lr = train_logistic_regression(X_train, y_train)

# predict_proba orders its columns by lr.classes_. Passing the same labels to
# log_loss keeps columns and classes aligned even if some trained class never
# occurs in y_val; without it the label set is inferred from y_val alone.
y_prob = lr.predict_proba(X_val)
ll = log_loss(y_val, y_prob, labels=lr.classes_)
print(f"Log loss: {ll:.4f}")

In [None]:
# Persist the fitted logistic-regression model next to the notebook.
joblib.dump(value=lr, filename="chess_lr.joblib")