In [None]:
#!/usr/bin/env python3
# train_chess_lr.py

from tqdm import tqdm
import pandas as pd
import chess
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, log_loss
import os

# -------------------- Feature Extraction --------------------
def extract_features(fen: str) -> np.ndarray:
    """Encode a FEN position as a phase-weighted piece-placement vector.

    A flat 12 * 64 occupancy vector is built first: one 64-square plane per
    (piece type, colour) pair, with white pieces in the first six planes and
    black pieces offset by six. That vector is then emitted twice, scaled by
    two complementary weights derived from the number of pieces on the board,
    so the result has length 2 * 12 * 64.

    Args:
        fen: Position in FEN notation, parsed by ``chess.Board``.

    Returns:
        1-D array of 1536 values: the occupancy vector times
        ``(n_pieces - 2) / 30`` followed by the same vector times
        ``(32 - n_pieces) / 30``.
    """
    position = chess.Board(fen)
    pieces = position.piece_map()

    planes = np.zeros(12 * 64, dtype=np.float32)
    for square, piece in pieces.items():
        colour_offset = 6 if piece.color != chess.WHITE else 0
        plane = (piece.piece_type - 1) + colour_offset
        planes[plane * 64 + square] = 1.0

    # The two weights always sum to 1: the first is 1 with 32 pieces on the
    # board, the second is 1 when only 2 pieces remain.
    count = len(pieces)
    many_pieces_weight = (count - 2) / 30
    few_pieces_weight = (32 - count) / 30
    return np.concatenate(
        [planes * many_pieces_weight, planes * few_pieces_weight], axis=0
    )

# -------------------- Dataset Processing --------------------
def process_dataset(df: pd.DataFrame, size: int, desc: str):
    """Shuffle a frame, keep the first ``size`` rows, and featurize them.

    Args:
        df: Frame with a ``'FEN'`` column (position strings) and a
            ``'value'`` column (targets).
        size: Maximum number of rows to keep after shuffling.
        desc: Label shown in the progress bar ("Featurizing <desc>").

    Returns:
        ``(X, y)`` where ``X`` is a float32 matrix with one 2 * 12 * 64 row
        per kept position and ``y`` is the float32 ``'value'`` column.
    """
    # Fixed seed: the same input frame always yields the same subset/order.
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    subset = shuffled.iloc[:size]
    n_rows = len(subset)

    features = np.zeros((n_rows, 2 * 12 * 64), dtype=np.float32)
    targets = subset['value'].values.astype(np.float32)

    progress = tqdm(
        enumerate(subset['FEN']),
        total=n_rows,
        desc=f"Featurizing {desc}",
    )
    for row, fen in progress:
        features[row] = extract_features(fen)
    return features, targets

# -------------------- Load, Split, Save (using Parquet) --------------------
def _trim_cached_split(X, y, size, desc):
    """Cut a cached split down to ``size`` rows, or report that it is short."""
    if len(X) < size:
        # Cannot tell whether the cache is stale or the CSV was simply small,
        # so keep the cached data but make the mismatch visible.
        print(f"Note: cached {desc} set has {len(X)} rows "
              f"(requested {size}); using it as-is.")
        return X, y
    return X[:size], y[:size]


def _save_feature_matrix(X, path):
    """Write a feature matrix to Parquet using string column names."""
    # Parquet support in pandas is documented as not handling non-string
    # column names, so label the columns "0", "1", ... explicitly instead of
    # relying on the default integer labels.
    columns = [str(i) for i in range(X.shape[1])]
    pd.DataFrame(X, columns=columns).to_parquet(path, index=False)


def load_split_save(train_csv: str,
                    val_csv: str,
                    train_size: int = 1_000_000,
                    val_size: int = 100_000):
    """Return featurized train/validation arrays, cached as Parquet files.

    If ``X_train.parquet``, ``y_train.parquet``, ``X_val.parquet`` and
    ``y_val.parquet`` all exist in the working directory they are loaded
    instead of re-reading the CSVs. Cached splits larger than the requested
    size are truncated to it; because ``process_dataset`` shuffles with a
    fixed seed and keeps a prefix, that prefix is what a fresh run over the
    same CSV would produce. Otherwise both CSVs are read, featurized with
    ``process_dataset`` and written to those four files.

    Note: the cache is keyed only by file name, not by ``train_csv`` /
    ``val_csv``; delete the Parquet files after changing the input data.

    Args:
        train_csv: Path to the training CSV (needs 'FEN' and 'value' columns).
        val_csv: Path to the validation CSV (same columns).
        train_size: Maximum number of training rows to return.
        val_size: Maximum number of validation rows to return.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val)`` of NumPy arrays.
    """
    cache_files = ("X_train.parquet", "y_train.parquet",
                   "X_val.parquet", "y_val.parquet")

    # Check for existing Parquet files
    if all(os.path.exists(path) for path in cache_files):
        print("Loading pre-saved Parquet datasets...")
        X_train = pd.read_parquet("X_train.parquet").values
        y_train = pd.read_parquet("y_train.parquet")["value"].values
        X_val = pd.read_parquet("X_val.parquet").values
        y_val = pd.read_parquet("y_val.parquet")["value"].values

        # Honour the requested sizes even when the cache was built larger.
        X_train, y_train = _trim_cached_split(X_train, y_train, train_size, 'train')
        X_val, y_val = _trim_cached_split(X_val, y_val, val_size, 'val')
        return X_train, y_train, X_val, y_val

    # Read CSVs if Parquet not found
    df_train = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    # Process datasets
    X_train, y_train = process_dataset(df_train, train_size, 'train')
    X_val, y_val = process_dataset(df_val, val_size, 'val')

    # Save to Parquet
    _save_feature_matrix(X_train, "X_train.parquet")
    pd.DataFrame({"value": y_train}).to_parquet("y_train.parquet", index=False)
    _save_feature_matrix(X_val, "X_val.parquet")
    pd.DataFrame({"value": y_val}).to_parquet("y_val.parquet", index=False)

    return X_train, y_train, X_val, y_val

# -------------------- Train Logistic Regression --------------------
def train_logistic_regression(X_train: np.ndarray, y_train: np.ndarray) -> LogisticRegression:
    """Fit and return a logistic-regression baseline on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` (``max_iter=1000``, verbose output).
    """
    print("Fitting logistic regression...")
    classifier = LogisticRegression(max_iter=1000, verbose=1)
    # ``fit`` returns the estimator itself, so the fitted model is returned.
    return classifier.fit(X_train, y_train)

# -------------------- Train and Select Best Model --------------------
def train_and_select(X_train: np.ndarray,
                     y_train: np.ndarray,
                     X_val: np.ndarray,
                     y_val: np.ndarray,
                     hidden_sizes=None):
    """Train a logistic-regression baseline and a sweep of MLPs; keep the best.

    Every model is scored by accuracy on ``(X_val, y_val)``. The baseline is
    the initial best; an MLP replaces it only when strictly more accurate, so
    ties keep the earlier candidate. Because the validation set is used for
    selection, the reported best accuracy is an optimistic estimate.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.
        hidden_sizes: Iterable of single-hidden-layer widths to try.
            Defaults to ``range(3, 65)``.

    Returns:
        The fitted model with the highest validation accuracy.
    """
    if hidden_sizes is None:
        hidden_sizes = range(3, 65)

    lr = train_logistic_regression(X_train, y_train)
    lr_acc = accuracy_score(y_val, lr.predict(X_val))
    # Terminate the line: suppressing the newline here made the next
    # "Fitting MLPClassifier..." message run onto this one in the log.
    print(f"Logistic Regression - Validation accuracy: {lr_acc:.4f}")

    best_model, best_score, best_name = lr, lr_acc, 'LogisticRegression'

    for h in hidden_sizes:
        print(f"Fitting MLPClassifier with hidden layer size {h}...")
        mlp = MLPClassifier(
            hidden_layer_sizes=(h,),
            # Early stopping holds out 10% of the training data internally
            # and stops after 5 iterations without improvement.
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=5,
            max_iter=200,
            random_state=42,
            verbose=True
        )
        mlp.fit(X_train, y_train)
        val_acc = accuracy_score(y_val, mlp.predict(X_val))
        print(f"MLPClassifier (hidden={h}) - Validation accuracy: {val_acc:.4f}")
        if val_acc > best_score:
            best_model, best_score, best_name = mlp, val_acc, f"MLPClassifier(hidden={h})"

    print(f"Best model: {best_name} with validation accuracy: {best_score:.4f}")
    return best_model

# Build (or load from the Parquet cache) the featurized splits, then run the
# model sweep and keep the best-scoring estimator.
X_train, y_train, X_val, y_val = load_split_save(
    train_csv='train.csv',
    val_csv='val.csv',
    train_size=1_000_000,
    val_size=100_000,
)
best_model = train_and_select(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)


Loading pre-saved Parquet datasets...
Fitting logistic regression...
Logistic Regression - Validation accuracy: 0.6026Fitting MLPClassifier with hidden layer size 3...
Iteration 1, loss = 0.81917053
Validation score: 0.590100
Iteration 2, loss = 0.79143769
Validation score: 0.594250
Iteration 3, loss = 0.78888577
Validation score: 0.593360
Iteration 4, loss = 0.78813630
Validation score: 0.598980
Iteration 5, loss = 0.78754914
Validation score: 0.598370
Iteration 6, loss = 0.78703403
Validation score: 0.596350
Iteration 7, loss = 0.78683811
Validation score: 0.598680
Iteration 8, loss = 0.78654581
Validation score: 0.598910
Iteration 9, loss = 0.78630394
Validation score: 0.598800
Iteration 10, loss = 0.78603274
Validation score: 0.599700
Iteration 11, loss = 0.78596365
Validation score: 0.596700
Iteration 12, loss = 0.78583187
Validation score: 0.600950
Iteration 13, loss = 0.78561676
Validation score: 0.600340
Iteration 14, loss = 0.78555179
Validation score: 0.601340
Iteration 15, l



MLPClassifier (hidden=17) - Validation accuracy: 0.6008
Fitting MLPClassifier with hidden layer size 18...


Featurizing train: 100%|██████████| 1000000/1000000 [01:16<00:00, 13138.70it/s]
Featurizing val: 100%|██████████| 100000/100000 [00:07<00:00, 13282.58it/s]


ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [-1.  0.  1.]