In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install required packages (uncomment if running locally)
# !pip install scikit-learn matplotlib joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from joblib import Parallel, delayed, dump

# Read both evaluation CSVs: chess_df feeds training, random_df feeds testing.
chess_df, random_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/chess-dataset/chessData.csv',
        '/kaggle/input/chess-dataset/random_evals.csv',
    )
)

# Clean evaluations
def clean_evaluations(series):
    """Convert raw evaluation entries into floats.

    Parameters
    ----------
    series : pandas.Series
        Raw evaluation values. Each entry is converted to ``str`` and stripped
        of surrounding whitespace before parsing. Entries starting with ``#``
        are treated as mate scores; everything else is parsed with ``float``.

    Returns
    -------
    pandas.Series
        Series with the same index as ``series``. Mate entries become
        ``10000.0`` or ``-10000.0``; entries that cannot be parsed become NaN.
    """
    def to_float(val):
        text = str(val).strip()
        if text.startswith("#"):
            # Only an explicit minus sign makes a mate score negative, so an
            # unsigned "#3" is positive -- the same way float() reads an
            # unsigned number -- instead of being silently scored as -10000.
            return -10000.0 if "-" in text else 10000.0
        try:
            return float(text)
        except ValueError:
            # Narrow catch: a bare ``except`` would also swallow
            # KeyboardInterrupt while mapping over a large series.
            return np.nan
    return series.map(to_float)

# FEN to features
def fen_to_features(fen):
    """Turn a FEN string into a flat dict of hand-crafted position features.

    Only the first three whitespace-separated FEN fields are used: piece
    placement, side to move and castling rights.

    Parameters
    ----------
    fen : str
        Position in Forsyth-Edwards Notation.

    Returns
    -------
    dict or None
        ``None`` when ``fen`` is not a string, has fewer than three fields, or
        its placement field does not expand to an 8x8 board. Otherwise a dict
        with these keys, in this order:

        * ``cnt_<piece>`` -- count of each of ``PNBRQKpnbrqk``;
        * ``material_balance`` -- white minus black material using
          P=1, N=B=3, R=5, Q=9 (kings excluded);
        * ``active_white`` -- 1 if the side-to-move field is ``'w'``;
        * ``castle_wk`` / ``castle_wq`` / ``castle_bk`` / ``castle_bq`` --
          1 if the matching letter appears in the castling field;
        * ``white_king_castled`` / ``black_king_castled`` -- 1 if that king
          stands on its own back rank on the c- or g-file;
        * ``pawn_advancement_white`` / ``pawn_advancement_black`` -- mean
          number of ranks that side's pawns are away from its own back rank
          (2 when the side has no pawns);
        * ``queen_central_white`` / ``queen_central_black`` -- 1 if that side
          has a queen anywhere on the d- or e-file;
        * ``center_control_white`` / ``center_control_black`` -- how many of
          d4, e4, d5, e5 are occupied by that side's pieces.
    """
    try:
        fields = fen.strip().split()
        board_str, active, castling = fields[0], fields[1], fields[2]
    except (AttributeError, IndexError):
        # Non-string input (e.g. NaN) or fewer than three FEN fields.
        return None

    # Expand the placement field into a grid; row 0 is the first FEN row
    # (rank 8) and '.' marks an empty square.
    board_grid = []
    for row in board_str.split('/'):
        expanded = []
        for ch in row:
            # isdecimal() rather than isdigit(): int() rejects characters such
            # as superscript digits that isdigit() would let through.
            if ch.isdecimal():
                expanded += ['.'] * int(ch)
            else:
                expanded.append(ch)
        board_grid.append(expanded)

    # Every lookup below indexes a fixed 8x8 grid, so reject anything else
    # here instead of failing later with an IndexError.
    if len(board_grid) != 8 or any(len(row) != 8 for row in board_grid):
        return None

    piece_counts = {c: 0 for c in 'PNBRQKpnbrqk'}
    for row in board_grid:
        for ch in row:
            if ch in piece_counts:
                piece_counts[ch] += 1

    values = {'P': 1, 'N': 3, 'B': 3, 'R': 5, 'Q': 9,
              'p': 1, 'n': 3, 'b': 3, 'r': 5, 'q': 9}
    white_mat = sum(piece_counts[c] * values[c] for c in 'PNBRQ')
    black_mat = sum(piece_counts[c] * values[c] for c in 'pnbrq')
    material_balance = white_mat - black_mat

    def is_king_castled(color):
        # Heuristic: the king sits on its own back rank on file c (col 2) or
        # file g (col 6).
        row, king = (7, 'K') if color == 'w' else (0, 'k')
        for col in range(8):
            if board_grid[row][col] == king:
                return int(col in (2, 6))
        return 0

    white_king_castled = is_king_castled('w')
    black_king_castled = is_king_castled('b')

    # Advancement is measured from each side's own back rank: grid row 7 for
    # White, grid row 0 for Black.
    white_pawn_ranks = [7 - i for i in range(8) for j in range(8) if board_grid[i][j] == 'P']
    black_pawn_ranks = [i for i in range(8) for j in range(8) if board_grid[i][j] == 'p']
    pawn_adv_white = sum(white_pawn_ranks) / len(white_pawn_ranks) if white_pawn_ranks else 2
    pawn_adv_black = sum(black_pawn_ranks) / len(black_pawn_ranks) if black_pawn_ranks else 2

    # "Central" here means on the d- or e-file (cols 3 and 4), on any rank.
    white_queen_central = any(board_grid[r][c] == 'Q' for r in range(8) for c in (3, 4))
    black_queen_central = any(board_grid[r][c] == 'q' for r in range(8) for c in (3, 4))

    # d5, e5, d4, e4; '.' is neither upper- nor lower-case, so empty squares
    # count for nobody.
    center_squares = [(3, 3), (3, 4), (4, 3), (4, 4)]
    center_control_white = sum(1 for r, c in center_squares if board_grid[r][c].isupper())
    center_control_black = sum(1 for r, c in center_squares if board_grid[r][c].islower())

    return {
        **{f"cnt_{p}": piece_counts[p] for p in piece_counts},
        "material_balance": material_balance,
        "active_white": int(active == 'w'),
        "castle_wk": int('K' in castling),
        "castle_wq": int('Q' in castling),
        "castle_bk": int('k' in castling),
        "castle_bq": int('q' in castling),
        "white_king_castled": white_king_castled,
        "black_king_castled": black_king_castled,
        "pawn_advancement_white": pawn_adv_white,
        "pawn_advancement_black": pawn_adv_black,
        "queen_central_white": int(white_queen_central),
        "queen_central_black": int(black_queen_central),
        "center_control_white": center_control_white,
        "center_control_black": center_control_black
    }

# Parallelized preprocessing
def df_to_clean_feature_df_parallel(df, chunk_size=200_000, n_jobs=-1):
    """Build the feature matrix and cleaned target vector from a raw frame.

    The frame is processed in consecutive slices of ``chunk_size`` rows. Within
    each slice, FEN parsing is dispatched through ``joblib.Parallel`` so that
    ``n_jobs`` is actually honoured; results come back in input order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"FEN"`` column and an ``"Evaluation"`` column.
    chunk_size : int, default 200_000
        Number of rows handled per slice.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``.

    Returns
    -------
    X : pandas.DataFrame
        One row of ``fen_to_features`` output per retained input row.
    y : pandas.Series
        Cleaned evaluations aligned with ``X``.

        Rows are dropped when ``fen_to_features`` returns ``None`` or when
        ``clean_evaluations`` yields NaN. Both outputs carry a fresh
        ``0..n-1`` index, and both are empty when no row survives.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    feats_list, labs_list = [], []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]

        parsed = Parallel(n_jobs=n_jobs)(
            delayed(fen_to_features)(fen) for fen in chunk["FEN"]
        )

        # Track survivors by position, not by index label, so a frame with
        # repeated index labels cannot duplicate rows on re-selection.
        keep = [pos for pos, feat in enumerate(parsed) if feat is not None]
        if not keep:
            continue

        feats = pd.DataFrame([parsed[pos] for pos in keep])
        evals = clean_evaluations(chunk["Evaluation"].iloc[keep]).reset_index(drop=True)

        has_label = evals.notna().to_numpy()
        feats_list.append(feats[has_label])
        labs_list.append(evals[has_label])

    if not feats_list:
        # pd.concat raises on an empty list; hand back empty containers.
        return pd.DataFrame(), pd.Series(dtype=float, name="Evaluation")

    X = pd.concat(feats_list, ignore_index=True)
    y = pd.concat(labs_list, ignore_index=True)
    return X, y

# Build feature matrices and targets for both splits
print("Processing training data...")
X_train, y_train = df_to_clean_feature_df_parallel(chess_df)

print("Processing test data...")
X_test, y_test = df_to_clean_feature_df_parallel(random_df)

# Cap targets at +/-1000 and divide by 100 (the metrics printed later label
# the resulting unit as pawns)
y_train, y_test = (
    target.clip(lower=-1000, upper=1000) / 100.0
    for target in (y_train, y_test)
)

# Standardise features with statistics taken from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the random-forest regressor on the standardised training features
print("Training RandomForestRegressor...")
forest_params = dict(n_estimators=200, max_depth=15, n_jobs=-1, random_state=42)
rf_model = RandomForestRegressor(**forest_params)
rf_model.fit(X_train_scaled, y_train)
print("Training complete.")

# Predictions for the test positions
y_pred = rf_model.predict(X_test_scaled)

# Error metrics on the scaled targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
abs_error = np.abs(y_test - y_pred)
accuracy = (abs_error <= 1.0).mean()  # share of predictions within 1.0 of the target

print(f"RMSE: {rmse:.2f} pawns")
print(f"R² Score: {r2:.4f}")
print(f"Accuracy within ±1 pawn: {accuracy * 100:.2f}%")

# Persist the fitted model to the Kaggle working directory
dump(rf_model, "/kaggle/working/random_forest_model.joblib")
print("Model saved to /kaggle/working/random_forest_model.joblib")

# Plot predicted vs. true evaluations
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.3, color='dodgerblue', label="Predicted vs Actual")

# Draw the identity line across the full range of the plotted data. A fixed
# [-5, 5] segment stops short, because the targets are clipped to +/-1000 and
# divided by 100 upstream, i.e. they span [-10, 10].
diag_lo = min(np.min(y_test), np.min(y_pred))
diag_hi = max(np.max(y_test), np.max(y_pred))
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi],
        color='red', linestyle='--', label="Perfect Prediction")

ax.set_xlabel("Stockfish Evaluation (True)")
ax.set_ylabel("Model Prediction")
ax.set_title("Random Forest Model vs Stockfish")
# Pin the legend position: the default loc="best" searches for a free spot
# among all plotted points, which is slow for a scatter this large.
ax.legend(loc="upper left")
ax.grid(True)
plt.show()


Processing training data...
Processing test data...
Training RandomForestRegressor...
Training complete.
RMSE: 4.68 pawns
R² Score: 0.4125
Accuracy within ±1 pawn: 28.94%
Model saved to /kaggle/working/random_forest_model.joblib


  fig.canvas.print_figure(bytes_io, **kw)
