In [None]:
import plotly.express as px
from scipy import sparse
from tqdm import tqdm

In [None]:
import chess
import numpy as np
import pandas as pd

# Upper bound on the number of positions kept for the analysis.
size = 1_000_000

# 1) Load the CSV, shuffle it reproducibly (fixed seed) and keep at most
#    `size` rows; if the file holds fewer rows, all of them are kept.
df = (
    pd.read_csv("train.csv")  # must have columns 'FEN' and 'value'
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(size)
)


# 2) Feature extractor (768 dims: 12 piece‐types × 64 squares)
def extract_features(fen: str) -> np.ndarray:
    """Encode a FEN position as a flat 0/1 piece-square indicator vector.

    The vector has 12 * 64 entries laid out as 12 consecutive planes of 64
    squares each: planes 0-5 hold the white pieces and planes 6-11 the
    black pieces, ordered by ``piece_type - 1`` within each colour.

    Args:
        fen: Position string handed to ``chess.Board``.

    Returns:
        ``np.uint8`` array of length 768 with a 1 at ``plane * 64 + square``
        for every piece on the board and 0 elsewhere.
    """
    position = chess.Board(fen)
    features = np.zeros(12 * 64, dtype=np.uint8)
    for square, piece in position.piece_map().items():
        plane = piece.piece_type - 1
        if piece.color != chess.WHITE:
            # Black pieces live in the second group of six planes.
            plane += 6
        features[plane * 64 + square] = 1
    return features


# 3) Build column names: one "<colour><piece><square>" label per feature,
#    in the same plane-major order that extract_features fills its vector.
piece_names = [colour + kind for colour in "wb" for kind in "PNBRQK"]
col_names = [
    f"{plane_name}{chess.square_name(square)}"
    for plane_name in piece_names
    for square in range(64)
]

# 4) Featurize all positions into one preallocated uint8 matrix
#    (one row per position, one column per piece-square feature).
n_positions = len(df)
X = np.zeros((n_positions, 12 * 64), dtype=np.uint8)
for row_idx, fen in enumerate(
    tqdm(df["FEN"], total=n_positions, desc="Featurizing")
):
    X[row_idx] = extract_features(fen)

# 5) Assemble the feature matrix and the target into a single DataFrame.
#    The target index is reset so it lines up with X_df's default 0..n-1 index.
X_df = pd.DataFrame(X, columns=col_names)
value_col = df["value"].reset_index(drop=True)
out_df = pd.concat([X_df, value_col], axis=1)

# 6) Persist to Parquet (row index is not written)
out_df.to_parquet("train.parquet", index=False)
print("Wrote train.parquet with shape", out_df.shape)

In [None]:
# Number of sampled positions whose `value` equals 1.
(out_df["value"] == 1).sum()

In [None]:
# Number of sampled positions whose `value` equals 0.
(out_df["value"] == 0).sum()

In [None]:
# Number of sampled positions whose `value` equals -1.
(out_df["value"] == -1).sum()

In [None]:
import pandas as pd

# 7) Per-feature statistics.
#    sum      : number of positions in which the feature is active
#    avg_diff : mean `value` when the feature is active minus the overall mean
#    score    : sum * avg_diff
feature_cols = [c for c in out_df.columns if c != "value"]
feature_sums = out_df[feature_cols].sum()

# Features that never occur in the sample have sum == 0; dividing by that
# count would give 0/0 = NaN rows, so the statistics are computed only for
# features seen at least once.
valid_feats = feature_sums[feature_sums > 0].index.tolist()
valid_sums = feature_sums[valid_feats]

overall_mean = out_df["value"].mean()
# (features x positions) . value -> sum of `value` over positions where the
# feature is active; dividing by the activation count gives the conditional mean.
value_when_1 = out_df[valid_feats].T.dot(out_df["value"]) / valid_sums
effect = value_when_1 - overall_mean
score = valid_sums * effect


feat_stats = pd.DataFrame({"sum": valid_sums, "avg_diff": effect, "score": score})

# 8) Sort by |score| descending
feat_stats = feat_stats.reindex(
    feat_stats["score"].abs().sort_values(ascending=False).index
)

In [None]:
# Count-weighted effect: avg_diff scaled by sqrt(number of activations), then
# rows ordered by its magnitude, largest first.
# NOTE(review): there is no division by a standard deviation here, so this is
# only t-like if the spread of `value` is about 1 -- confirm before reading
# thresholds on it as significance levels.
feat_stats = feat_stats.assign(
    t=np.sqrt(feat_stats["sum"]) * feat_stats["avg_diff"]
).sort_values("t", key=lambda values: values.abs(), ascending=False)

In [None]:
# Features whose |t| exceeds 2.
feat_stats[feat_stats["t"].abs() > 2]

In [None]:
# Pairwise (feature_i, feature_j) co-occurrence statistics.

# 2) Build X as a sparse int64 matrix so dot-products accumulate without
#    overflowing the narrow feature dtype. The matrix is made sparse FIRST and
#    widened afterwards: casting the dense frame to int64 would materialise a
#    rows x 768 int64 array (8 bytes per cell) just to throw most of it away.
X = sparse.csr_matrix(out_df[feature_cols].to_numpy()).astype(np.int64)
y = out_df["value"].to_numpy().astype(np.float64)

# 3) Compute joint-counts C and joint-sum-of-y S
C_sparse = X.T.dot(X)  # int64 co-occurrence counts, each at most len(out_df)
Xy = X.multiply(y[:, None])  # each row k scaled by y[k]
S_sparse = X.T.dot(Xy)  # float64 sums of y over co-occurrences

# 4) Bring into dense arrays (features x features)
C = C_sparse.toarray()
S = S_sparse.toarray()

# 5) Overall mean of y
overall_mean = y.mean()

# 6) For all (i,j) with C[i,j]>0:
#      mean_when_both = S / C
#      effect         = mean_when_both - overall_mean
#      score          = C * effect
#    Pairs that never co-occur keep mean_when_both == 0; they are dropped in
#    step 7 via the joint_count filter.
mask = C > 0
mean_when_both = np.zeros_like(S)
mean_when_both[mask] = S[mask] / C[mask]

effect = mean_when_both - overall_mean
score = C.astype(np.float64) * effect

# 7) Extract upper-triangle (i<j) so each unordered pair appears once and the
#    diagonal (a feature with itself) is excluded; build DataFrame, filter & sort
i, j = np.triu_indices_from(score, k=1)
feature_names = np.array(feature_cols)
df_pairs = pd.DataFrame(
    {
        "feat1": feature_names[i],
        "feat2": feature_names[j],
        "joint_count": C[i, j],
        "avg_diff": effect[i, j],
        "score": score[i, j],
    }
)
df_pairs = df_pairs[df_pairs["joint_count"] > 0]
df_pairs = df_pairs.reindex(df_pairs["score"].abs().sort_values(ascending=False).index)

In [None]:
# Pair effect with both single-feature effects subtracted: look up each
# member's avg_diff in feat_stats (indexed by feature name) and remove it
# from the pair's avg_diff.
map_avg = feat_stats["avg_diff"]
feat1_effect = df_pairs["feat1"].map(map_avg)
feat2_effect = df_pairs["feat2"].map(map_avg)
df_pairs["avg_diff_adj"] = df_pairs["avg_diff"] - feat1_effect - feat2_effect

In [None]:
# Display the pair table (co-occurring feature pairs, currently ordered by |score|).
df_pairs

In [None]:
# Count-weighted pair effect: avg_diff scaled by sqrt(co-occurrence count).
df_pairs["t"] = df_pairs["avg_diff"] * np.sqrt(df_pairs["joint_count"])

In [None]:
# Same weighting applied to the adjusted pair effect.
df_pairs["t_adj"] = df_pairs["avg_diff_adj"] * np.sqrt(df_pairs["joint_count"])

In [None]:
# Pairs ordered by adjusted score, most negative first.
df_pairs.sort_values(by="t_adj", ascending=True)

In [None]:
# Pairs whose |t| exceeds 2.
df_pairs[df_pairs["t"].abs() > 2]

In [None]:
# Distribution of single-feature activation counts (feat_stats["sum"]).
# The labels name the plotted quantity; they previously said "joint_count",
# which is the pairwise column in df_pairs and is not what is drawn here.
fig = px.histogram(feat_stats, x="sum", nbins=700)
fig.update_layout(
    title="Histogram of per-feature counts",
    xaxis_title="feature count (sum)",
    yaxis_title="Count",
)
fig.show()

In [None]:
# The 30 pairs with the lowest avg_diff.
df_pairs.sort_values(by="avg_diff").iloc[:30]

In [None]:
# Display the per-feature statistics table.
feat_stats

In [None]:
# First 20 rows of the per-feature table in its current order.
feat_stats.iloc[:20]

In [None]:
# Display the assembled feature/target frame (feature columns plus `value`).
out_df