If you're curious, here's one of the better logistic regression models I was able to come up with using just 4 features (not counting ^3 transformations) -- `mate_eval`, `regular_eval`, `white_adjusted_clock_usage`, and `black_adjusted_clock_usage`. Of course, blindly throwing everything into XGBoost, as we did for our first model, still performs better.

In [None]:
# Number of moves' worth of increment credited to the base time and to each
# player's clock when computing the "adjusted" values below.
NUM_MOVES = 10

# Only columns whose names end in one of these suffixes survive the Drop step.
# Not including "ply" and "elo_diff" makes the coefficients for mate_eval^3
# more interpretable without sacrificing much accuracy.
feature_suffixes = ("eval", "adjusted_clock_usage")

# Deferred expression for the increment bonus shared by all three adjustments.
increment_bonus = _.increment * NUM_MOVES

lr_steps = (
    # Credit the increment bonus to the base time and to both clocks.
    ml.Mutate(
        adjusted_base_time=_.base_time + increment_bonus,
        white_adjusted_clock=_.white_clock + increment_bonus,
        black_adjusted_clock=_.black_clock + increment_bonus,
    ),
    # Fraction of the adjusted base time that each side has consumed so far.
    ml.Mutate(
        white_adjusted_clock_usage=(
            (_.adjusted_base_time - _.white_adjusted_clock) / _.adjusted_base_time
        ),
        black_adjusted_clock_usage=(
            (_.adjusted_base_time - _.black_adjusted_clock) / _.adjusted_base_time
        ),
    ),
    # Rating gap. The Drop step below removes it again unless "elo_diff" is
    # added to feature_suffixes, in case you want to play with it.
    ml.Mutate(elo_diff=_.white_elo - _.black_elo),
    # Keep only the columns matching feature_suffixes.
    ml.Drop(~ml.endswith(feature_suffixes)),
    ml.FillNA(ml.numeric(), fill_value=0),
    # Add a cubed ("pow3") variant of every *eval column.
    ml.MutateAt(ml.endswith("eval"), pow3=_**3),
    # Standardize everything except the clock-usage columns.
    ml.ScaleStandard(~ml.contains("adjusted_clock_usage")),
)

# Shared preprocessing (basic_steps) runs first, then the LR-specific steps.
lr_pipe = Pipeline(
    steps=[
        ("lr_recipe", ml.Recipe(*(basic_steps + lr_steps))),
        ("lr_model", LogisticRegression()),
    ]
)
lr_pipe.fit(X_train, y_train)

In [None]:
# Report the pipeline's score on each split. The training score is printed
# before the test split is evaluated.
lr_train_score = lr_pipe.score(X_train, y_train)
print(f"Training score: {lr_train_score}")
lr_test_score = lr_pipe.score(X_test, y_test)
print(f"Test score: {lr_test_score}")

# Re-run the fitted recipe on the training data to recover the names of the
# columns the model was fitted on, then tabulate one coefficient row per class.
lr_recipe = lr_pipe["lr_recipe"]
lr_model = lr_pipe["lr_model"]

X_fit_transformed = lr_recipe.to_ibis(X_train)
coef_df = pd.DataFrame(
    data=lr_model.coef_,
    # NOTE(review): these labels are hard-coded and assume the model's
    # classes_ are ordered black win, draw, white win -- confirm against
    # lr_pipe["lr_model"].classes_.
    index=["black win", "draw", "white win"],
    columns=X_fit_transformed.columns,
)
coef_df

In [None]:
# Look up every row of one game by its ID; 5vD7WOT9 is an example of a game
# with a titled player.
# Ibis equivalent: test_data[_.game_id == "5vD7WOT9"]
titled_game_mask = test_data_df["game_id"] == "5vD7WOT9"
test_data_df.loc[titled_game_mask]

## Clipped XGB regressor for use with LETSQL

In [None]:
# --- Disabled experiment: ordinal-encode player titles -----------------------
# The sketch below builds an ibis case expression as a string and eval()s it.
# NOTE(review): eval() is only acceptable here because the string is built
# from a hard-coded list; do not reuse this pattern with external input.
#
# ordered_titles_list = ["BOT", "WCM", "WFM", "NM", "CM", "WIM", "FM", "WGM", "IM", "LM", "GM"]
# expression = "_.case()"

# for i, title in enumerate(ordered_titles_list):
#     expression += f".when('{title}', {i+1})"

# expression += ".end()"
# expression

# xgb_steps_plus = xgb_steps + (
#     ml.MutateAt(ml.endswith("title"), eval(expression)),
# )

# Rebind the training split: the label column, and everything except it.
y_train = train_data.target
X_train = train_data.drop("target")

# The regressor seems to overfit much more quickly than the classifier
# With the default eta=0.3, try n_estimators=10
# With eta=0.1, try n_estimators=20
#
# NOTE(review): the recipe here applies xgb_steps *before* basic_steps, whereas
# the logistic-regression recipe applies basic_steps first -- confirm this
# ordering is intentional.
xgb_reg_pipe = Pipeline(
    steps=[
        ("xgb_recipe", ml.Recipe(*(xgb_steps + basic_steps))),
        ("xgb_reg", xgb.XGBRegressor(n_estimators=10)),
    ]
)
xgb_reg_pipe.fit(X_train, y_train)

In [None]:
# Pipeline.score defers to the final estimator's own score method, which for a
# regressor is not an accuracy -- so these numbers probably aren't meaningful
# for us and are printed for reference only.
reg_train_score = xgb_reg_pipe.score(X_train, y_train)
print(f"Training score: {reg_train_score}")
reg_test_score = xgb_reg_pipe.score(X_test, y_test)
print(f"Test score: {reg_test_score}")

In [None]:
# Work on an independent copy so the shared test_results_df is left untouched.
xgb_reg_test_results_df = test_results_df.copy(deep=True)

# The regressor's raw outputs are unbounded; clip them into [0, 1] so they can
# be treated as win probabilities.
raw_reg_predictions = xgb_reg_pipe.predict(X_test)
y_pred_win_proba = raw_reg_predictions.clip(0, 1)

xgb_reg_test_results_df["y_pred_win"] = y_pred_win_proba

In [None]:
# Out of ~1,000,000 total rows
# Show the rows with near-certain predictions at either extreme.
# A notebook cell only renders its final bare expression, so each slice is
# passed to display() (provided by IPython in notebook sessions) explicitly;
# otherwise the "> 0.99" slice would be computed and silently discarded.
display(xgb_reg_test_results_df[xgb_reg_test_results_df.y_pred_win > 0.99])
display(xgb_reg_test_results_df[xgb_reg_test_results_df.y_pred_win < 0.01])

In [None]:
# Report losses for the clipped regressor's test-set predictions.
# Arguments, in order: test targets, predicted win values, training targets.
# NOTE(review): the training targets are presumably used for a baseline
# comparison -- confirm against print_losses.
print_losses(
    xgb_reg_test_results_df["target"],
    xgb_reg_test_results_df["y_pred_win"],
    train_results_df["target"],
)

In [None]:
# Three side-by-side panels: one per model, then both overlaid for comparison.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# (test results, line format, panel title) for the two per-model panels.
model_panels = (
    (xgb_test_results_df, "c", "XGBoost with Added Features"),
    (xgb_reg_test_results_df, "b", "Clipped XGBoost Regressor"),
)
# zip stops after the two model panels, leaving axs[2] for the overlay.
for panel_ax, (results_df, line_fmt, panel_title) in zip(axs, model_panels):
    plot_losses(
        results_df,
        train_results_df,
        ax=panel_ax,
        fmt=line_fmt,
        title=panel_title,
    )

# Overlay both models on the last panel; only the second call sets the title.
comparison_ax = axs[2]
plot_losses(xgb_test_results_df, train_results_df, ax=comparison_ax, fmt="c")
plot_losses(
    xgb_reg_test_results_df,
    train_results_df,
    ax=comparison_ax,
    fmt="b",
    title="Comparison",
)
# NOTE(review): the labels are matched to plotted lines by draw order, so this
# assumes plot_losses draws the model curve before the mean-of-y_train
# baseline -- confirm against plot_losses.
comparison_ax.legend(
    [
        "XGB with added features",
        "Predicting mean of y_train",
        "Clipped XGBoost regressor",
    ]
);

In [None]:
# Re-run the fitted recipe on the training data to recover the names of the
# columns the regressor was trained on, and attach them to the booster so the
# importance plots are labelled with real feature names.
X_fit_transformed = xgb_reg_pipe["xgb_recipe"].to_ibis(X_train)
# xgboost expects a list for feature_names, while ibis's `columns` may be a
# tuple depending on the installed release, so convert explicitly.
xgb_reg_pipe["xgb_reg"].get_booster().feature_names = list(
    X_fit_transformed.columns
)

# Importance by average gain.
xgb.plot_importance(
    xgb_reg_pipe["xgb_reg"],
    importance_type="gain",
    xlabel="Average Gain",
    show_values=False,
)
# Importance by average coverage. The trailing semicolon suppresses the
# notebook's text output for the returned value.
xgb.plot_importance(
    xgb_reg_pipe["xgb_reg"],
    importance_type="cover",
    xlabel="Average Coverage (# of samples impacted)",
    show_values=False,
);