In [None]:
import ibis
import ibis_ml as ml
import matplotlib.pyplot as plt
from ibis import _

ibis.options.interactive = True

Let's pick up where we left off by reloading our model input table.

In [None]:
# Load the model input table from Parquet; the path is relative to the working directory.
model_input_table = ibis.read_parquet("solutions/small_model_input_table.parquet")
# Bare last expression so the notebook renders the table.
model_input_table

# Data splitting

To get started, let's split this single dataset into two: a _training_ set and a _testing_ set. We'll keep most of the rows in the original dataset (subset chosen randomly) in the _training_ set. The training data will be used to _fit_ the model, and the _testing_ set will be used to measure model performance.

Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. To ensure that moves corresponding to a particular game aren't split across the _training_ and _testing_ sets, we'll only split by `game_id` (instead of splitting by `game_id` and `ply`).

In [None]:
# Split the model input table into a training table and a testing table.
# Using `game_id` alone as the key keeps every ply of a game in the same set.
train_data, test_data = ml.train_test_split(
    model_input_table,
    unique_key="game_id",
    # Put 3/4 of the data into the training set
    test_size=0.25,
    # NOTE(review): presumably the number of buckets the keys are assigned to; 4 buckets
    # would allow an exact 3:1 split — confirm against the IbisML documentation.
    num_buckets=4,
    # Set the seed to enable reproducible analysis
    random_seed=111,
)

## Fit and transform `X_train` using a preprocessing recipe
`Recipe.fit()` fits the preprocessing steps on `X_train`; `.to_ibis()` then applies the fitted recipe to `X_train` and returns the result as an Ibis table.

In [None]:
# Baseline preprocessing recipe.
lichess_recipe = ml.Recipe(
    # Consider every column and drop those with zero variance.
    ml.DropZeroVariance(ml.everything()),
    # Drop the string-typed columns.
    ml.Drop(ml.string()),
)

In [None]:
# Features are every column except the label column "target".
X_train = train_data.drop("target")
# Convert 0.0 (black win), 0.5 (draw), and 1.0 (white win) to [0 1 2] class labels for a classifier:
y_train = (train_data.target * 2).cast(int)

# Same feature/label split for the held-out test set.
X_test = test_data.drop("target")
y_test = (test_data.target * 2).cast(int)

# Fit the recipe on the training features, then preview the transformed training table.
X_fit_transformed = lichess_recipe.fit(X_train).to_ibis(X_train)
X_fit_transformed

### Exercise 1
All features need to be encoded as numeric datatypes. For the inference we'll be using later, float works best. How would you cast all columns to float64 at the end of this recipe?

In [None]:
# Exercise 1 scaffold: the same two steps as the baseline recipe, with room for one more.
lichess_recipe = ml.Recipe(
    ml.DropZeroVariance(ml.everything()),
    ml.Drop(ml.string()),
    # Add your code here
)

In [None]:
# Exercise 1 solution: after dropping columns, cast everything that remains to float64.
lichess_recipe = ml.Recipe(
    ml.DropZeroVariance(ml.everything()),
    ml.Drop(ml.string()),
    # Listed last, after the string columns have been dropped.
    ml.Cast(ml.everything(), "float64"),
)

## Fit a model with a pipeline

In [None]:
import xgboost as xgb
from sklearn.pipeline import Pipeline

# Chain the preprocessing recipe and the classifier into one estimator, so a single
# fit/predict call runs both stages.
pipe = Pipeline(
    [
        ("lichess_recipe", lichess_recipe),
        # Presumably a small number of boosting rounds to keep the tutorial fast.
        ("xgb_clf", xgb.XGBClassifier(n_estimators=20)),
    ]
)
pipe.fit(X_train, y_train)

To get a sense of our fitted XGBoost model, let's plot feature importance. `importance_type='gain'` plots the average gain of splits using a feature, and `importance_type='cover'` plots the average number of samples impacted by splits using a feature.

In [None]:
# Re-apply the fitted recipe only to recover the names of the columns the classifier saw,
# so the importance plots are labelled with real feature names.
X_fit_transformed = pipe["lichess_recipe"].to_ibis(X_train)
# Pass a plain list: depending on the Ibis version `.columns` may be a tuple, while the
# booster expects a list of feature names.
pipe["xgb_clf"].get_booster().feature_names = list(X_fit_transformed.columns)

xgb.plot_importance(
    pipe["xgb_clf"], importance_type="gain", xlabel="Average Gain", show_values=False
)
xgb.plot_importance(
    pipe["xgb_clf"],
    importance_type="cover",
    xlabel="Average Coverage (# of samples impacted)",
    show_values=False,
);
# If you add more features later on, you can use the max_num_features keyword argument
# to plot only the most important ones.

In [None]:
# Score the fitted pipeline on the data it was trained on and on the held-out test data;
# a large gap between the two would suggest overfitting.
print(f"Training score: {pipe.score(X_train, y_train)}")
print(f"Test score: {pipe.score(X_test, y_test)}")

## Use a trained workflow to predict

In [None]:
import numpy as np
import sys


def log_loss(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the mean binary cross-entropy of ``y_pred`` against ``y_true``.

    ``y_true`` may contain soft labels (this notebook uses 0.0, 0.5 and 1.0),
    so the loss of a "perfect" prediction (``y_pred == y_true``) is not
    necessarily zero.

    Parameters
    ----------
    y_true : array-like
        Target values in [0, 1]. Lists, NumPy arrays and pandas Series are
        all accepted.
    y_pred : array-like
        Predicted probabilities; must broadcast against ``y_true``.

    Returns
    -------
    float
        Mean of ``-(y_true*log(y_pred) + (1-y_true)*log(1-y_pred))``.
        NaN when the input is empty; NaN entries in the inputs propagate.
    """
    # np.asarray accepts any array-like and makes the arithmetic purely
    # positional (no pandas index alignment between the two inputs).
    y_true = np.asarray(y_true, dtype="float64")
    y_pred = np.asarray(y_pred, dtype="float64")

    if y_true.size == 0:
        # Nothing to average; return NaN without NumPy's empty-mean warning.
        return np.nan

    # Keep predictions strictly inside (0, 1) so neither logarithm is -inf.
    eps = sys.float_info.epsilon
    y_pred = y_pred.clip(eps, 1 - eps)
    log_losses = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)

    return np.mean(-log_losses)


def print_losses(y_true, y_pred, y_train):
    """Print the model's log loss next to two reference values.

    The references are the loss of always predicting the mean of ``y_train``
    and the loss of predicting ``y_true`` itself. A blank line is printed
    at the end.
    """
    model_loss = log_loss(y_true, y_pred)
    print(f"Log loss: {model_loss}")

    # Constant prediction: the training mean repeated for every element of y_true.
    mean_prediction = y_train.mean() * np.ones_like(y_true)
    baseline_loss = log_loss(y_true, mean_prediction)
    print(f"Log loss of predicting mean of y_train: {baseline_loss}")

    perfect_loss = log_loss(y_true, y_true)
    print(f"Log loss of perfect prediction: {perfect_loss}")
    print()


def calculate_losses(y_true, y_pred, y_train):
    """Return ``[model loss, mean-of-y_train loss, perfect-prediction loss]``.

    All three values are log losses measured against ``y_true``.
    """
    results = [log_loss(y_true, y_pred)]

    # Constant prediction: the training mean repeated for every element of y_true.
    mean_prediction = y_train.mean() * np.ones_like(y_true)
    results.append(log_loss(y_true, mean_prediction))

    results.append(log_loss(y_true, y_true))
    return results


def plot_losses(
    test_results_df,
    train_results_df,
    ax=None,
    title="Adjusted Log Loss vs. Move",
    fmt="b",
    max_move=60,
):
    """Plot the adjusted log loss per move for a model and for a mean baseline.

    For each move index ``m`` in ``0..max_move`` the rows with
    ``ply == 2 * m + 1`` are selected, and ``calculate_losses`` is evaluated
    on them. Both plotted curves have the perfect-prediction loss subtracted.

    Parameters
    ----------
    test_results_df : DataFrame with ``ply``, ``target`` and ``y_pred_win`` columns.
    train_results_df : DataFrame with ``ply`` and ``target`` columns.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 5x4 figure is created when omitted.
    title : str
        Axes title.
    fmt : str
        Matplotlib format string for the model curve.
    max_move : int
        Last move index to plot (inclusive); defaults to 60.
    """
    # The x-axis is the zero-based move index; each index maps to ply 2 * move + 1.
    move_nums = range(0, max_move + 1)
    losses = []

    for move in move_nums:
        ply = 2 * move + 1
        # Filter each frame once per move instead of once per column access.
        test_rows = test_results_df[test_results_df.ply == ply]
        train_rows = train_results_df[train_results_df.ply == ply]
        losses.append(
            calculate_losses(test_rows.target, test_rows.y_pred_win, train_rows.target)
        )

    # Columns: 0 = model loss, 1 = mean-baseline loss, 2 = perfect-prediction loss.
    losses = np.array(losses)

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 4))

    ax.plot(move_nums, losses[:, 0] - losses[:, 2], fmt)
    ax.plot(move_nums, losses[:, 1] - losses[:, 2], "r:")

    ax.set_title(title)
    ax.set_xlabel("Move")
    ax.set_ylabel("(Log loss) - (perfect log loss)")
    ax.legend(["Adjusted log loss", "Adjusted log loss of\npredicting mean of y_train"])
    ax.set_ylim(0, 0.7)

In [None]:
# One probability column per class label: 0 = black win, 1 = draw, 2 = white win.
y_pred_proba = pipe.predict_proba(X_test)
# Expected score for white: P(white win) + 0.5 * P(draw).
y_pred_win_proba = y_pred_proba[:, 2] + 0.5 * y_pred_proba[:, 1]

# NOTE(review): the predictions are attached positionally, which assumes this query returns
# rows in the same order as the one executed for X_test — confirm, since the order of rows
# in an Ibis table is undefined.
test_results_df = test_data.select("ply", "target").to_pandas()
test_results_df["y_pred_win"] = y_pred_win_proba

# Training targets are only needed for the "predict the mean of y_train" baseline.
train_results_df = train_data.select("ply", "target").to_pandas()

print_losses(
    test_results_df.target, test_results_df.y_pred_win, train_results_df.target
)

In [None]:
# Report the losses at every fifth move; move index m corresponds to ply 2 * m + 1.
for move in range(0, 60 + 1, 5):
    print(f"Move: {move+1}")
    ply = 2 * move + 1
    # Filter each frame once per move instead of once per column access.
    test_rows = test_results_df[test_results_df.ply == ply]
    train_rows = train_results_df[train_results_df.ply == ply]
    print_losses(test_rows.target, test_rows.y_pred_win, train_rows.target)

In [None]:
# Adjusted log loss per move for the XGBoost pipeline, drawn on a new figure.
plot_losses(test_results_df, train_results_df)

## Create an interpretable model with logistic regression 

In [None]:
from sklearn.linear_model import LogisticRegression

# Steps shared by every recipe below: the same three steps as `lichess_recipe`.
basic_steps = (
    ml.DropZeroVariance(ml.everything()),
    ml.Drop(ml.string()),
    ml.Cast(ml.everything(), "float64"),
)
# Extra steps for logistic regression: mean-impute, then standardize, the numeric columns.
lr_steps = (
    ml.ImputeMean(ml.numeric()),
    ml.ScaleStandard(ml.numeric()),
)

lr_pipe = Pipeline(
    [
        # Tuples are concatenated, so the basic steps come before the LR-specific ones.
        ("lr_recipe", ml.Recipe(*(basic_steps + lr_steps))),
        ("lr_model", LogisticRegression()),
    ]
)
lr_pipe.fit(X_train, y_train)

In [None]:
# Side-by-side scores of the logistic-regression pipeline and the earlier XGBoost pipeline,
# each on the training and the test data.
print("Logistic regression:")
print(f"Training score: {lr_pipe.score(X_train, y_train)}")
print(f"Test score: {lr_pipe.score(X_test, y_test)}")
print()
print("XGBoost (from above):")
print(f"Training score: {pipe.score(X_train, y_train)}")
print(f"Test score: {pipe.score(X_test, y_test)}")
print()

Not too shabby for such a simple model! Predicting white win for everything would result in predicting the correct class for 48.2% of the rows in the test data.

In [None]:
# Share of test rows where white won (target 1.0); `> 0.99` avoids an exact float comparison.
test_results_df[test_results_df.target > 0.99].shape[0] / test_results_df.shape[0]

In [None]:
# Same expected-score calculation as for XGBoost: P(white win) + 0.5 * P(draw).
y_pred_proba = lr_pipe.predict_proba(X_test)
y_pred_win_proba = y_pred_proba[:, 2] + 0.5 * y_pred_proba[:, 1]

# Deep copy so the XGBoost predictions stored in `test_results_df` are left untouched.
lr_test_results_df = test_results_df.copy(deep=True)
lr_test_results_df["y_pred_win"] = y_pred_win_proba

In [None]:
# Three panels: logistic regression alone, XGBoost alone, and both overlaid.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
plot_losses(
    lr_test_results_df,
    train_results_df,
    ax=axs[0],
    fmt="g",
    title="Logistic Regression",
)
plot_losses(test_results_df, train_results_df, ax=axs[1], fmt="b", title="XGBoost")

# Each call draws two lines (model, then baseline), so the third panel holds four lines:
# LR, baseline, XGBoost, baseline. The second call's title is the one that remains.
plot_losses(lr_test_results_df, train_results_df, ax=axs[2], fmt="g")
plot_losses(test_results_df, train_results_df, ax=axs[2], fmt="b", title="Comparison")
# Three labels cover the first three lines; the repeated baseline line gets no entry.
axs[2].legend(["Logistic regression", "Predicting mean of y_train", "XGBoost"]);

Now let's take a look at the coefficients.

In [None]:
import pandas as pd

# Re-apply the fitted recipe to recover the feature names that match the coefficient columns.
X_fit_transformed = lr_pipe["lr_recipe"].to_ibis(X_train)

# One row of coefficients per class; the row labels follow the class order 0, 1, 2.
coef_df = pd.DataFrame(
    lr_pipe["lr_model"].coef_,
    columns=X_fit_transformed.columns,
    index=["black win", "draw", "white win"],
)
coef_df

All of these make sense to me, except for the coefficients for `white_elo` and `black_elo`, which seem backwards—a higher rating for white decreases white's probability of winning and (very slightly) increases black's probability of winning, and a higher rating for black decreases black's probability of winning more than it decreases white's probability of winning. This might be because, after accounting for the effects of all of the other features, rating has a counterintuitive effect. To check that we didn't make a mistake while creating our `model_input_table`, let's fit a model using only those two features. By not standardizing the features, I can also apply the coefficients directly to make predictions.

In [None]:
# Keep only the rows at ply 1 — presumably one row per game, so that each game is
# counted once in this rating-only model.
game_level_train_data = train_data.filter(_.ply == 1)
game_level_test_data = test_data.filter(_.ply == 1)

# Same feature/label split and 0/1/2 class encoding as for the full data.
X_train_game = game_level_train_data.drop("target")
y_train_game = (game_level_train_data.target * 2).cast(int)
X_test_game = game_level_test_data.drop("target")
y_test_game = (game_level_test_data.target * 2).cast(int)

# Preserve only "white_elo" and "black_elo":
lr_steps = (ml.Drop(~ml.endswith("elo")),)

# NOTE: `lr_steps` and `lr_pipe` are rebound here; from this cell on they refer to the
# rating-only model, not the earlier logistic regression on all features.
lr_pipe = Pipeline(
    [
        ("lr_recipe", ml.Recipe(*(basic_steps + lr_steps))),
        # No regularization penalty on the coefficients.
        ("lr_model", LogisticRegression(penalty=None)),
    ]
)
lr_pipe.fit(X_train_game, y_train_game)

In [None]:
# Coefficient table for the rating-only model, labelled with the recipe's output columns.
X_fit_transformed = lr_pipe["lr_recipe"].to_ibis(X_train_game)
coef_df = pd.DataFrame(
    lr_pipe["lr_model"].coef_,
    columns=X_fit_transformed.columns,
    index=["black win", "draw", "white win"],
)
coef_df

In [None]:
import scipy as sp
import scipy.special  # noqa: F401 -- load the submodule explicitly; `sp.special` below relies on it

# Append the intercept as a last column, so that a single matrix product with
# [white_rating, black_rating, 1] yields one logit per class.
# NOTE(review): assumes the recipe outputs its columns in (white_elo, black_elo) order —
# verify against the columns of `coef_df`.
beta_matrix = np.hstack(
    [lr_pipe["lr_model"].coef_, np.transpose([lr_pipe["lr_model"].intercept_])]
)

# Four rating match-ups: strong vs. weak, strong vs. strong, weak vs. weak, weak vs. strong.
white_ratings = [2800, 2800, 1000, 1000]
black_ratings = [1000, 2800, 1000, 2800]

for white_rating, black_rating in zip(white_ratings, black_ratings):
    print(f"{white_rating} white, {black_rating} black:")
    print(
        sp.special.softmax(
            beta_matrix
            @ np.transpose(
                [[white_rating, black_rating, 1]]
            )  # @ signifies matrix multiplication
        )
    )
    print()

The effect of rating in this dataset is much weaker than we'd expect. Theoretically, a player rated 200 points higher than their opponent should have an expected outcome (i.e. percent win + 0.5*percent draw) of 76%. But a high percentage of fast games (blitz and bullet), combined with the vast majority of the games involving players with ratings within 100 points of each other, might be making ratings not too useful for predicting win probability here.

In [None]:
# Scores of the rating-only model on the game-level training and test data.
print(f"Training score: {lr_pipe.score(X_train_game, y_train_game)}")
print(f"Test score: {lr_pipe.score(X_test_game, y_test_game)}")
print()

# Baseline for comparison: share of ply-1 test rows where white won (target 1.0).
game_level_df = test_results_df[test_results_df.ply == 1]
print(
    f"Test score of predicting white win for everything: {game_level_df[game_level_df.target > 0.99].shape[0]/game_level_df.shape[0]}"
)

# Back to XGBoost
## Create features
The difference between how much time the players have might be a useful feature. After all, it might be telling if white has only 5% of their time left while black still has 95%. We can do this type of last-mile feature processing using IbisML.

In [None]:
# Number of moves' worth of increment added to the base time in the denominator below.
NUM_MOVES = 40

xgb_steps = (
    # Clock difference (white minus black), scaled by base time plus accumulated increment.
    ml.Mutate(
        relative_clock_diff=(_.white_clock - _.black_clock)
        / (_.base_time + _.increment * NUM_MOVES)
    ),
    # (We're adding the increment to the base time because players get that added after every move.)
)

In [None]:
# Fit a recipe that contains only the feature-creation steps and apply it to the training data.
xgb_recipe = ml.Recipe(*(xgb_steps))
X_fit_transformed = xgb_recipe.fit(X_train).to_ibis(X_train)
# Preview the transformed rows at ply 50.
X_fit_transformed.filter(_.ply == 50)

### Exercise 2
Unlike logistic regression, garden-variety XGBoost doesn't directly incorporate linear relationships between features. The difference between the two players' ratings is arguably more useful than the individual ratings. How would you add this feature?

In [None]:
# Exercise 2 scaffold: the clock feature from above, with room for one more step.
xgb_steps = (
    ml.Mutate(
        relative_clock_diff=(_.white_clock - _.black_clock)
        / (_.base_time + _.increment * NUM_MOVES)
    ),
    # Add your code here
)

In [None]:
# Exercise 2 solution: add the rating difference (white minus black) as a second feature.
xgb_steps = (
    ml.Mutate(
        relative_clock_diff=(_.white_clock - _.black_clock)
        / (_.base_time + _.increment * NUM_MOVES)
    ),
    ml.Mutate(elo_diff=_.white_elo - _.black_elo),
)

In [None]:
# Same classifier settings as `pipe`, but the recipe creates the new features first.
xgb_pipe = Pipeline(
    [
        # Feature-creation steps come before the basic drop/cast steps, so the new
        # columns are cast to float64 along with the rest.
        ("xgb_recipe", ml.Recipe(*(xgb_steps + basic_steps))),
        ("xgb_clf", xgb.XGBClassifier(n_estimators=20)),
    ]
)
xgb_pipe.fit(X_train, y_train)

In [None]:
# Re-apply the fitted recipe only to recover the names of the columns the classifier saw,
# so the importance plots are labelled with real feature names.
X_fit_transformed = xgb_pipe["xgb_recipe"].to_ibis(X_train)
# Pass a plain list: depending on the Ibis version `.columns` may be a tuple, while the
# booster expects a list of feature names.
xgb_pipe["xgb_clf"].get_booster().feature_names = list(X_fit_transformed.columns)

xgb.plot_importance(
    xgb_pipe["xgb_clf"],
    importance_type="gain",
    xlabel="Average Gain",
    show_values=False,
)
xgb.plot_importance(
    xgb_pipe["xgb_clf"],
    importance_type="cover",
    xlabel="Average Coverage (# of samples impacted)",
    show_values=False,
);

In [None]:
# Side-by-side scores of the XGBoost pipeline with the added features and the original one.
print("XGBoost with added features:")
print(f"Training score: {xgb_pipe.score(X_train, y_train)}")
print(f"Test score: {xgb_pipe.score(X_test, y_test)}")
print()
print("Original XGBoost model:")
print(f"Training score: {pipe.score(X_train, y_train)}")
print(f"Test score: {pipe.score(X_test, y_test)}")
print()

In [None]:
# Expected score for white from the new model: P(white win) + 0.5 * P(draw).
y_pred_proba = xgb_pipe.predict_proba(X_test)
y_pred_win_proba = y_pred_proba[:, 2] + 0.5 * y_pred_proba[:, 1]

# Deep copy so the original model's predictions in `test_results_df` are left untouched.
xgb_test_results_df = test_results_df.copy(deep=True)
xgb_test_results_df["y_pred_win"] = y_pred_win_proba

In [None]:
# Three panels: the model with added features, the original model, and both overlaid.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
plot_losses(
    xgb_test_results_df,
    train_results_df,
    ax=axs[0],
    fmt="c",
    title="XGBoost with Added Features",
)
plot_losses(
    test_results_df,
    train_results_df,
    ax=axs[1],
    fmt="b",
    title="Original XGBoost Model",
)

# Each call draws two lines (model, then baseline), so the third panel holds four lines.
plot_losses(xgb_test_results_df, train_results_df, ax=axs[2], fmt="c")
plot_losses(test_results_df, train_results_df, ax=axs[2], fmt="b", title="Comparison")
# Three labels cover the first three lines; the repeated baseline line gets no entry.
axs[2].legend(
    ["XGB with added features", "Predicting mean of y_train", "Original XGB model"]
);

`elo_diff` helps the model a little at the beginning of the game, when there are no meaningful position-based evals to go off of. Overall, there's not much visible improvement, but we advertised simplicity, not accuracy, for the models we build in this tutorial ;)

## Visualizing win probability for some example games

In [None]:
# TODO Anjali: Find example games in test_data and don't convert train_data to df; create actual plotting functions

# Materialize both splits as pandas DataFrames for the example-game plots below.
test_data_df = test_data.to_pandas()
train_data_df = train_data.to_pandas()

In [None]:
# Predictions on the training data (the example game below is taken from `train_data_df`).
y_pred_proba = xgb_pipe.predict_proba(X_train)
# Expected score for white: P(white win) + 0.5 * P(draw).
y_pred_win_proba = y_pred_proba[:, 2] + 0.5 * y_pred_proba[:, 1]

# NOTE(review): attached positionally; assumes `train_data.to_pandas()` and the query
# executed for X_train return rows in the same order — confirm.
train_data_df["y_pred_win"] = y_pred_win_proba

In [None]:
# Hard-coded `game_id` of the example game to visualize.
game = "8220ahpc"

# Rows of that game, ordered by ply so the line plots read left to right.
game_data = train_data_df[train_data_df.game_id == game].sort_values(by="ply")
game_data

In [None]:
# One panel per column of the example game, each plotted against ply. Every panel gets a
# title and an x-axis label so the figure can be read on its own.
# The last panel shows `regular_eval` only; `mate_eval` is deliberately not added to it.
panels = {
    "y_pred_win": "Predicted win probability (y_pred_win)",
    "white_clock": "white_clock",
    "black_clock": "black_clock",
    "regular_eval": "regular_eval",
}
fig, axs = plt.subplots(1, len(panels), figsize=(15, 4))
for ax, (column, panel_title) in zip(axs, panels.items()):
    ax.plot(game_data.ply, game_data[column])
    ax.set_title(panel_title)
    ax.set_xlabel("Ply")
fig.tight_layout();