In [1]:
import ibis
import ibis_ml as ml
import xgboost as xgb
from ibis import _
from sklearn.pipeline import Pipeline

# Interactive mode: Ibis executes expressions and shows their results when they
# are displayed in a cell, instead of showing the unevaluated expression.
ibis.options.interactive = True

Let's pick up where we left off by reloading our model input table.

In [2]:
# Load the model input table from a Parquet file; the path is relative to the
# current working directory.
model_input_table = ibis.read_parquet("model_input_table.parquet")
# Last expression in the cell, so the table is shown as the cell output.
model_input_table

# Data splitting

To get started, let's split this single dataset into two: a _training_ set and a _testing_ set. We'll put most of the rows of the original dataset (a randomly chosen subset) in the _training_ set and hold out the remaining rows as the _testing_ set. The _training_ set will be used to _fit_ the model, and the _testing_ set will be used to measure model performance.

Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. To ensure that moves corresponding to a particular game aren't split across the _training_ and _testing_ sets, we'll only split by `game_id` (instead of splitting by `game_id` and `ply`).

In [3]:
# Create Ibis tables for the two sets, keyed on game so that all rows of a
# game end up in the same set:
train_data, test_data = ml.train_test_split(
    model_input_table,
    unique_key="game_id",
    # Put 3/4 of the data into the training set
    test_size=0.25,
    # NOTE(review): presumably keys are hashed into this many buckets, which are
    # then divided between the two sets; 4 is the smallest count that can
    # express a 3:1 split -- confirm against the ibis-ml documentation
    num_buckets=4,
    # Fix the random numbers by setting the seed
    # This enables the analysis to be reproducible when random numbers are used
    random_seed=222,
)



# Create features

In [4]:
# Magnitude used for the mate-based and target-based scores below.
MATE_SCORE = 100_000

# First choice: the `cp` column scaled by 100 and rounded.
cp_based_score = (_.cp * 100).round()

# Second choice: derived from `mate`. The sign selects +/-MATE_SCORE, and the
# second term pulls the score back toward zero as |mate| grows
# (e.g. mate = 3 gives 100_000 - 3_000 = 97_000).
mate_based_score = (
    _.mate.sign() * MATE_SCORE - (_.mate.try_cast(int) * MATE_SCORE) // 100
)

# Last resort: map `target` directly
# (1.0 -> +MATE_SCORE, 0.0 -> -MATE_SCORE, 0.5 -> 0).
# Note that this alternative reads the label column.
target_based_score = (
    _.target.case().when(1.0, MATE_SCORE).when(0.0, -MATE_SCORE).when(0.5, 0).end()
)

# coalesce picks, per row, the first alternative that is not null.
calculate_score_step = ml.Mutate(
    score=ibis.coalesce(cp_based_score, mate_based_score, target_based_score)
)

In [5]:
# Try the step on its own: fit it on the training data, passing a fresh
# Metadata object ...
calculate_score_step.fit_table(train_data, ml.core.Metadata())
# ... then apply it. As the last expression in the cell, the transformed table
# is shown as the cell output.
calculate_score_step.transform_table(train_data)

In [None]:
# Two thirds of the increment; the same term is added to each time column below.
# NOTE(review): the rationale for the 2/3 weight is not stated in this notebook
# -- confirm and document it.
weighted_increment = _.increment * 2 / 3

# Derive the three `*etime` columns from the base time and the two clocks.
add_etime_columns = ml.Mutate(
    etime=_.base_time + weighted_increment,
    white_etime=_.white_clock + weighted_increment,
    black_etime=_.black_clock + weighted_increment,
)

# Preprocessing recipe; steps run in the order listed.
lichess_rec = ml.Recipe(
    # calculate_score_step,  # TODO(deepyaman): Compute score without target, or calculate in feature engineering notebook
    add_etime_columns,
    # Drop the columns selected by ml.nominal(), then cast every remaining
    # column to float64.
    ml.Drop(ml.nominal()),
    ml.Cast(ml.everything(), "float64"),
)

# Fit a model with a recipe

In [7]:
# Chain the preprocessing recipe and an XGBoost classifier (default settings)
# into a single scikit-learn pipeline, so that fitting and scoring always run
# the recipe before the model.
pipe = Pipeline([("lichess_rec", lichess_rec), ("clf", xgb.XGBClassifier())])

In [8]:
# Features: every column of the training set except the label.
X_train = train_data.drop("target")
# NOTE(review): target appears to take the values 0.0, 0.5 and 1.0 (those are
# the values matched when computing the score); doubling would map them to
# 0, 1 and 2, presumably to serve as class labels for the classifier -- confirm.
y_train = train_data.target * 2
# Fit the recipe and the classifier together on the training data.
pipe.fit(X_train, y_train)

# Use a trained workflow to predict

In [9]:
# Build the test features and labels the same way as for training, including
# the doubling of the label.
X_test = test_data.drop("target")
y_test = test_data.target * 2
# Pipeline.score applies the fitted transforms and then calls the final
# estimator's score, which for a scikit-learn classifier is mean accuracy.
pipe.score(X_test, y_test)

0.674447454333161