In [1]:
import ibis
from ibis import _

# Interactive mode: expressions are executed and rendered when a cell displays them.
ibis.options.interactive = True

In [2]:
# Every Parquet file matching this glob is read into a single table expression.
moves_glob = "/data/deepyaman/lichess-2024-07/moves/*.parquet"
moves = ibis.read_parquet(moves_glob)
moves

# Filter

In [3]:
%%time
# Number of distinct `site` values across all move rows, returned as a plain Python scalar.
moves.site.nunique().to_pyarrow().as_py()

CPU times: user 8min 17s, sys: 54.8 s, total: 9min 12s
Wall time: 7.07 s


89850657

In [4]:
%%time
import ibis.expr.types as ir


def parse_time_control(time_control: ir.StringColumn) -> dict[str, ir.Column]:
    """Derive base time, increment and a speed category from a time control.

    Parameters
    ----------
    time_control
        String expression. The text before the first ``+`` is read as the base
        time and the text after it as the increment; parts that cannot be cast
        to an integer become null.

    Returns
    -------
    dict[str, ir.Column]
        Expressions keyed by output column name: ``time_control_base_time``,
        ``time_control_increment`` and ``time_control_type``.

    Notes
    -----
    The category is the first match of: null or leading ``?`` -> ``UNKNOWN``;
    leading ``-`` -> ``UNLIMITED``; then ``base + 60 * increment`` below
    3, 15 or 60 times 60 -> ``BULLET``, ``BLITZ``, ``RAPID``; anything else
    -> ``STANDARD``. The thresholds presumably express minutes in seconds --
    TODO confirm the unit of the source column.
    """
    plus_position = time_control.find("+")
    base = time_control.substr(0, plus_position).try_cast(int)
    bonus = time_control.substr(plus_position + 1).try_cast(int)

    # One shared budget expression is compared against every threshold.
    budget = base + 60 * bonus
    is_unknown = time_control.isnull() | time_control.startswith("?")
    is_unlimited = time_control.startswith("-")

    builder = ibis.case().when(is_unknown, "UNKNOWN").when(is_unlimited, "UNLIMITED")
    # Ordered from fastest to slowest so the first matching branch wins.
    for limit_minutes, label in ((3, "BULLET"), (15, "BLITZ"), (60, "RAPID")):
        builder = builder.when(budget < limit_minutes * 60, label)
    category = builder.else_("STANDARD").end()

    return dict(
        time_control_base_time=base,
        time_control_increment=bonus,
        time_control_type=category,
    )


# Build the derived column expressions once, then attach them to every move row.
parsed_time_control_columns = parse_time_control(_.time_control)
moves_with_parsed_time_control = moves.mutate(**parsed_time_control_columns)
moves_with_parsed_time_control.time_control_type.value_counts().preview()

CPU times: user 1h 4min 5s, sys: 5.53 s, total: 1h 4min 10s
Wall time: 15.7 s


In [5]:
%%time
# Flag each move whose comment contains an "[%eval" annotation.
moves_flagged_for_eval = moves.mutate(has_eval=_.move_comment.contains("[%eval"))

# Per `site`: share of flagged moves and number of moves that are not flagged.
eval_stats_by_site = moves_flagged_for_eval.group_by(_.site).agg(
    percent_has_eval=_.has_eval.mean(),
    has_no_eval_count=_.count() - _.has_eval.sum(),
)

# A share of exactly zero is replaced by the sentinel -1 so the range filter
# below discards those groups; groups with 0 or 1 unflagged moves are kept.
eval_stats_with_sentinel = eval_stats_by_site.mutate(
    has_no_eval_count=ibis.ifelse(_.percent_has_eval == 0, -1, _.has_no_eval_count)
)
games_with_evals = eval_stats_with_sentinel[
    _.has_no_eval_count.between(0, 1)
].cache()
games_with_evals.count().to_pyarrow().as_py()

CPU times: user 24min 12s, sys: 1min 45s, total: 25min 58s
Wall time: 11.6 s


8133979

In [6]:
%%time
# Joining on the kept `site` keys restricts the parsed moves to those games.
sites_with_evals = games_with_evals[["site"]]
moves_with_evals = sites_with_evals.join(moves_with_parsed_time_control, "site")
moves_with_evals.site.nunique().to_pyarrow().as_py()

CPU times: user 6min 34s, sys: 4.35 s, total: 6min 39s
Wall time: 3.67 s


8133979

In [7]:
%%time
# Restrict to rows with move_ply == 1 before counting `termination` values,
# presumably so that each game contributes a single row -- TODO confirm.
moves_with_evals[_.move_ply == 1].termination.value_counts().preview()

CPU times: user 4min 37s, sys: 11 s, total: 4min 48s
Wall time: 4.33 s


In [8]:
%%time
# Count moves by whether their comment contains a "[%clk" annotation.
moves_with_evals.mutate(
    has_clock=_.move_comment.contains("[%clk")
).has_clock.value_counts().preview()

CPU times: user 14min 29s, sys: 27.5 s, total: 14min 56s
Wall time: 5.27 s


In [9]:
%%time
# Same "[%clk" flag as above, now counted per (time_control_type, has_clock) pair.
moves_with_evals.mutate(has_clock=_.move_comment.contains("[%clk")).group_by(
    ["time_control_type", "has_clock"]
).count().preview()

CPU times: user 1h 22min 57s, sys: 33 s, total: 1h 23min 30s
Wall time: 20.9 s


In [10]:
%%time
# Two independent row predicates, combined below with a logical AND.
not_unlimited = _.time_control_type != "UNLIMITED"
ended_normally_or_on_time = _.termination.isin(["Normal", "Time forfeit"])

filtered_moves = moves_with_evals[not_unlimited & ended_normally_or_on_time]
filtered_moves.site.nunique().to_pyarrow().as_py()

CPU times: user 1h 15min 7s, sys: 16.6 s, total: 1h 15min 23s
Wall time: 18.8 s


8123404

In [11]:
%%time
# Every remaining move is expected to carry a "[%clk" annotation.
# A null `move_comment` makes `contains` evaluate to null, and a null predicate
# silently drops the row from the filter. Mapping null to False counts such
# rows as "missing clock" instead of letting the check pass vacuously.
moves_without_clock = filtered_moves[
    ~_.move_comment.contains("[%clk").fill_null(False)
]
assert (
    not moves_without_clock.count().to_pyarrow().as_py()
), "found moves without a [%clk annotation"

CPU times: user 17min 26s, sys: 28.9 s, total: 17min 55s
Wall time: 5.07 s


# Target engineering

In [12]:
%%time
# Distribution of `result` values over the filtered move rows (one row per move, not per game).
filtered_moves.result.value_counts().preview()

CPU times: user 1h 16min 11s, sys: 5.22 s, total: 1h 16min 16s
Wall time: 18.5 s


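I'm not sure which of the two target computations below makes the most logical sense. They yield the same values for decisive games (`1-0`, `0-1`) and draws (`1/2-1/2`), so I suppose either works; they differ only for any other `result` value, which the first maps to 0.5 and the second leaves null.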

In [13]:
%%time
# XOR of the ply parity (move_ply % 2) with a 0/1 result code.
# The CASE has branches only for "1-0" and "0-1"; every other result yields
# null, which the trailing fill_null turns into 0.5.
target = (
    filtered_moves.move_ply % 2
    ^ filtered_moves.result.case().when("1-0", 0).when("0-1", 1).end()
).fill_null(0.5)
target.value_counts().preview()

CPU times: user 1h 20min 3s, sys: 9.81 s, total: 1h 20min 13s
Wall time: 19.6 s


In [14]:
%%time
# Absolute difference between the ply parity (move_ply % 2) and a numeric
# result code (0, 0.5 or 1). Results without a CASE branch stay null here,
# because there is no fill_null step in this variant.
target = (
    filtered_moves.move_ply % 2
    - filtered_moves.result.case()
    .when("1-0", 0)
    .when("1/2-1/2", 0.5)
    .when("0-1", 1)
    .end()
).abs()
target.value_counts().preview()

CPU times: user 1h 19min 23s, sys: 13.3 s, total: 1h 19min 37s
Wall time: 19.4 s


# Game-level features

In [15]:
# Accumulates named column expressions; later cells append to it and the final
# cell selects them all from `games`. Re-running an appending cell adds a duplicate.
game_level_features = []

In [16]:
%%time
# Keep the move_ply == 1 row of each game and drop the per-move columns,
# then cache the result so later cells do not recompute the filter chain.
games = filtered_moves[_.move_ply == 1].drop(["move_ply", "move_comment"]).cache()
games

CPU times: user 24min 15s, sys: 1min 21s, total: 25min 36s
Wall time: 25.1 s


In [17]:
%%time
# `site` must identify rows uniquely: as many distinct values as rows.
distinct_site_count = games.site.nunique().to_pyarrow().as_py()
games_row_count = games.count().to_pyarrow().as_py()
assert distinct_site_count == games_row_count

CPU times: user 2.26 s, sys: 1.01 s, total: 3.27 s
Wall time: 319 ms


## `event`-based features

In [18]:
%%time
# Count games by the first six characters of `event` (the length of "Rated ").
games.event[:6].value_counts()

CPU times: user 4.4 ms, sys: 0 ns, total: 4.4 ms
Wall time: 4.68 ms


In [19]:
%%time
# A game is flagged as rated when its `event` text starts with "Rated " (trailing space included).
is_rated = games.event.startswith("Rated ")
game_level_features.append(is_rated.name("is_rated"))
is_rated.value_counts().preview()

CPU times: user 926 ms, sys: 35.7 ms, total: 962 ms
Wall time: 43.1 ms


In the future, we may want to drop unrated games in the "Filter" section above.

In [20]:
%%time
# Most frequent `event` values first.
games.event.value_counts().order_by(ibis.desc("event_count")).preview()

CPU times: user 884 ms, sys: 57.8 ms, total: 942 ms
Wall time: 59.4 ms


In [21]:
%%time
# Substring match on the literal lowercase text "tournament" anywhere in `event`.
is_tournament = games.event.contains("tournament")
game_level_features.append(is_tournament.name("is_tournament"))
is_tournament.value_counts().preview()

CPU times: user 1.1 s, sys: 24.9 ms, total: 1.13 s
Wall time: 37.8 ms


In [22]:
%%time
# Strip the leading "Rated " from rated events, then keep the text up to the first space.
# NOTE(review): assumes every event still contains a space after stripping --
# confirm what `find` returns (and what `substr` then yields) when it does not.
event_with_rated_removed = is_rated.ifelse(games.event[len("Rated ") :], games.event)
lichess_time_control_type = event_with_rated_removed.substr(0, event_with_rated_removed.find(" "))
game_level_features.append(lichess_time_control_type.name("lichess_time_control_type"))
lichess_time_control_type.value_counts().preview()

CPU times: user 5.13 s, sys: 67.6 ms, total: 5.2 s
Wall time: 141 ms


## User features

In [23]:
%%time
# Lowercased `white` value, used below as the grouping key for White's per-player windows.
white_user_id = games.white.lower()
game_level_features.append(white_user_id.name("white_user_id"))
white_user_id.preview()

CPU times: user 49.1 ms, sys: 8.41 ms, total: 57.5 ms
Wall time: 10.5 ms


In [24]:
%%time
# Lowercased `black` value, used below as the grouping key for Black's per-player windows.
black_user_id = games.black.lower()
game_level_features.append(black_user_id.name("black_user_id"))
black_user_id.preview()

CPU times: user 46.4 ms, sys: 173 μs, total: 46.6 ms
Wall time: 11.6 ms


## Temporal features

In [25]:
%%time
# Neither date column may contain nulls: the sum of the null flags must be zero.
assert not games.date.isnull().sum().to_pyarrow().as_py()
assert not games.utc_date.isnull().sum().to_pyarrow().as_py()

CPU times: user 658 ms, sys: 2.32 ms, total: 660 ms
Wall time: 28 ms


In [26]:
%%time
# How often `date` and `utc_date` agree; if they always match, one of them is redundant.
(games.date == games.utc_date).value_counts().preview()

CPU times: user 711 ms, sys: 11.9 ms, total: 723 ms
Wall time: 27.8 ms


In [27]:
# %%time
# utc_date = games.utc_date.to_date("%Y.%m.%d")
# game_level_features.append(utc_date.name("utc_date"))
# utc_date.value_counts().preview()

In [28]:
# %%time
# utc_time = games.utc_time.cast("time")
# game_level_features.append(utc_time.name("utc_time"))
# utc_time.value_counts().preview()

In [29]:
%%time
# Concatenate the date and time strings, then parse the combined text in one step.
utc_datetime_text = games.utc_date + " " + games.utc_time
utc_timestamp = utc_datetime_text.to_timestamp("%Y.%m.%d %H:%M:%S")
game_level_features.append(utc_timestamp.name("utc_timestamp"))
utc_timestamp.preview()

CPU times: user 29.9 ms, sys: 54.4 ms, total: 84.3 ms
Wall time: 16.3 ms


It would be ideal if we could also compute the time since each player's previous game finished; maybe this can be derived from the clock times down the road.

In [30]:
%%time
# Rows sharing the same white_user_id, ordered by timestamp; rows where that
# user appears only as Black are not part of this partition.
white_history_window = ibis.window(group_by=white_user_id, order_by=utc_timestamp)
white_previous_timestamp = utc_timestamp.lag().over(white_history_window)

# The first row of each partition has no predecessor, so its lagged value is null.
white_seconds_since_previous_game = utc_timestamp.delta(
    white_previous_timestamp, "second"
)
game_level_features.append(
    white_seconds_since_previous_game.name("white_seconds_since_previous_game")
)
white_seconds_since_previous_game.preview()

CPU times: user 59.9 s, sys: 1min 21s, total: 2min 20s
Wall time: 2.97 s


In [31]:
%%time
# Rows sharing the same black_user_id, ordered by timestamp; rows where that
# user appears only as White are not part of this partition.
black_history_window = ibis.window(group_by=black_user_id, order_by=utc_timestamp)
black_previous_timestamp = utc_timestamp.lag().over(black_history_window)

# The first row of each partition has no predecessor, so its lagged value is null.
black_seconds_since_previous_game = utc_timestamp.delta(
    black_previous_timestamp, "second"
)
game_level_features.append(
    black_seconds_since_previous_game.name("black_seconds_since_previous_game")
)
black_seconds_since_previous_game.preview()

CPU times: user 1min 9s, sys: 1min 23s, total: 2min 32s
Wall time: 2.1 s


In [32]:
%%time
# Rolling count over the rows that share this white_user_id, within the hour
# up to and including the current row's timestamp (the upper bound 0 keeps the current row).
white_games_in_last_hour = games.count().over(
    range=(-ibis.interval(hours=1), 0), group_by=white_user_id, order_by=utc_timestamp
)
game_level_features.append(white_games_in_last_hour.name("white_games_in_last_hour"))
white_games_in_last_hour.preview()

CPU times: user 1min 2s, sys: 1min 7s, total: 2min 10s
Wall time: 2.02 s


In [33]:
%%time
# Rolling count over the rows that share this black_user_id, within the hour
# up to and including the current row's timestamp (the upper bound 0 keeps the current row).
black_games_in_last_hour = games.count().over(
    range=(-ibis.interval(hours=1), 0), group_by=black_user_id, order_by=utc_timestamp
)
game_level_features.append(black_games_in_last_hour.name("black_games_in_last_hour"))
black_games_in_last_hour.preview()

CPU times: user 1min, sys: 59.3 s, total: 2min
Wall time: 1.95 s


In [34]:
%%time
# Rolling count over the rows that share this white_user_id, within the day
# up to and including the current row's timestamp (the upper bound 0 keeps the current row).
white_games_in_last_day = games.count().over(
    range=(-ibis.interval(days=1), 0), group_by=white_user_id, order_by=utc_timestamp
)
game_level_features.append(white_games_in_last_day.name("white_games_in_last_day"))
white_games_in_last_day.preview()

CPU times: user 57.6 s, sys: 1min, total: 1min 57s
Wall time: 1.85 s


In [35]:
%%time
# Rolling count over the rows that share this black_user_id, within the day
# up to and including the current row's timestamp (the upper bound 0 keeps the current row).
black_games_in_last_day = games.count().over(
    range=(-ibis.interval(days=1), 0), group_by=black_user_id, order_by=utc_timestamp
)
game_level_features.append(black_games_in_last_day.name("black_games_in_last_day"))
black_games_in_last_day.preview()

CPU times: user 53.9 s, sys: 57.4 s, total: 1min 51s
Wall time: 1.82 s


In [36]:
%%time
# Rolling count over the rows that share this white_user_id, within the seven days
# up to and including the current row's timestamp (the upper bound 0 keeps the current row).
white_games_in_last_week = games.count().over(
    range=(-ibis.interval(days=7), 0), group_by=white_user_id, order_by=utc_timestamp
)
game_level_features.append(white_games_in_last_week.name("white_games_in_last_week"))
white_games_in_last_week.preview()

CPU times: user 1min 4s, sys: 1min 26s, total: 2min 30s
Wall time: 1.89 s


In [37]:
%%time
# Rolling count over the rows that share this black_user_id, within the seven days
# up to and including the current row's timestamp (the upper bound 0 keeps the current row).
black_games_in_last_week = games.count().over(
    range=(-ibis.interval(days=7), 0), group_by=black_user_id, order_by=utc_timestamp
)
game_level_features.append(black_games_in_last_week.name("black_games_in_last_week"))
black_games_in_last_week.preview()

CPU times: user 54.4 s, sys: 1min 6s, total: 2min 1s
Wall time: 1.79 s


## Elo features

In [38]:
%%time
# Strict cast to integer (unlike the try_cast used for time controls);
# presumably the raw column is text -- TODO confirm against the table schema.
white_elo = games.white_elo.cast(int)
game_level_features.append(white_elo.name("white_elo"))
white_elo.preview()

CPU times: user 14.6 ms, sys: 41.7 ms, total: 56.4 ms
Wall time: 11.6 ms


In [39]:
%%time
# Strict cast to integer (unlike the try_cast used for time controls);
# presumably the raw column is text -- TODO confirm against the table schema.
black_elo = games.black_elo.cast(int)
game_level_features.append(black_elo.name("black_elo"))
black_elo.preview()

CPU times: user 42.7 ms, sys: 187 μs, total: 42.9 ms
Wall time: 10.5 ms


In [40]:
%time
elo_diff = white_elo - black_elo
game_level_features.append(elo_diff.name("elo_diff"))
elo_diff.preview()

CPU times: user 29 μs, sys: 0 ns, total: 29 μs
Wall time: 54.6 μs


In [41]:
%%time
# Previous row of the same white_user_id within the same lichess_time_control_type,
# ordered by timestamp; rows where that user appears as Black are not in the partition.
white_rating_window = ibis.window(
    group_by=[white_user_id, lichess_time_control_type], order_by=utc_timestamp
)
white_previous_elo = white_elo.lag().over(white_rating_window)

# Null for the first row of each partition, which has no previous rating.
white_elo_gained_since_previous_game = white_elo - white_previous_elo
game_level_features.append(
    white_elo_gained_since_previous_game.name("white_elo_gained_since_previous_game")
)
white_elo_gained_since_previous_game.preview()

CPU times: user 1min 14s, sys: 1min 54s, total: 3min 8s
Wall time: 2.6 s


In [42]:
black_elo_gained_since_previous_game = black_elo - black_elo.lag().over(
    ibis.window(
        group_by=[black_user_id, lichess_time_control_type], order_by=utc_timestamp
    )
)
game_level_features.append(
    black_elo_gained_since_previous_game.name("black_elo_gained_since_previous_game")
)
black_elo_gained_since_previous_game.preview()

It would be interesting to compute the rating deviation (RD) to gauge the rating reliability/volatility, too.

## Title features

In [43]:
%%time
# The raw `white_title` column is passed through unchanged as a feature.
white_title = games.white_title
game_level_features.append(white_title.name("white_title"))
white_title.value_counts().preview()

CPU times: user 612 ms, sys: 86.9 ms, total: 699 ms
Wall time: 32.7 ms


In [44]:
%%time
# The raw `black_title` column is passed through unchanged as a feature.
black_title = games.black_title
game_level_features.append(black_title.name("black_title"))
black_title.value_counts().preview()

CPU times: user 555 ms, sys: 18.2 ms, total: 573 ms
Wall time: 27.1 ms


In [45]:
%%time
# Final game-level table: the `site` key plus every feature expression collected above.
games.select("site", *game_level_features)

CPU times: user 20.7 ms, sys: 22 μs, total: 20.7 ms
Wall time: 19.9 ms
