In [1]:
import ibis
from ibis import _

ibis.options.interactive = True

In [2]:
moves = ibis.read_parquet("/data/deepyaman/lichess-2024-07/moves/*.parquet")
moves

# Filter

In [3]:
%%time
moves.site.nunique().to_pyarrow().as_py()

CPU times: user 8min 16s, sys: 52.6 s, total: 9min 9s
Wall time: 6.93 s


89850657

In [4]:
%%time
import ibis.expr.types as ir


def parse_time_control(time_control: ir.StringColumn) -> dict[str, ir.Column]:
    index = time_control.find("+")
    base_time = time_control.substr(0, index).try_cast(int)
    increment = time_control.substr(index + 1).try_cast(int)

    time_control_type = (
        ibis.case()
        .when(time_control.isnull() | time_control.startswith("?"), "UNKNOWN")
        .when(time_control.startswith("-"), "UNLIMITED")
        .when(base_time + 60 * increment < 3 * 60, "BULLET")
        .when(base_time + 60 * increment < 15 * 60, "BLITZ")
        .when(base_time + 60 * increment < 60 * 60, "RAPID")
        .else_("STANDARD")
        .end()
    )

    return {
        "time_control_base_time": base_time,
        "time_control_increment": increment,
        "time_control_type": time_control_type,
    }


moves_with_parsed_time_control = moves.mutate(**parse_time_control(_.time_control))
moves_with_parsed_time_control.time_control_type.value_counts().preview()

CPU times: user 1h 5min 41s, sys: 5.67 s, total: 1h 5min 46s
Wall time: 16 s


In [5]:
%%time
games_with_evals = (
    moves.mutate(has_eval=_.move_comment.contains("[%eval"))
    .group_by(_.site)
    .agg(
        percent_has_eval=_.has_eval.mean(),
        has_no_eval_count=_.count() - _.has_eval.sum(),
    )
    .mutate(
        has_no_eval_count=ibis.ifelse(_.percent_has_eval == 0, -1, _.has_no_eval_count)
    )[_.has_no_eval_count.between(0, 1)]
    .cache()
)
games_with_evals.count().to_pyarrow().as_py()

CPU times: user 23min 39s, sys: 1min 50s, total: 25min 29s
Wall time: 12 s


8133979

In [6]:
%%time
moves_with_evals = games_with_evals[["site"]].join(
    moves_with_parsed_time_control, "site"
)
moves_with_evals.site.nunique().to_pyarrow().as_py()

CPU times: user 6min 31s, sys: 4.62 s, total: 6min 36s
Wall time: 3.85 s


8133979

In [7]:
%%time
moves_with_evals[_.move_ply == 1].termination.value_counts().preview()

CPU times: user 4min 33s, sys: 6.7 s, total: 4min 40s
Wall time: 3.88 s


In [8]:
%%time
moves_with_evals.mutate(
    has_clock=_.move_comment.contains("[%clk")
).has_clock.value_counts().preview()

CPU times: user 15min 20s, sys: 30.7 s, total: 15min 51s
Wall time: 5.43 s


In [9]:
%%time
moves_with_evals.mutate(has_clock=_.move_comment.contains("[%clk")).group_by(
    ["time_control_type", "has_clock"]
).count().preview()

CPU times: user 1h 22min 35s, sys: 41.4 s, total: 1h 23min 17s
Wall time: 20.9 s


In [10]:
%%time
filtered_moves = moves_with_evals[
    (_.time_control_type != "UNLIMITED")
    & (_.termination.isin(["Normal", "Time forfeit"]))
]
filtered_moves.site.nunique().to_pyarrow().as_py()

CPU times: user 1h 15min 11s, sys: 16.3 s, total: 1h 15min 27s
Wall time: 18.8 s


8123404

In [11]:
%%time
assert (
    not filtered_moves[~_.move_comment.contains("[%clk")].count().to_pyarrow().as_py()
)

CPU times: user 17min 13s, sys: 21.6 s, total: 17min 34s
Wall time: 4.97 s


# Target engineering

In [12]:
%%time
filtered_moves.result.value_counts().preview()

CPU times: user 1h 16min 36s, sys: 7.79 s, total: 1h 16min 44s
Wall time: 18.9 s


I'm not sure which computation makes the most logical sense; I suppose either works.

In [13]:
%%time
target = (
    filtered_moves.move_ply % 2
    ^ filtered_moves.result.case().when("1-0", 0).when("0-1", 1).end()
).fill_null(0.5)
target.value_counts().preview()

CPU times: user 1h 20min 9s, sys: 13.2 s, total: 1h 20min 22s
Wall time: 19.8 s


In [14]:
%%time
target = (
    filtered_moves.move_ply % 2
    - filtered_moves.result.case()
    .when("1-0", 0)
    .when("1/2-1/2", 0.5)
    .when("0-1", 1)
    .end()
).abs()
target.value_counts().preview()

CPU times: user 1h 19min 15s, sys: 7.63 s, total: 1h 19min 22s
Wall time: 19.5 s


In [15]:
%%time
filtered_moves.select("site", "move_ply", target).to_parquet(
    "../data/04_feature/target.parquet"
)

CPU times: user 1h 22min 36s, sys: 30.7 s, total: 1h 23min 7s
Wall time: 21.2 s


# Game-level features

In [16]:
game_level_features = []

In [17]:
%%time
games = filtered_moves[_.move_ply == 1].drop(["move_ply", "move_comment"]).cache()
games

CPU times: user 23min 54s, sys: 53.5 s, total: 24min 47s
Wall time: 24.2 s


In [18]:
%%time
assert games.site.nunique().to_pyarrow().as_py() == games.count().to_pyarrow().as_py()

CPU times: user 2.36 s, sys: 618 ms, total: 2.97 s
Wall time: 291 ms


## `event`-based features

In [19]:
%%time
games.event[:6].value_counts()

CPU times: user 4.37 ms, sys: 0 ns, total: 4.37 ms
Wall time: 4.34 ms


In [20]:
%%time
is_rated = games.event.startswith("Rated ")
game_level_features.append(is_rated.name("is_rated"))
is_rated.value_counts().preview()

CPU times: user 884 ms, sys: 40.1 ms, total: 924 ms
Wall time: 32.4 ms


In the future, we may want to drop unrated games in the "Filter" section above.

In [21]:
%%time
games.event.value_counts().order_by(ibis.desc("event_count")).preview()

CPU times: user 723 ms, sys: 35 ms, total: 758 ms
Wall time: 39 ms


In [22]:
%%time
is_tournament = games.event.contains("tournament")
game_level_features.append(is_tournament.name("is_tournament"))
is_tournament.value_counts().preview()

CPU times: user 841 ms, sys: 3.19 ms, total: 844 ms
Wall time: 31.6 ms


In [23]:
%%time
event_with_rated_removed = is_rated.ifelse(games.event[len("Rated ") :], games.event)
lichess_time_control_type = event_with_rated_removed.substr(
    0, event_with_rated_removed.find(" ")
)
game_level_features.append(lichess_time_control_type.name("lichess_time_control_type"))
lichess_time_control_type.value_counts().preview()

CPU times: user 5.04 s, sys: 68.3 ms, total: 5.11 s
Wall time: 128 ms


# User features

In [24]:
%%time
white_user_id = games.white.lower()
game_level_features.append(white_user_id.name("white_user_id"))
white_user_id.preview()

CPU times: user 30.9 ms, sys: 11.8 ms, total: 42.7 ms
Wall time: 8.5 ms


In [25]:
%%time
black_user_id = games.black.lower()
game_level_features.append(black_user_id.name("black_user_id"))
black_user_id.preview()

CPU times: user 0 ns, sys: 62 ms, total: 62 ms
Wall time: 10.6 ms


## Temporal features

In [26]:
%%time
assert not games.date.isnull().sum().to_pyarrow().as_py()
assert not games.utc_date.isnull().sum().to_pyarrow().as_py()

CPU times: user 581 ms, sys: 120 ms, total: 702 ms
Wall time: 27.9 ms


In [27]:
%%time
(games.date == games.utc_date).value_counts().preview()

CPU times: user 765 ms, sys: 12.5 ms, total: 777 ms
Wall time: 31.3 ms


In [28]:
# %%time
# utc_date = games.utc_date.to_date("%Y.%m.%d")
# game_level_features.append(utc_date.name("utc_date"))
# utc_date.value_counts().preview()

In [29]:
# %%time
# utc_time = games.utc_time.cast("time")
# game_level_features.append(utc_time.name("utc_time"))
# utc_time.value_counts().preview()

In [30]:
%%time
utc_timestamp = (games.utc_date + " " + games.utc_time).to_timestamp(
    "%Y.%m.%d %H:%M:%S"
)
game_level_features.append(utc_timestamp.name("utc_timestamp"))
utc_timestamp.preview()

CPU times: user 68.9 ms, sys: 4.49 ms, total: 73.4 ms
Wall time: 14.9 ms


It would be ideal if we could also compute the time since their last game finished; maybe this can be computed from clock times down the road.

In [31]:
%%time
white_seconds_since_previous_game = utc_timestamp.delta(
    utc_timestamp.lag().over(
        ibis.window(group_by=white_user_id, order_by=utc_timestamp)
    ),
    "second",
)
game_level_features.append(
    white_seconds_since_previous_game.name("white_seconds_since_previous_game")
)
white_seconds_since_previous_game.preview()

CPU times: user 1min 5s, sys: 1min 30s, total: 2min 35s
Wall time: 2.95 s


In [32]:
%%time
black_seconds_since_previous_game = utc_timestamp.delta(
    utc_timestamp.lag().over(
        ibis.window(group_by=black_user_id, order_by=utc_timestamp)
    ),
    "second",
)
game_level_features.append(
    black_seconds_since_previous_game.name("black_seconds_since_previous_game")
)
black_seconds_since_previous_game.preview()

CPU times: user 1min 7s, sys: 1min 11s, total: 2min 19s
Wall time: 2.34 s


In [33]:
%%time
white_games_in_last_hour = games.count().over(
    range=(-ibis.interval(hours=1), 0), group_by=white_user_id, order_by=utc_timestamp
)
game_level_features.append(white_games_in_last_hour.name("white_games_in_last_hour"))
white_games_in_last_hour.preview()

CPU times: user 1min 3s, sys: 1min 24s, total: 2min 27s
Wall time: 2.12 s


In [34]:
%%time
black_games_in_last_hour = games.count().over(
    range=(-ibis.interval(hours=1), 0), group_by=black_user_id, order_by=utc_timestamp
)
game_level_features.append(black_games_in_last_hour.name("black_games_in_last_hour"))
black_games_in_last_hour.preview()

CPU times: user 58.1 s, sys: 1min 4s, total: 2min 2s
Wall time: 1.98 s


In [35]:
%%time
white_games_in_last_day = games.count().over(
    range=(-ibis.interval(days=1), 0), group_by=white_user_id, order_by=utc_timestamp
)
game_level_features.append(white_games_in_last_day.name("white_games_in_last_day"))
white_games_in_last_day.preview()

CPU times: user 1min 1s, sys: 1min 11s, total: 2min 13s
Wall time: 1.89 s


In [36]:
%%time
black_games_in_last_day = games.count().over(
    range=(-ibis.interval(days=1), 0), group_by=black_user_id, order_by=utc_timestamp
)
game_level_features.append(black_games_in_last_day.name("black_games_in_last_day"))
black_games_in_last_day.preview()

CPU times: user 59.3 s, sys: 1min 7s, total: 2min 7s
Wall time: 1.99 s


In [37]:
%%time
white_games_in_last_week = games.count().over(
    range=(-ibis.interval(days=7), 0), group_by=white_user_id, order_by=utc_timestamp
)
game_level_features.append(white_games_in_last_week.name("white_games_in_last_week"))
white_games_in_last_week.preview()

CPU times: user 1min 3s, sys: 1min 15s, total: 2min 18s
Wall time: 1.97 s


In [38]:
%%time
black_games_in_last_week = games.count().over(
    range=(-ibis.interval(days=7), 0), group_by=black_user_id, order_by=utc_timestamp
)
game_level_features.append(black_games_in_last_week.name("black_games_in_last_week"))
black_games_in_last_week.preview()

CPU times: user 59 s, sys: 1min 11s, total: 2min 10s
Wall time: 1.83 s


## Elo features

In [39]:
%%time
white_elo = games.white_elo.cast(int)
game_level_features.append(white_elo.name("white_elo"))
white_elo.preview()

CPU times: user 56.5 ms, sys: 20.7 ms, total: 77.2 ms
Wall time: 20.7 ms


In [40]:
%%time
black_elo = games.black_elo.cast(int)
game_level_features.append(black_elo.name("black_elo"))
black_elo.preview()

CPU times: user 46.2 ms, sys: 881 μs, total: 47 ms
Wall time: 10.6 ms


In [41]:
%time
elo_diff = white_elo - black_elo
game_level_features.append(elo_diff.name("elo_diff"))
elo_diff.preview()

CPU times: user 28 μs, sys: 1e+03 ns, total: 29 μs
Wall time: 52.7 μs


In [42]:
%%time
white_elo_gained_since_previous_game = white_elo - white_elo.lag().over(
    ibis.window(
        group_by=[white_user_id, lichess_time_control_type], order_by=utc_timestamp
    )
)
game_level_features.append(
    white_elo_gained_since_previous_game.name("white_elo_gained_since_previous_game")
)
white_elo_gained_since_previous_game.preview()

CPU times: user 1min, sys: 1min 35s, total: 2min 36s
Wall time: 2.55 s


In [43]:
black_elo_gained_since_previous_game = black_elo - black_elo.lag().over(
    ibis.window(
        group_by=[black_user_id, lichess_time_control_type], order_by=utc_timestamp
    )
)
game_level_features.append(
    black_elo_gained_since_previous_game.name("black_elo_gained_since_previous_game")
)
black_elo_gained_since_previous_game.preview()

It would be interesting to compute RD to gauge the rating reliability/volatility, too.

## Title features

In [44]:
%%time
white_title = games.white_title
game_level_features.append(white_title.name("white_title"))
white_title.value_counts().preview()

CPU times: user 435 ms, sys: 315 ms, total: 750 ms
Wall time: 44.4 ms


In [45]:
%%time
black_title = games.black_title
game_level_features.append(black_title.name("black_title"))
black_title.value_counts().preview()

CPU times: user 478 ms, sys: 213 ms, total: 691 ms
Wall time: 43.1 ms


In [46]:
%%time
games.select("site", *game_level_features).preview()

CPU times: user 5min 59s, sys: 8min 2s, total: 14min 1s
Wall time: 15.7 s


In [47]:
%%time
games.select("site", *game_level_features).to_parquet(
    "../data/04_feature/game_level_features.parquet"
)

CPU times: user 6min 47s, sys: 8min 36s, total: 15min 24s
Wall time: 16.7 s


# Move-level features

## `clock`-based features

In [48]:
clock_based_features = []

In [49]:
%%time
from chess.pgn import CLOCK_REGEX

moves_with_clock = (
    filtered_moves.alias("timed_games").sql(
        f"""
        SELECT
          site,
          move_ply,
          move_comment,
          REGEXP_EXTRACT(
            move_comment,
            '{CLOCK_REGEX.pattern}',
            ['prefix', 'hours', 'minutes', 'seconds', 'suffix']
          ) AS clock
        FROM timed_games
        """
    )
).unpack("clock")
moves_with_clock.site.nunique().to_pyarrow().as_py()

CPU times: user 1h 14min 49s, sys: 6.66 s, total: 1h 14min 56s
Wall time: 19 s


8123404

In [50]:
%%time
clock = (
    moves_with_clock.hours.cast(int) * 3600
    + moves_with_clock.minutes.cast(int) * 60
    + moves_with_clock.seconds.cast(float)
)
clock_based_features.append(clock.name("clock"))
clock.preview()

CPU times: user 7.47 s, sys: 6.86 s, total: 14.3 s
Wall time: 746 ms


In [51]:
%%time
moves_with_clock.select("site", "move_ply", *clock_based_features)

CPU times: user 4.23 ms, sys: 11 μs, total: 4.24 ms
Wall time: 4.21 ms


In [52]:
%%time
moves_with_clock.select("site", "move_ply", *clock_based_features).to_parquet(
    "../data/04_feature/clock_based_features.parquet"
)

CPU times: user 1h 43min 42s, sys: 36.6 s, total: 1h 44min 18s
Wall time: 26.8 s


## `eval`-based features

In [53]:
eval_based_features = []

In [54]:
%%time
import string

from chess.pgn import EVAL_REGEX

moves_with_parsed_eval = (
    filtered_moves.alias("moves_with_evals")
    .sql(
        f"""
        SELECT
          *,
          REGEXP_EXTRACT(
            move_comment,
            '{EVAL_REGEX.pattern.translate(str.maketrans("", "", string.whitespace))}',
            ['prefix', 'mate', 'cp', 'depth', 'suffix']
          ) AS eval
        FROM moves_with_evals
        """
    )
    .unpack("eval")
)
moves_with_parsed_eval.site.nunique().to_pyarrow().as_py()

CPU times: user 1h 15min 3s, sys: 5.64 s, total: 1h 15min 8s
Wall time: 18.9 s


8123404

In [55]:
%%time
MATE_SCORE = 100000

score = ibis.coalesce(
    (moves_with_parsed_eval.cp.try_cast(float) * 100).round(),
    moves_with_parsed_eval.mate.try_cast(int).sign() * MATE_SCORE
    - moves_with_parsed_eval.mate.try_cast(int),
    moves_with_parsed_eval.result.case()
    .when("1-0", MATE_SCORE)
    .when("0-1", -MATE_SCORE)
    .when("1/2-1/2", 0)
    .end(),
)
eval_based_features.append(score.name("score"))
score.preview()

CPU times: user 8.15 s, sys: 5.8 s, total: 13.9 s
Wall time: 587 ms


In [56]:
%%time
CEILING = 1000
INITIAL = 15

cpl = ibis.greatest(
    (
        score.clip(-CEILING, CEILING)
        - score.clip(-CEILING, CEILING)
        .lag(default=INITIAL)
        .over(ibis.window(group_by="site", order_by="move_ply"))
    )
    * ibis.ifelse(moves_with_parsed_eval.move_ply % 2 != 0, -1, 1),
    0,
)
eval_based_features.append(cpl.name("cpl"))

CPU times: user 10.4 ms, sys: 94 μs, total: 10.4 ms
Wall time: 10.1 ms


In [57]:
%%time
moves_with_parsed_eval.count().to_pyarrow().as_py()

CPU times: user 1h 14min 15s, sys: 4.42 s, total: 1h 14min 20s
Wall time: 18.3 s


534115132

In [58]:
%%time
from alive_progress import alive_bar

NUM_PARTITIONS = 200

with alive_bar(NUM_PARTITIONS) as bar:
    for i in range(NUM_PARTITIONS):
        moves_with_parsed_eval.filter(_.site.hash().abs() % NUM_PARTITIONS == i).select(
            "site", "move_ply", *eval_based_features
        ).to_parquet(f"../data/04_feature/eval_based_features/{i}.parquet")
        bar()

|████████████████████████████████████████| 200/200 [100%] in 1:21:35.1 (0.04/s) 
CPU times: user 12d 50min 58s, sys: 5h 13min 51s, total: 12d 6h 4min 50s
Wall time: 1h 21min 35s


In [59]:
%%time
assert (
    ibis.read_parquet("../data/04_feature/eval_based_features/*.parquet")
    .count()
    .to_pyarrow()
    .as_py()
    == moves_with_parsed_eval.count().to_pyarrow().as_py()
)

CPU times: user 1h 13min 42s, sys: 9.18 s, total: 1h 13min 51s
Wall time: 20.7 s
