In [1]:
import ibis
from ibis import _

# Turn on ibis interactive mode: per the ibis documentation, expressions are
# then executed and rendered when displayed instead of showing only the
# unevaluated expression.
ibis.options.interactive = True

In [2]:
# Load every Parquet file matching the glob into a single ibis table.
# NOTE(review): the path is absolute and machine-specific; re-running this
# notebook elsewhere requires pointing it at a local copy of the data.
moves = ibis.read_parquet("/data/deepyaman/lichess-2024-07/moves/*.parquet")
# Bare last expression so the notebook renders the table.
moves

# Filter

In [3]:
%%time
# Count the distinct `site` values in the full moves table and materialize the
# result as a plain Python value.
# NOTE(review): presumably `site` identifies a game, making this the number of
# games in the dataset — confirm against the table schema.
moves.site.nunique().to_pyarrow().as_py()

CPU times: user 8min 8s, sys: 1min, total: 9min 8s
Wall time: 7 s


89850657

In [4]:
%%time
import ibis.expr.types as ir


def parse_time_control(time_control: ir.StringColumn) -> dict[str, ir.Column]:
    """Split a ``<base>+<increment>`` time-control string and classify it.

    The text before the first ``"+"`` is parsed as the base time and the text
    after it as the increment; both use ``try_cast(int)``, so text that is not
    an integer yields NULL rather than raising.

    Args:
        time_control: String column (or deferred column reference) holding the
            raw time-control text.

    Returns:
        A mapping of new column name to expression, suitable for splatting
        into ``Table.mutate``:

        * ``"time_control_base_time"``: integer parsed from before the ``"+"``.
        * ``"time_control_increment"``: integer parsed from after the ``"+"``.
        * ``"time_control_type"``: one of ``"UNKNOWN"``, ``"UNLIMITED"``,
          ``"BULLET"``, ``"BLITZ"``, ``"RAPID"`` or ``"STANDARD"``.

    NOTE(review): the thresholds are written as ``N * 60``, which suggests the
    parsed values are seconds and the limits are 3, 15 and 60 minutes — confirm
    against the data source.
    """
    plus_index = time_control.find("+")
    base_time = time_control.substr(0, plus_index).try_cast(int)
    increment = time_control.substr(plus_index + 1).try_cast(int)

    # Base time plus sixty increments; computed once and reused by every
    # threshold comparison below.
    estimated_duration = base_time + 60 * increment

    time_control_type = (
        ibis.case()
        .when(time_control.isnull() | time_control.startswith("?"), "UNKNOWN")
        .when(time_control.startswith("-"), "UNLIMITED")
        # If either part failed to parse, the duration is NULL and every
        # comparison below would be NULL too, silently falling through to
        # "STANDARD". Label such rows as unknown instead.
        .when(estimated_duration.isnull(), "UNKNOWN")
        .when(estimated_duration < 3 * 60, "BULLET")
        .when(estimated_duration < 15 * 60, "BLITZ")
        .when(estimated_duration < 60 * 60, "RAPID")
        .else_("STANDARD")
        .end()
    )

    return {
        "time_control_base_time": base_time,
        "time_control_increment": increment,
        "time_control_type": time_control_type,
    }


# Add the three columns returned by parse_time_control (base time, increment,
# and category) to every row of `moves`.
moves_with_parsed_time_control = moves.mutate(**parse_time_control(_.time_control))
# Show the number of rows that fall into each time-control category.
moves_with_parsed_time_control.time_control_type.value_counts().preview()

CPU times: user 1h 5min 12s, sys: 6.21 s, total: 1h 5min 19s
Wall time: 16.2 s


In [5]:
%%time
# For each `site`, compute the fraction of moves whose comment contains an
# "[%eval" tag and the number of moves without one, then keep the sites where
# at most one move lacks the tag. The result is cached for reuse.
games_with_evals = (
    moves.mutate(has_eval=_.move_comment.contains("[%eval"))
    .group_by("site")
    .aggregate(
        percent_has_eval=_.has_eval.mean(),
        has_no_eval_count=_.count() - _.has_eval.sum(),
    )
    .mutate(
        # Sites with no tagged move at all get the sentinel -1 so that the
        # 0..1 range filter below excludes them.
        has_no_eval_count=ibis.ifelse(
            _.percent_has_eval == 0, -1, _.has_no_eval_count
        )
    )
    .filter(_.has_no_eval_count.between(0, 1))
    .cache()
)
# Number of sites that survived the filter, as a plain Python value.
games_with_evals.count().to_pyarrow().as_py()

CPU times: user 23min 26s, sys: 1min 59s, total: 25min 26s
Wall time: 11.9 s


8133979

In [6]:
%%time
# Restrict the parsed moves to the selected sites by joining on `site`; only
# the key column is taken from games_with_evals.
moves_with_evals = games_with_evals.select("site").join(
    moves_with_parsed_time_control, "site"
)
# Distinct sites remaining after the join, as a plain Python value.
moves_with_evals.site.nunique().to_pyarrow().as_py()

CPU times: user 6min 39s, sys: 6.16 s, total: 6min 45s
Wall time: 3.93 s


8133979

In [7]:
%%time
# Look only at rows with move_ply == 1 and tabulate their `termination` values.
# NOTE(review): presumably this yields one row per game — confirm that every
# game has exactly one ply-1 row.
moves_with_evals.filter(_.move_ply == 1).termination.value_counts().preview()

CPU times: user 4min 23s, sys: 12.1 s, total: 4min 35s
Wall time: 4.23 s


In [8]:
%%time
# Flag each move by whether its comment contains a "[%clk" tag, then count the
# rows per flag value.
(
    moves_with_evals.mutate(has_clock=_.move_comment.contains("[%clk"))
    .has_clock.value_counts()
    .preview()
)

CPU times: user 15min 10s, sys: 34.3 s, total: 15min 45s
Wall time: 5.79 s


In [9]:
%%time
# Same "[%clk" flag as a per-move column, this time counted for each
# combination of time-control category and flag value.
(
    moves_with_evals.mutate(has_clock=_.move_comment.contains("[%clk"))
    .group_by("time_control_type", "has_clock")
    .count()
    .preview()
)

CPU times: user 1h 22min 26s, sys: 32.1 s, total: 1h 22min 58s
Wall time: 20.8 s


In [10]:
%%time
# Drop moves whose time control is classified as UNLIMITED and keep only those
# whose termination is "Normal" or "Time forfeit".
filtered_moves = moves_with_evals.filter(
    (_.time_control_type != "UNLIMITED")
    & _.termination.isin(["Normal", "Time forfeit"])
)
# Distinct sites remaining after the filter, as a plain Python value.
filtered_moves.site.nunique().to_pyarrow().as_py()

CPU times: user 1h 15min 8s, sys: 22.6 s, total: 1h 15min 31s
Wall time: 18.9 s


8123404

In [11]:
%%time
# Sanity check: every remaining move must carry a "[%clk" tag in its comment.
# For a NULL comment, `contains` evaluates to NULL, and a filter drops rows
# whose predicate is NULL, so `~contains(...)` alone would let moves with no
# comment at all slip through. Count those rows explicitly as missing a clock.
assert not (
    filtered_moves[_.move_comment.isnull() | ~_.move_comment.contains("[%clk")]
    .count()
    .to_pyarrow()
    .as_py()
), "some filtered moves have no [%clk tag in move_comment (or no comment at all)"

CPU times: user 17min 38s, sys: 24.3 s, total: 18min 2s
Wall time: 5.11 s
