In [None]:
import ibis
from ibis import _

# Connect to Starburst Galaxy

First, connect to Starburst Galaxy. In this example, secrets are stored in a `.env` file and loaded as environment variables. This requires installing the `python-dotenv` package—alternatively, you can set the environment variables directly on your system.

In [None]:
import os

from dotenv import load_dotenv
from trino.auth import BasicAuthentication

# Load the variables defined in a local `.env` file into the process environment.
load_dotenv()

# Catalog and schema that the connection is opened against.
catalog = "lichess"
schema = "lichess"

# Connection settings come from the environment; an unset variable yields None.
user = os.environ.get("TRINO_USERNAME")
password = os.environ.get("TRINO_PASSWORD")
host = os.environ.get("TRINO_HOSTNAME")
port = os.environ.get("TRINO_PORTNUMBER")

con = ibis.trino.connect(
    host=host,
    port=port,
    user=user,
    # The username/password pair is handed over wrapped in a BasicAuthentication object.
    password=BasicAuthentication(user, password),
    database=catalog,
    schema=schema,
    http_scheme="https",
    roles="accountadmin",
)
con

# Verify connection

List the tables your connection has:

In [None]:
# Show the tables reported by the connection (displayed as the cell output).
con.list_tables()

Run a SQL query:

In [None]:
# Run a raw SQL query through the connection and execute it; LIMIT caps the preview at 10 rows.
con.sql("SELECT * FROM games LIMIT 10").execute()

# Filter out games without evals

In [None]:
# Table expression for the `games` table, before any filtering is applied.
unfiltered_games = con.table("games")
# Row count, converted from the PyArrow result to a plain Python value for display.
unfiltered_games.count().to_pyarrow().as_py()

In [None]:
# Table expression for the `moves` table, before any filtering is applied.
unfiltered_moves = con.table("moves")
# Row count, converted from the PyArrow result to a plain Python value for display.
unfiltered_moves.count().to_pyarrow().as_py()

In [None]:
# Flag every move whose comment contains an "[%eval" annotation.
moves_flagged = unfiltered_moves.mutate(has_eval=_.comment.contains("[%eval"))

# Per game: the share of moves that carry an eval and the number of moves that do not.
eval_coverage_by_game = moves_flagged.group_by("game_id").aggregate(
    percent_has_eval=_.has_eval.mean(),
    has_no_eval_count=_.count() - _.has_eval.sum(),
)

# Games without a single eval get the sentinel -1, so the range filter below
# removes them together with games that miss more than one eval.
eval_coverage_with_sentinel = eval_coverage_by_game.mutate(
    has_no_eval_count=ibis.ifelse(_.percent_has_eval == 0, -1, _.has_no_eval_count)
)

# Keep games in which at most one move lacks an eval.
games_with_evals = eval_coverage_with_sentinel.filter(
    _.has_no_eval_count.between(0, 1)
)
games_with_evals.count().to_pyarrow().as_py()

In [None]:
# Restrict the games table to the game ids retained in `games_with_evals`
# by joining on `game_id`.
games = games_with_evals.select("game_id").join(unfiltered_games, "game_id")
# Number of rows left after the join.
games.count().to_pyarrow().as_py()

In [None]:
# Restrict the moves table to the game ids retained in `games_with_evals`
# by joining on `game_id`.
moves = games_with_evals.select("game_id").join(unfiltered_moves, "game_id")
# Number of rows left after the join.
moves.count().to_pyarrow().as_py()

# Create game-level features

In [None]:
# Collects the named game-level column expressions; later cells append to this list,
# so re-run this cell first if any of them is re-executed (otherwise entries are duplicated).
game_level_features = []

## `event`-based features

In [None]:
# A game counts as rated when its event string starts with "Rated " (trailing space included).
is_rated = games.event.startswith("Rated ")
game_level_features.append(is_rated.name("is_rated"))

In [None]:
# Remove the leading "Rated " from rated events; other events are kept unchanged.
rated_prefix_length = len("Rated ")
event_with_rated_prefix_stripped = ibis.ifelse(
    is_rated, games.event[rated_prefix_length:], games.event
)

# The time-control type is the text before the first space of the stripped event.
# NOTE(review): `find` presumably yields -1 when there is no space at all — confirm
# that every event string contains one.
first_space_position = event_with_rated_prefix_stripped.find(" ")
lichess_time_control_type = event_with_rated_prefix_stripped.substr(
    0, first_space_position
)
game_level_features.append(lichess_time_control_type.name("lichess_time_control_type"))

In [None]:
# Flag games whose event string contains the (lowercase) word "tournament".
is_tournament = games.event.contains("tournament")
game_level_features.append(is_tournament.name("is_tournament"))

## Elo-based features

In [None]:
# White's Elo as an integer.
# NOTE(review): this uses `cast`, while the time-control features use `try_cast`;
# confirm the column never holds non-numeric text.
white_elo = games.white_elo.cast(int)
game_level_features.append(white_elo.name("white_elo"))

In [None]:
# Black's Elo as an integer.
# NOTE(review): this uses `cast`, while the time-control features use `try_cast`;
# confirm the column never holds non-numeric text.
black_elo = games.black_elo.cast(int)
game_level_features.append(black_elo.name("black_elo"))

## Title features

In [None]:
# White's title column is passed through unchanged as a feature.
white_title = games.white_title
game_level_features.append(white_title.name("white_title"))

In [None]:
# Black's title column is passed through unchanged as a feature.
black_title = games.black_title
game_level_features.append(black_title.name("black_title"))

## `time_control`-based features

In [None]:
# Split `time_control` at the "+" sign: the text before it is the base time,
# the text after it is the increment.
index = games.time_control.find("+")
base_time_text = games.time_control.substr(0, index)
increment_text = games.time_control.substr(index + 1)

# `try_cast` is used so that text which is not a valid integer does not abort the query.
base_time = base_time_text.try_cast(int)
increment = increment_text.try_cast(int)

game_level_features.extend(
    [
        base_time.name("base_time"),
        increment.name("increment"),
    ]
)

## Target variable

In [None]:
# Map the result string to a numeric score: "1-0" -> 1, "1/2-1/2" -> 0.5, "0-1" -> 0.
# No branch is defined for any other result value.
target = (
    games.result.case()
    .when("1-0", 1)
    .when("1/2-1/2", 0.5)
    .when("0-1", 0)
    .end()
)
game_level_features.append(target.name("target"))

# Create move-level features

## Eval-based features

In [None]:
# Collects the named eval-based column expressions; a later cell extends this list,
# so re-run this cell first if that cell is re-executed (otherwise entries are duplicated).
eval_based_features = []

In [None]:
# Parse the `[%eval ...]` annotation of each move comment into two text columns:
#   mate         -> capture group 1, a mate annotation including its leading `#` (e.g. `#-3`)
#   regular_eval -> capture group 2, a numeric evaluation (e.g. `-0.26`)
#
# The alternation is wrapped in a non-capturing group so that BOTH branches are
# anchored to the `[%eval ` prefix. With a top-level `|`, the numeric branch matches
# any number anywhere in the comment (for instance the hour digit of `[%clk 0:00:30]`),
# which gives moves without an eval a bogus, non-null `regular_eval`. The optional
# sign is also applied to both numeric forms (decimal and integer).
moves_with_parsed_eval = moves.alias("moves").sql(
    r"""
        SELECT
          *,
          REGEXP_EXTRACT(
            comment,
            '\[%eval\s(?:(\#[+-]?\d+)|([+-]?(?:\d{0,10}\.\d{1,2}|\d{1,10}\.?)))',
            1
          ) AS mate,
          REGEXP_EXTRACT(
            comment,
            '\[%eval\s(?:(\#[+-]?\d+)|([+-]?(?:\d{0,10}\.\d{1,2}|\d{1,10}\.?)))',
            2
          ) AS regular_eval
        FROM moves
        """
)
moves_with_parsed_eval

In [None]:
MATE_SCORE = 1_000  # Arbitrary large number greater than 121 (`max(abs(mate))`)

# Drop the first character of the `mate` text (the `#` marker) and read the rest as an integer.
moves_to_mate = moves_with_parsed_eval.mate.substr(1).try_cast(int)

# Same sign as the mate count, with magnitude MATE_SCORE minus the number of moves,
# so a shorter mate scores further from zero than a longer one.
mate_eval = moves_to_mate.sign() * MATE_SCORE - moves_to_mate

eval_based_features.extend(
    [
        mate_eval.name("mate_eval"),
        moves_with_parsed_eval.regular_eval.try_cast("float").name("regular_eval"),
    ]
)

## Clock-based features

In [None]:
# Collects the named clock-based column expressions; a later cell extends this list,
# so re-run this cell first if that cell is re-executed (otherwise entries are duplicated).
clock_based_features = []

In [None]:
# Parse the `[%clk H:M:S]` annotation of each move comment into three text columns
# (`hours`, `minutes`, `seconds`), one per capture group of the same pattern.
# The pattern only accepts whole-number seconds.
# `moves` is exposed to the raw SQL under the alias "moves".
moves_with_parsed_clock = moves.alias("moves").sql(
    r"""
        SELECT
          *,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            1
          ) AS hours,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            2
          ) AS minutes,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            3
          ) AS seconds
        FROM moves
        """
)
moves_with_parsed_clock

In [None]:
# Convert the parsed H:M:S text into one clock value expressed in seconds.
hours_as_seconds = moves_with_parsed_clock.hours.try_cast(int) * 3600
minutes_as_seconds = moves_with_parsed_clock.minutes.try_cast(int) * 60
remaining_seconds = moves_with_parsed_clock.seconds.try_cast(float)
clock = hours_as_seconds + minutes_as_seconds + remaining_seconds

In [None]:
# Within each game, ordered by ply, fetch the clock recorded on the preceding row.
w = ibis.window(group_by="game_id", order_by="ply")
previous_clock = clock.lag().over(w)

# Odd plies feed the white clock and even plies the black clock; the other side's
# value is carried over from the previous row.
# NOTE(review): presumably ply is 1-based with White moving on odd plies — confirm.
is_odd_ply = moves_with_parsed_clock.ply % 2 == 1
is_even_ply = moves_with_parsed_clock.ply % 2 == 0
white_clock = is_odd_ply.ifelse(clock, previous_clock)

# Where the black clock has no value (e.g. no previous row exists yet),
# fall back to the white clock.
black_clock = is_even_ply.ifelse(clock, previous_clock).coalesce(white_clock)

clock_based_features.extend(
    [
        white_clock.name("white_clock"),
        black_clock.name("black_clock"),
    ]
)

# Create model input table

In [None]:
# Per-move feature tables, each keyed by (game_id, ply).
eval_features_by_move = moves_with_parsed_eval.select(
    "game_id", "ply", *eval_based_features
)
clock_features_by_move = moves_with_parsed_clock.select(
    "game_id", "ply", *clock_based_features
)

# Combine eval and clock features for the same move.
move_level_features = eval_features_by_move.join(
    clock_features_by_move, ["game_id", "ply"]
)

# Attach the game-level features to every move of the corresponding game.
game_features_by_game = games.select("game_id", *game_level_features)
model_input_table = game_features_by_game.join(move_level_features, "game_id")
model_input_table

In [None]:
# Mate score implied by the game's target value: +MATE_SCORE for 1.0,
# -MATE_SCORE for 0.0 and 0 for 0.5.
outcome_based_eval = (
    model_input_table.target.case()
    .when(1.0, MATE_SCORE)
    .when(0.0, -MATE_SCORE)
    .when(0.5, 0)
    .end()
)

# The outcome-based value is only offered on rows without a regular eval.
fallback_mate_eval = model_input_table.regular_eval.isnull().ifelse(
    outcome_based_eval, None
)

# Keep an existing mate_eval; otherwise use the fallback computed above.
model_input_table_with_final_eval = model_input_table.mutate(
    mate_eval=model_input_table.mate_eval.coalesce(fallback_mate_eval)
)

In [None]:
# Keep rated games only, and drop those whose time-control type is "Correspondence".
filtered_model_input_table = model_input_table_with_final_eval.filter(
    _.is_rated & (_.lichess_time_control_type != "Correspondence")
)

In [None]:
# Execute the pipeline for the first few rows only, as a quick look at the result.
filtered_model_input_table.head().execute()

In [None]:
# Number of rows in the final model input table, as a plain Python value.
filtered_model_input_table.count().to_pyarrow().as_py()