In [1]:
import ibis
from ibis import _

# Connect to Starburst Galaxy

First, connect to Starburst Galaxy. In this example, we’ll keep secrets in a `.env` file and load them as environment variables. This requires installing the `python-dotenv` package—alternatively, you can set the environment variables directly on your system.

In [2]:
import os

from dotenv import load_dotenv
from trino.auth import BasicAuthentication

# Load the secrets kept in `.env` so they are visible as environment variables.
load_dotenv()

# Credentials and endpoint come from the environment; a variable that is not
# set is read back as None.
user = os.environ.get("TRINO_USERNAME")
password = os.environ.get("TRINO_PASSWORD")
host = os.environ.get("TRINO_HOSTNAME")
port = os.environ.get("TRINO_PORTNUMBER")
catalog = "lichess"
schema = "lichess"

# Gather every connection argument in one place before opening the connection.
connection_options = dict(
    user=user,
    password=BasicAuthentication(user, password),
    host=host,
    port=port,
    database=catalog,
    schema=schema,
    roles="accountadmin",
    http_scheme="https",
)
con = ibis.trino.connect(**connection_options)
con



<ibis.backends.trino.Backend at 0x76d1261e6c00>

# Verify connection

List the tables your connection has:

In [3]:
# Sanity check: ask the backend which tables this connection can see.
con.list_tables()

['games', 'moves']

Run a SQL query:

In [4]:
# Run a raw SQL query through the connection and execute it to fetch 10 rows of `games`.
con.sql("SELECT * FROM games LIMIT 10").execute()

Unnamed: 0,game_id,event,site,date,round,white,black,result,utc_date,utc_time,white_elo,black_elo,white_rating_diff,black_rating_diff,eco,opening,time_control,termination,white_title,black_title
0,34brw9vk,Rated Bullet game,https://lichess.org/34brw9vk,2024.09.22,-,sriishanth,ali7_azimi7,0-1,2024.09.22,02:57:47,1906,1973,-9,5,A03,Bird Opening: Dutch Variation,60+2,Normal,,
1,J9H1AkSC,Rated Blitz game,https://lichess.org/J9H1AkSC,2024.09.22,-,asysyarof,jaygonzales1202,1-0,2024.09.22,02:57:47,1893,1892,6,-5,B10,Caro-Kann Defense: Accelerated Panov Attack,180+0,Normal,,
2,Lypztt25,Rated Bullet game,https://lichess.org/Lypztt25,2024.09.22,-,martinde25,beetlejuice64,1-0,2024.09.22,02:57:48,1930,1978,7,-7,E90,"King's Indian Defense: Normal Variation, Rare ...",60+0,Time forfeit,,
3,0ZR17pZf,Rated Bullet game,https://lichess.org/0ZR17pZf,2024.09.22,-,Homo_Sapiens_Natural,utchemy,0-1,2024.09.22,02:57:48,1265,1259,-6,8,C20,King's Pawn Game: Leonardis Variation,60+0,Time forfeit,,
4,aOmkkHVx,Rated Bullet game,https://lichess.org/aOmkkHVx,2024.09.22,-,M4yank,Andrey_Lysov,1-0,2024.09.22,02:57:48,2391,2372,5,-6,C00,French Defense: Queen's Knight,60+0,Time forfeit,,
5,ngZQxKij,Rated Bullet game,https://lichess.org/ngZQxKij,2024.09.22,-,iiliiyo,Elgranconejo,0-1,2024.09.22,02:57:48,2588,2614,-5,5,D00,Queen's Pawn Game: Chigorin Variation,60+0,Time forfeit,,
6,4NEcW1Hy,Rated Bullet game,https://lichess.org/4NEcW1Hy,2024.09.22,-,MSSAAD,reizinho,0-1,2024.09.22,02:57:48,2131,2131,-6,5,D00,"Queen's Pawn Game: Accelerated London System, ...",60+0,Time forfeit,,
7,hoRDhHhz,Rated Bullet game,https://lichess.org/hoRDhHhz,2024.09.22,-,bivalves,mountainflea,1-0,2024.09.22,02:57:48,1442,1317,3,-5,C58,"Italian Game: Two Knights Defense, Polerio Def...",60+0,Time forfeit,,
8,S11VRbM5,Rated Bullet game,https://lichess.org/S11VRbM5,2024.09.22,-,sandrodaniel,Dewakipascs,0-1,2024.09.22,02:57:48,2302,2205,-7,7,D00,Queen's Pawn Game,60+0,Normal,,
9,rteScC62,Rated Bullet game,https://lichess.org/rteScC62,2024.09.22,-,rinaakbar_luwu2014,HYCUTE,1-0,2024.09.22,02:57:48,2162,2108,5,-4,A00,Kádas Opening,60+0,Normal,,


# Filter out games without evals

In [5]:
# Reference the full `games` table and fetch its row count as a plain Python value.
unfiltered_games = con.table("games")
unfiltered_games.count().to_pyarrow().as_py()

87713219

In [6]:
# Reference the full `moves` table and fetch its row count as a plain Python value.
unfiltered_moves = con.table("moves")
unfiltered_moves.count().to_pyarrow().as_py()

5837064712

In [7]:
# Step 1: flag every move whose comment carries an `[%eval` tag.
moves_flagged = unfiltered_moves.mutate(has_eval=_.comment.contains("[%eval"))

# Step 2: per game, compute the share of moves with an eval and the number of
# moves without one.
eval_coverage_by_game = moves_flagged.group_by("game_id").agg(
    percent_has_eval=_.has_eval.mean(),
    has_no_eval_count=_.count() - _.has_eval.sum(),
)

# Step 3: games with no eval at all get the sentinel -1 so that the range
# filter below rejects them.
eval_coverage_with_sentinel = eval_coverage_by_game.mutate(
    has_no_eval_count=ibis.ifelse(_.percent_has_eval == 0, -1, _.has_no_eval_count)
)

# Step 4: keep games in which at most one move is missing an eval.
games_with_evals = eval_coverage_with_sentinel.filter(
    _.has_no_eval_count.between(0, 1)
)
games_with_evals.count().to_pyarrow().as_py()

7933774

In [8]:
# Restrict `games` to the game IDs that passed the eval filter, then count the result.
games = games_with_evals.select("game_id").join(unfiltered_games, "game_id")
games.count().to_pyarrow().as_py()

7933774

In [9]:
# Restrict `moves` to the game IDs that passed the eval filter, then count the result.
moves = games_with_evals.select("game_id").join(unfiltered_moves, "game_id")
moves.count().to_pyarrow().as_py()

519982245

# Create game-level features

In [10]:
# Accumulates the named game-level feature expressions built in the following cells.
game_level_features = []

## `event`-based features

In [11]:
# A game is treated as rated when its event name starts with "Rated "
# (e.g. "Rated Bullet game").
is_rated = games.event.startswith("Rated ")
game_level_features.append(is_rated.name("is_rated"))

In [12]:
# Drop the leading "Rated " (when present) so the event text begins with the
# time-control word, then keep everything before the first space.
rated_prefix_length = len("Rated ")
event_with_rated_prefix_stripped = ibis.ifelse(
    is_rated, games.event[rated_prefix_length:], games.event
)
first_space_position = event_with_rated_prefix_stripped.find(" ")
lichess_time_control_type = event_with_rated_prefix_stripped.substr(
    0, first_space_position
)
game_level_features.append(lichess_time_control_type.name("lichess_time_control_type"))

In [13]:
# Flag games whose event name contains the word "tournament".
is_tournament = games.event.contains("tournament")
game_level_features.append(is_tournament.name("is_tournament"))

## Elo-based features

In [14]:
# White's Elo rating, cast to an integer, as a feature.
white_elo = games.white_elo.cast(int)
game_level_features.append(white_elo.name("white_elo"))

In [15]:
# Black's Elo rating, cast to an integer, as a feature.
black_elo = games.black_elo.cast(int)
game_level_features.append(black_elo.name("black_elo"))

## Title features

In [16]:
# White's title column is passed through unchanged as a feature.
white_title = games.white_title
game_level_features.append(white_title.name("white_title"))

In [17]:
# Black's title column is passed through unchanged as a feature.
black_title = games.black_title
game_level_features.append(black_title.name("black_title"))

## `time_control`-based features

In [18]:
# `time_control` values look like "<base>+<increment>" (e.g. "60+2"): split the
# string at the "+" and parse each side as an integer.
time_control_text = games.time_control
index = time_control_text.find("+")
base_time = time_control_text.substr(0, index).try_cast(int)
increment = time_control_text.substr(index + 1).try_cast(int)
game_level_features.extend(
    [base_time.name("base_time"), increment.name("increment")]
)

## Target variable

In [19]:
# Encode the game result as a numeric target: "1-0" -> 1, "1/2-1/2" -> 0.5, "0-1" -> 0.
# No default branch is given, so any other result string presumably yields NULL — TODO confirm.
target = games.result.case().when("1-0", 1).when("1/2-1/2", 0.5).when("0-1", 0).end()
game_level_features.append(target.name("target"))

# Create move-level features

## Eval-based features

In [20]:
# Accumulates the named eval-derived feature expressions built in the following cells.
eval_based_features = []

In [21]:
# Parse the engine evaluation out of each move's comment.
#
# Capture group 1 holds a forced-mate score such as "#-3"; capture group 2
# holds a regular numeric score such as "-0.59".  Both alternatives sit inside
# a single non-capturing group that follows the `[%eval ` prefix.  Without that
# grouping the `|` splits the whole pattern, and the numeric alternative would
# match the first number found anywhere in the comment (for example the "0" of
# a `[%clk 0:05:00]` tag), so moves without an eval would get a bogus
# `regular_eval` instead of NULL.
#
# The same pattern must be used for both extracted columns.
moves_with_parsed_eval = moves.alias("moves").sql(
    r"""
        SELECT
          *,
          REGEXP_EXTRACT(
            comment,
            '\[%eval\s(?:(\#[+-]?\d+)|([+-]?\d{0,10}\.\d{1,2}|\d{1,10}\.?))',
            1
          ) AS mate,
          REGEXP_EXTRACT(
            comment,
            '\[%eval\s(?:(\#[+-]?\d+)|([+-]?\d{0,10}\.\d{1,2}|\d{1,10}\.?))',
            2
          ) AS regular_eval
        FROM moves
        """
)
moves_with_parsed_eval

In [22]:
MATE_SCORE = 1_000  # Arbitrary large number greater than 121 (`max(abs(mate))`)

# `mate` looks like "#3" or "#-3": skip the leading "#" and parse the signed
# number that follows.
moves_to_mate = moves_with_parsed_eval.mate.substr(1).try_cast(int)

# Map the mate distance onto the eval scale: sign * MATE_SCORE - distance, so
# e.g. "#3" becomes 997 and "#-3" becomes -997.
mate_eval = moves_to_mate.sign() * MATE_SCORE - moves_to_mate
regular_eval_as_float = moves_with_parsed_eval.regular_eval.try_cast("float")
eval_based_features.extend(
    [
        mate_eval.name("mate_eval"),
        regular_eval_as_float.name("regular_eval"),
    ]
)

## Clock-based features

In [23]:
# Accumulates the named clock-derived feature expressions built in the following cells.
clock_based_features = []

In [24]:
# Parse the `[%clk h:m:s]` tag out of each move's comment.  The same pattern is
# applied three times, taking capture group 1 (hours), 2 (minutes) and
# 3 (seconds) respectively; the extracted values are still strings here.
moves_with_parsed_clock = moves.alias("moves").sql(
    r"""
        SELECT
          *,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            1
          ) AS hours,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            2
          ) AS minutes,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            3
          ) AS seconds
        FROM moves
        """
)
moves_with_parsed_clock

In [25]:
# Clock reading in seconds, assembled from the parsed hours/minutes/seconds parts.
clock_parts = moves_with_parsed_clock
hours_in_seconds = clock_parts.hours.try_cast(int) * 3600
minutes_in_seconds = clock_parts.minutes.try_cast(int) * 60
clock = hours_in_seconds + minutes_in_seconds + clock_parts.seconds.try_cast(float)

In [26]:
# Odd plies are treated as White's moves and even plies as Black's.  On a given
# ply the mover's clock is the parsed clock; the other side's clock is taken
# from the previous ply of the same game.
w = ibis.window(group_by="game_id", order_by="ply")
previous_clock = clock.lag().over(w)

ply_number = moves_with_parsed_clock.ply
is_white_ply = ply_number % 2 == 1
is_black_ply = ply_number % 2 == 0

white_clock = is_white_ply.ifelse(clock, previous_clock)
black_clock = is_black_ply.ifelse(clock, previous_clock)
# Where Black's clock is still null (presumably the first ply of a game, which
# has no previous row), fall back to White's clock.
black_clock = ibis.coalesce(black_clock, white_clock)

clock_based_features.extend(
    [
        white_clock.name("white_clock"),
        black_clock.name("black_clock"),
    ]
)

# Create model input table

In [27]:
# Join the eval-based and clock-based move features on (game_id, ply) ...
move_keys = ["game_id", "ply"]
eval_side = moves_with_parsed_eval.select(*move_keys, *eval_based_features)
clock_side = moves_with_parsed_clock.select(*move_keys, *clock_based_features)
move_level_features = eval_side.join(clock_side, move_keys)

# ... then attach the game-level features to every move of the game.
game_side = games.select("game_id", *game_level_features)
model_input_table = game_side.join(move_level_features, "game_id")
model_input_table

In [28]:
# Translate the game outcome into a mate-style score: a White win maps to
# +MATE_SCORE, a Black win to -MATE_SCORE and a draw to 0.
outcome_as_mate_score = (
    model_input_table.target.case()
    .when(1.0, MATE_SCORE)
    .when(0.0, -MATE_SCORE)
    .when(0.5, 0)
    .end()
)

# Only rows that have no regular eval receive the outcome-based score.
fallback_mate_eval = model_input_table.regular_eval.isnull().ifelse(
    outcome_as_mate_score, None
)

# An existing mate eval always wins; the fallback fills the remaining nulls.
model_input_table_with_final_eval = model_input_table.mutate(
    mate_eval=model_input_table.mate_eval.coalesce(fallback_mate_eval)
)

In [29]:
# Keep only rated games that are not of the "Correspondence" time-control type.
is_rated_game = model_input_table_with_final_eval.is_rated
is_not_correspondence = (
    model_input_table_with_final_eval.lichess_time_control_type != "Correspondence"
)
filtered_model_input_table = model_input_table_with_final_eval.filter(
    is_rated_game & is_not_correspondence
)

In [30]:
# Preview the first few rows of the filtered model input.
filtered_model_input_table.head().execute()

Unnamed: 0,game_id,is_rated,lichess_time_control_type,is_tournament,white_elo,black_elo,white_title,black_title,base_time,increment,target,ply,mate_eval,regular_eval,white_clock,black_clock
0,CgOuBqqM,True,Blitz,False,1450,1160,,,300,3,0.0,1,,-0.09,300.0,300.0
1,CgOuBqqM,True,Blitz,False,1450,1160,,,300,3,0.0,2,,0.07,300.0,300.0
2,CgOuBqqM,True,Blitz,False,1450,1160,,,300,3,0.0,3,,-0.59,301.0,300.0
3,CgOuBqqM,True,Blitz,False,1450,1160,,,300,3,0.0,4,,-0.6,301.0,301.0
4,CgOuBqqM,True,Blitz,False,1450,1160,,,300,3,0.0,5,,-0.8,302.0,301.0


In [31]:
# Row count of the filtered model input, fetched as a plain Python value.
filtered_model_input_table.count().to_pyarrow().as_py()

512257915

# Apply IbisML recipe

In [32]:
import pickle

# Load the pickled recipe.  The `with` block closes the file handle as soon as
# loading finishes instead of leaving it open for the life of the kernel.
# NOTE(review): unpickling can execute arbitrary code — only load a file you trust.
with open("xgb-recipe.pkl", "rb") as recipe_file:
    xgb_recipe = pickle.load(recipe_file)

# Apply the recipe to the model input and preview the transformed rows.
xgb_recipe.to_ibis(filtered_model_input_table).head().execute()

Unnamed: 0,is_tournament,white_elo,black_elo,base_time,increment,target,ply,mate_eval,regular_eval,white_clock,black_clock,relative_clock_diff,elo_diff
0,0.0,1685.0,1633.0,300.0,3.0,1.0,1.0,,0.18,300.0,300.0,0.0,52.0
1,0.0,1685.0,1633.0,300.0,3.0,1.0,2.0,,0.21,300.0,300.0,0.0,52.0
2,0.0,1685.0,1633.0,300.0,3.0,1.0,3.0,,0.08,303.0,300.0,0.007143,52.0
3,0.0,1685.0,1633.0,300.0,3.0,1.0,4.0,,0.08,303.0,302.0,0.002381,52.0
4,0.0,1685.0,1633.0,300.0,3.0,1.0,5.0,,0.11,306.0,302.0,0.009524,52.0


In [33]:
# Row count after applying the recipe, for comparison with the input table's count.
xgb_recipe.to_ibis(filtered_model_input_table).count().to_pyarrow().as_py()

512257915

# Score scikit-learn pipeline

In [34]:
# Load the pickled pipeline.  The `with` block closes the file handle as soon
# as loading finishes instead of leaving it open for the life of the kernel.
# NOTE(review): unpickling can execute arbitrary code — only load a file you trust.
with open("xgb-pipe.pkl", "rb") as pipe_file:
    xgb_pipe = pickle.load(pipe_file)
xgb_pipe

In [35]:
# Separate the features from the `target` column, then score the pipeline on them.
# NOTE(review): the input table has 512,257,915 rows and this cell has no
# recorded output — confirm that scoring on the full table actually completes.
X_test = filtered_model_input_table.drop("target")
y_test = filtered_model_input_table.target
xgb_pipe.score(X_test, y_test)

: 