In [1]:
import ibis
from ibis import _

# Connect to Starburst Galaxy

First, connect to Starburst Galaxy. In this example, secrets are kept in a `.env` file and loaded as environment variables. This requires installing the `python-dotenv` package—alternatively, you can set the environment variables directly on your system.

In [2]:
import os

from dotenv import load_dotenv
from trino.auth import BasicAuthentication

# Load key/value pairs from a local `.env` file (if any) into the environment.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing secret
            fails here with a clear message instead of surfacing later as an
            opaque authentication/connection error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in your `.env` file or in the shell environment."
        )
    return value


user = _require_env("TRINO_USERNAME")
password = _require_env("TRINO_PASSWORD")
host = _require_env("TRINO_HOSTNAME")
# Environment variables are always strings; the port is numeric.
port = int(_require_env("TRINO_PORTNUMBER"))
catalog = "lichess"
schema = "lichess"

con = ibis.trino.connect(
    user=user,
    password=BasicAuthentication(user, password),
    host=host,
    port=port,
    database=catalog,
    schema=schema,
    roles="accountadmin",
    http_scheme="https",
)
# Display the backend object to confirm that the connection was created.
con



<ibis.backends.trino.Backend at 0x742d996187d0>

# Verify connection

List the tables your connection has:

In [3]:
# Sanity check: list the tables visible through this connection.
con.list_tables()

['games', 'moves']

Run a SQL query:

In [4]:
# Run a raw SQL query and execute it to display the first ten `games` rows.
con.sql("SELECT * FROM games LIMIT 10").execute()

Unnamed: 0,game_id,event,site,date,round,white,black,result,utc_date,utc_time,white_elo,black_elo,white_rating_diff,black_rating_diff,eco,opening,time_control,termination,white_title,black_title
0,6h5ZVeKH,Rated Rapid game,https://lichess.org/6h5ZVeKH,2024.09.05,-,digididanjo,hemmeh,0-1,2024.09.05,09:00:00,1445,1480,-8.0,5.0,C50,"Italian Game: Giuoco Pianissimo, Canal Variation",600+0,Normal,,
1,JczZzUfJ,Rated Blitz game,https://lichess.org/JczZzUfJ,2024.09.09,-,Bertok,byro68,0-1,2024.09.09,18:38:55,2058,2059,-5.0,6.0,C40,Elephant Gambit,300+0,Normal,,
2,gyzyL9b0,Rated Bullet game,https://lichess.org/gyzyL9b0,2024.09.26,-,vitriol357,adamorlik,1-0,2024.09.26,14:27:01,1998,2056,9.0,-7.0,B22,Sicilian Defense: Alapin Variation,60+0,Normal,,
3,CNFkqWZw,Rated Bullet game,https://lichess.org/CNFkqWZw,2024.09.30,-,aedomskiy,ccagnus_marlsenn,0-1,2024.09.30,19:07:31,1543,1527,-6.0,6.0,A00,Van't Kruijs Opening,60+0,Time forfeit,,
4,RceXr7XX,Rated Blitz game,https://lichess.org/RceXr7XX,2024.09.13,-,OnlyMavis,sakra1967,1-0,2024.09.13,19:36:26,1658,1649,6.0,-6.0,C00,Rat Defense: Small Center Defense,300+0,Normal,,
5,vWGzgM91,Rated Rapid game,https://lichess.org/vWGzgM91,2024.09.22,-,simonouni,Debster23,1-0,2024.09.22,23:24:06,869,844,6.0,-14.0,C20,King's Pawn Game: Wayward Queen Attack,600+0,Normal,,
6,KOlWPb9N,Rated Blitz game,https://lichess.org/KOlWPb9N,2024.09.14,-,MidknightShade,insomnia57,0-1,2024.09.14,18:24:22,1280,1345,-26.0,5.0,A40,Horwitz Defense,180+0,Time forfeit,,
7,1qaw25Xs,Rated Blitz game,https://lichess.org/1qaw25Xs,2024.09.09,-,Miftahudin,deniindrawan,0-1,2024.09.09,04:53:05,1831,1880,-5.0,23.0,C50,Italian Game: Giuoco Piano,300+0,Normal,,
8,UzAwsB5k,Rated Blitz game,https://lichess.org/UzAwsB5k,2024.09.13,-,ali11229311,c815a,0-1,2024.09.13,19:36:26,1378,1378,-6.0,6.0,D00,Queen's Pawn Game: Accelerated London System,300+0,Normal,,
9,OaHdNH7Z,Rated Blitz game,https://lichess.org/OaHdNH7Z,2024.09.13,-,labestia2010,eladiamar,1-0,2024.09.13,19:36:26,2019,1516,,,D00,Queen's Pawn Game: Chigorin Variation,300+0,Normal,,


# Filter out games without evals

In [5]:
# Table expression for the full `games` table, before any filtering.
unfiltered_games = con.table("games")
# Compute the row count and convert the result to a plain Python value.
unfiltered_games.count().to_pyarrow().as_py()

87713219

In [6]:
# Table expression for the full `moves` table, before any filtering.
unfiltered_moves = con.table("moves")
# Compute the row count and convert the result to a plain Python value.
unfiltered_moves.count().to_pyarrow().as_py()

5837064712

In [7]:
# Step 1: flag each move whose comment contains an `[%eval` annotation.
moves_flagged_for_eval = unfiltered_moves.mutate(
    has_eval=_.comment.contains("[%eval")
)

# Step 2: per game, compute the share of flagged moves and the number of
# moves that are not flagged.
eval_coverage_by_game = moves_flagged_for_eval.group_by(_.game_id).agg(
    percent_has_eval=_.has_eval.mean(),
    has_no_eval_count=_.count() - _.has_eval.sum(),
)

# Step 3: games without a single evaluated move get the sentinel -1 so that
# the range filter below rejects them regardless of their move count.
eval_coverage_with_sentinel = eval_coverage_by_game.mutate(
    has_no_eval_count=ibis.ifelse(_.percent_has_eval == 0, -1, _.has_no_eval_count)
)

# Step 4: keep games in which at most one move lacks an evaluation.
games_with_evals = eval_coverage_with_sentinel.filter(
    _.has_no_eval_count.between(0, 1)
)
games_with_evals.count().to_pyarrow().as_py()

7933774

In [8]:
# Restrict `games` to the game IDs that passed the eval-coverage filter.
games = games_with_evals.select("game_id").join(unfiltered_games, "game_id")
# Row count of the filtered games table.
games.count().to_pyarrow().as_py()

7933774

In [9]:
# Restrict `moves` to the game IDs that passed the eval-coverage filter.
moves = games_with_evals.select("game_id").join(unfiltered_moves, "game_id")
# Row count of the filtered moves table.
moves.count().to_pyarrow().as_py()

519982245

# Create game-level features

In [10]:
# Accumulator for named game-level column expressions; later cells append to it.
# NOTE: re-running an appending cell without re-running this one adds duplicates.
game_level_features = []

## `event`-based features

In [11]:
# A game is treated as rated when its `event` string starts with "Rated ".
is_rated = games.event.startswith("Rated ")
game_level_features.append(is_rated.name("is_rated"))

In [12]:
# Drop the leading "Rated " marker (when present) so that the event string
# starts with the time-control word.
rated_prefix_length = len("Rated ")
event_with_rated_prefix_stripped = is_rated.ifelse(
    games.event[rated_prefix_length:],
    games.event,
)

# The time-control type is the text before the first space of the stripped event.
first_space_position = event_with_rated_prefix_stripped.find(" ")
lichess_time_control_type = event_with_rated_prefix_stripped.substr(
    0, first_space_position
)

game_level_features.append(lichess_time_control_type.name("lichess_time_control_type"))

In [13]:
# Flag games whose `event` string contains the (lowercase) word "tournament".
is_tournament = games.event.contains("tournament")
game_level_features.append(is_tournament.name("is_tournament"))

## Elo-based features

In [14]:
# White's Elo rating as an integer.
# NOTE(review): this uses `cast`, whereas the time-control cell uses `try_cast`;
# presumably the column never holds non-numeric values — confirm.
white_elo = games.white_elo.cast(int)
game_level_features.append(white_elo.name("white_elo"))

In [15]:
# Black's Elo rating as an integer.
# NOTE(review): this uses `cast`, whereas the time-control cell uses `try_cast`;
# presumably the column never holds non-numeric values — confirm.
black_elo = games.black_elo.cast(int)
game_level_features.append(black_elo.name("black_elo"))

## Title features

In [16]:
# White's title column, passed through unchanged as a feature.
white_title = games.white_title
game_level_features.append(white_title.name("white_title"))

In [17]:
# Black's title column, passed through unchanged as a feature.
black_title = games.black_title
game_level_features.append(black_title.name("black_title"))

## `time_control`-based features

In [18]:
# `time_control` values look like "600+0": the text before the "+" becomes
# `base_time` and the text after it becomes `increment`.
# `try_cast` is used for both halves rather than a plain `cast`.
index = games.time_control.find("+")
text_before_plus = games.time_control.substr(0, index)
text_after_plus = games.time_control.substr(index + 1)
base_time = text_before_plus.try_cast(int)
increment = text_after_plus.try_cast(int)

game_level_features.extend(
    [
        base_time.name("base_time"),
        increment.name("increment"),
    ]
)

## Target variable

In [19]:
# Encode the game result as a numeric target: 1 for "1-0", 0.5 for "1/2-1/2",
# and 0 for "0-1".
# NOTE(review): there is no default branch, so any other result string
# presumably yields NULL — confirm.
target = games.result.case().when("1-0", 1).when("1/2-1/2", 0.5).when("0-1", 0).end()
game_level_features.append(target.name("target"))

# Create move-level features

## Eval-based features

In [20]:
# Accumulator for named eval-derived column expressions; filled in below.
eval_based_features = []

In [21]:
# Parse the engine evaluation out of each move's `comment` with raw SQL.
#
# Both capture groups sit inside a non-capturing group that follows the
# `[%eval ` tag. Without that grouping, alternation precedence leaves the
# numeric branch unanchored, so it can match any number in the comment: a move
# that only has a clock annotation such as `[%clk 0:00:05]` would produce a
# spurious `regular_eval` of "0" instead of NULL.
#
#   group 1 -> `mate`:         text such as `#3` or `#-2`
#   group 2 -> `regular_eval`: signed number such as `0.18` or `-0.5`
#
# The integer branch also accepts a leading sign, so the sign is not dropped.
moves_with_parsed_eval = moves.alias("moves").sql(
    r"""
        SELECT
          *,
          REGEXP_EXTRACT(
            comment,
            '\[%eval\s(?:(\#[+-]?\d+)|([+-]?\d{0,10}\.\d{1,2}|[+-]?\d{1,10}\.?))',
            1
          ) AS mate,
          REGEXP_EXTRACT(
            comment,
            '\[%eval\s(?:(\#[+-]?\d+)|([+-]?\d{0,10}\.\d{1,2}|[+-]?\d{1,10}\.?))',
            2
          ) AS regular_eval
        FROM moves
        """
)
moves_with_parsed_eval

In [22]:
MATE_SCORE = 1_000  # Arbitrary large number greater than 121 (`max(abs(mate))`)

# Drop the leading "#" from the mate string and parse the signed move count.
signed_mate_distance = moves_with_parsed_eval.mate.substr(1).try_cast(int)

# Map "mate in N" onto the eval scale: +/-MATE_SCORE minus the signed distance,
# so a shorter mate ends up closer to +/-MATE_SCORE than a longer one.
mate_eval = signed_mate_distance.sign() * MATE_SCORE - signed_mate_distance

regular_eval_as_float = moves_with_parsed_eval.regular_eval.try_cast("float")
eval_based_features.extend(
    [
        mate_eval.name("mate_eval"),
        regular_eval_as_float.name("regular_eval"),
    ]
)

## Clock-based features

In [23]:
# Accumulator for named clock-derived column expressions; filled in below.
clock_based_features = []

In [24]:
# Parse the `[%clk H:M:S]` annotation out of each move's `comment` with raw SQL.
# The same pattern is applied three times, extracting capture group 1 (hours),
# group 2 (minutes) and group 3 (seconds) as separate string columns.
moves_with_parsed_clock = moves.alias("moves").sql(
    r"""
        SELECT
          *,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            1
          ) AS hours,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            2
          ) AS minutes,
          REGEXP_EXTRACT(
            comment,
            '\[%clk\s(\d+):(\d+):(\d+)\]',
            3
          ) AS seconds
        FROM moves
        """
)
moves_with_parsed_clock

In [25]:
# Combine the parsed clock parts into a single value in seconds.
# Hours and minutes are parsed as integers; seconds are parsed as float, which
# makes the sum a floating-point value.
clock = (
    moves_with_parsed_clock.hours.try_cast(int) * 3600
    + moves_with_parsed_clock.minutes.try_cast(int) * 60
    + moves_with_parsed_clock.seconds.try_cast(float)
)

In [26]:
# Window over the moves of one game, ordered by ply.
w = ibis.window(group_by="game_id", order_by="ply")
# Clock value recorded on the preceding ply of the same game (NULL on the first row).
previous_clock = clock.lag().over(w)
# On odd plies the current clock is taken as White's and the previous ply's as
# Black's; on even plies it is the other way around.
# NOTE(review): assumes ply 1 is White's first move — confirm against the data.
white_clock = ibis.ifelse(moves_with_parsed_clock.ply % 2 == 1, clock, previous_clock)
black_clock = ibis.ifelse(moves_with_parsed_clock.ply % 2 == 0, clock, previous_clock)
# Where Black's clock is NULL (e.g. no previous ply exists yet), fall back to
# White's clock value.
black_clock = black_clock.coalesce(white_clock)
clock_based_features += [
    white_clock.name("white_clock"),
    black_clock.name("black_clock"),
]

# Create model input table

In [27]:
# Each (game_id, ply) pair identifies one move in both parsed tables.
move_join_keys = ["game_id", "ply"]

# Project each parsed table down to the join keys plus its derived features.
eval_features_by_ply = moves_with_parsed_eval.select(
    *move_join_keys, *eval_based_features
)
clock_features_by_ply = moves_with_parsed_clock.select(
    *move_join_keys, *clock_based_features
)

# One row per move, carrying both the eval and the clock features.
move_level_features = eval_features_by_ply.join(clock_features_by_ply, move_join_keys)

# Attach the game-level features to every move of the corresponding game.
game_features_by_game = games.select("game_id", *game_level_features)
model_input_table = game_features_by_game.join(move_level_features, "game_id")
model_input_table

In [28]:
# Fill in `mate_eval` for rows that have neither a mate score nor a regular eval.
# Such rows take a value derived from the game result: +MATE_SCORE when the
# target is 1.0, -MATE_SCORE when it is 0.0, and 0 when it is 0.5.
# Rows that already have a `mate_eval`, or that have a `regular_eval`, are left
# unchanged (the fallback evaluates to NULL for the latter).
model_input_table_with_final_eval = model_input_table.mutate(
    mate_eval=model_input_table.mate_eval.coalesce(
        ibis.ifelse(
            model_input_table.regular_eval.isnull(),
            model_input_table.target.case()
            .when(1.0, MATE_SCORE)
            .when(0.0, -MATE_SCORE)
            .when(0.5, 0)
            .end(),
            None,
        )
    )
)

In [29]:
# Keep only rows from rated games whose time-control type is not "Correspondence".
rated_game_rows = model_input_table_with_final_eval.is_rated
non_correspondence_rows = (
    model_input_table_with_final_eval.lichess_time_control_type != "Correspondence"
)
filtered_model_input_table = model_input_table_with_final_eval.filter(
    rated_game_rows & non_correspondence_rows
)

In [30]:
# Preview the first few rows of the final model input table.
filtered_model_input_table.head().execute()

Unnamed: 0,game_id,is_rated,lichess_time_control_type,is_tournament,white_elo,black_elo,white_title,black_title,base_time,increment,target,ply,mate_eval,regular_eval,white_clock,black_clock
0,wJ5UjBwf,True,Rapid,False,714,729,,,600,0,1.0,1,,0.18,600.0,600.0
1,wJ5UjBwf,True,Rapid,False,714,729,,,600,0,1.0,2,,0.21,600.0,600.0
2,wJ5UjBwf,True,Rapid,False,714,729,,,600,0,1.0,3,,0.03,598.0,600.0
3,wJ5UjBwf,True,Rapid,False,714,729,,,600,0,1.0,4,,0.11,598.0,598.0
4,wJ5UjBwf,True,Rapid,False,714,729,,,600,0,1.0,5,,-0.5,585.0,598.0


In [31]:
# Total number of rows (one per move) in the final model input table.
filtered_model_input_table.count().to_pyarrow().as_py()

512257915