In [4]:
import torch, tqdm
import pandas as pd, psycopg2, os
from sklearn.preprocessing import StandardScaler
import numpy as np  
import torch
from torch.utils.data import Dataset, DataLoader
import joblib
import torch.nn as nn


In [5]:
# Open the PostgreSQL connection used to load the training features.
# Connection settings are read from the standard libpq environment variable
# names so that no user name or password has to live in the notebook; the
# fallbacks reproduce the previous hard-coded local setup.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "keiichiro"),
    password=os.environ.get("PGPASSWORD", ""),
)
# Load every feature row dated on or before 2024-12-31 into `df`.
# NOTE(review): the recorded run of this cell failed with
# `column "race_date" does not exist` -- confirm the date column's real name
# in feat.train_features before relying on this filter.
df = pd.read_sql("""
    SELECT * FROM feat.train_features
    WHERE race_date <= '2024-12-31'
""", conn)

  df = pd.read_sql("""


DatabaseError: Execution failed on sql '
    SELECT * FROM feat.train_features
    WHERE race_date <= '2024-12-31'
': column "race_date" does not exist
LINE 3:     WHERE race_date <= '2024-12-31'
                  ^


In [None]:
# Continuous weather/water columns that are standardised before training.
NUM_COLS = ["air_temp", "wind_speed", "wave_height", "water_temp"]

# Fit the scaler and overwrite the raw columns of `df` with the scaled values.
# NOTE(review): the scaler is fitted on every row of `df`, including rows that
# are later used for validation -- confirm that this is intended.
# NOTE(review): because `df` is modified in place, running this cell a second
# time re-fits on already-scaled values and overwrites the saved scaler.
scaler = StandardScaler().fit(df[NUM_COLS])
df[NUM_COLS] = scaler.transform(df[NUM_COLS])

# Persist the fitted scaler. The target directory is created first so the
# dump does not fail when `artifacts/` does not exist yet.
scaler_filename = "artifacts/wind_scaler.pkl"
os.makedirs(os.path.dirname(scaler_filename), exist_ok=True)
joblib.dump(scaler, scaler_filename)

In [None]:
def encode(col):
    """Add an integer-coded ``<col>_id`` column to the global ``df``.

    The distinct non-null values of ``df[col]`` are sorted and numbered from
    0 upwards. Rows whose value is missing receive the code -1. The new
    column is stored as ``int16``.

    Returns:
        dict mapping each distinct value to its integer code.
    """
    levels = sorted(df[col].dropna().unique())
    mapping = dict(zip(levels, range(len(levels))))
    codes = df[col].map(mapping)            # NaN wherever the value is missing
    df[f"{col}_id"] = codes.fillna(-1).astype("int16")
    return mapping
# Integer-encode the two categorical columns of `df` (adds `venue_id` and
# `race_type_id`) and keep each value -> id mapping for later reuse.
venue2id = encode(col="venue")
race_type2id = encode(col="race_type")

In [None]:
class BoatRaceDataset(Dataset):
    """Dataset yielding one ``(ctx, boats, ranks)`` triple per row of ``frame``.

    * ``ctx``   -- float32 tensor ``[6]`` holding the ``CTX_COLS`` values.
    * ``boats`` -- float32 tensor ``[6, 4]``: for lanes 1..6 the
      ``lane{n}_weight``, ``lane{n}_exh_time``, ``lane{n}_st`` and
      ``lane{n}_fs_flag`` values.
    * ``ranks`` -- int64 tensor ``[6]`` holding ``lane{n}_rank`` for lanes 1..6.

    Only the columns listed above are read and cast to float32, so ``frame``
    may carry additional non-numeric columns (for example the raw categorical
    columns next to their ``*_id`` encodings) without breaking the cast.
    """

    # Row-level columns, in the order they appear in the context vector.
    CTX_COLS = ["venue_id", "race_type_id",
                "air_temp", "wind_speed", "wave_height", "water_temp"]
    # Per-lane feature suffixes, in the order they appear in each boat vector.
    BOAT_FEATS = ["weight", "exh_time", "st", "fs_flag"]
    # Lane numbers used in the `lane{n}_*` column names.
    LANES = range(1, 7)

    def __init__(self, frame):
        boat_cols = [f"lane{lane}_{feat}"
                     for lane in self.LANES for feat in self.BOAT_FEATS]
        rank_cols = [f"lane{lane}_rank" for lane in self.LANES]

        # Restrict to the columns that are actually used *before* casting:
        # casting the whole frame raises as soon as it holds a string or date
        # column.
        used = self.CTX_COLS + boat_cols + rank_cols
        self.f = frame[used].reset_index(drop=True).astype("float32")

        # Build the tensors once instead of slicing a pandas row per item.
        self._ctx = torch.from_numpy(self.f[self.CTX_COLS].to_numpy(copy=True))
        self._boats = torch.from_numpy(
            self.f[boat_cols].to_numpy(copy=True)
        ).reshape(len(self.f), len(self.LANES), len(self.BOAT_FEATS))
        # NOTE(review): a missing (NaN) rank has no defined integer value
        # after this cast -- confirm ranks are always present upstream.
        self._ranks = torch.from_numpy(
            self.f[rank_cols].to_numpy(copy=True)
        ).to(torch.int64)

    def __len__(self): return len(self.f)

    def __getitem__(self, idx):
        # ctx: [6], boats: [6, feat_dim], ranks: [6]
        return self._ctx[idx], self._boats[idx], self._ranks[idx]

# First date that belongs to the validation split; earlier rows are training.
VAL_START = "2024-07-01"

# Normalise the date column to datetime64 before comparing. Comparing a column
# of Python `date` objects directly with a string raises TypeError, whereas a
# datetime64 column compares against an ISO date string without issue.
race_dates = pd.to_datetime(df["race_date"])
ds_train = BoatRaceDataset(df[race_dates < VAL_START])
ds_val   = BoatRaceDataset(df[race_dates >= VAL_START])

# Training batches are reshuffled every epoch; validation keeps row order.
loader_train = DataLoader(ds_train, batch_size=256, shuffle=True)
loader_val   = DataLoader(ds_val,   batch_size=512)


In [None]:
class SimpleCPLNet(nn.Module):
    """Scores every boat of a race from context and per-boat features.

    The context vector and each boat vector are embedded by separate MLPs;
    the context embedding is concatenated onto every boat embedding and a
    linear layer maps each pair to one scalar score.

    Args:
        ctx_in: number of context features per race.
        boat_in: number of features per boat.
        hidden: width of both embeddings.
    """

    def __init__(self, ctx_in=6, boat_in=4, hidden=64):
        super().__init__()
        self.ctx_mlp  = nn.Sequential(nn.Linear(ctx_in, hidden), nn.ReLU())
        self.boat_mlp = nn.Sequential(nn.Linear(boat_in, hidden),
                                      nn.ReLU(), nn.Linear(hidden, hidden))
        self.score    = nn.Linear(hidden*2, 1)

    def forward(self, ctx, boats):          # ctx:[B,ctx_in] boats:[B,N,boat_in]
        """Return one score per boat, shape ``[B, N]``."""
        ctx_vec = self.ctx_mlp(ctx)         # [B,H]
        # Broadcast the context over however many boats the batch carries
        # instead of assuming exactly six.
        ctx_rep = ctx_vec.unsqueeze(1).expand(-1, boats.size(1), -1)  # [B,N,H]

        boat_vec = self.boat_mlp(boats)     # [B,N,H]
        joint = torch.cat([ctx_rep, boat_vec], dim=-1) # [B,N,2H]
        scores = self.score(joint).squeeze(-1)         # [B,N]
        return scores


In [None]:
def pl_nll(scores, ranks):
    """Mean Plackett-Luce style negative log-likelihood of the observed order.

    For each finishing position r = 1..N (N = ``scores.size(1)``) the term
    ``logsumexp(scores of boats not yet placed) - score of the boat ranked r``
    is accumulated per row; the row totals are then averaged.

    Args:
        scores: float tensor ``[B, N]``, one score per boat.
        ranks:  integer tensor ``[B, N]``; boat ``j`` of row ``i`` finished in
            position ``ranks[i, j]`` (1-based).

    Returns:
        Scalar tensor: the batch mean of the per-row totals.

    A row in which no boat holds rank r contributes nothing for that
    position. Without this guard the lookup of the rank-r score would return
    the masking constant and add roughly 1e9 to the loss.
    """
    masked_value = -1e9                       # stands in for "already placed"
    remaining = scores
    total = scores.new_zeros(scores.size(0))
    for r in range(1, scores.size(1) + 1):
        mask = (ranks == r)
        has_r = mask.any(dim=1)               # rows that really have a rank-r boat
        # Only the rank-r boat keeps its score, so the row max is that score.
        s_r = remaining.masked_fill(~mask, masked_value).max(dim=1).values
        term = torch.logsumexp(remaining, dim=1) - s_r
        total = total + torch.where(has_r, term, torch.zeros_like(term))
        # The boat placed at r leaves the pool for the later positions.
        remaining = remaining.masked_fill(mask, masked_value)
    return total.mean()


In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SimpleCPLNet().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(30):
    # --- training pass: one optimiser step per batch ---
    model.train()
    for ctx, boats, ranks in tqdm.tqdm(loader_train):
        ctx = ctx.to(device)
        boats = boats.to(device)
        ranks = ranks.to(device)
        loss = pl_nll(model(ctx, boats), ranks)
        opt.zero_grad()
        loss.backward()
        opt.step()

    # --- validation pass: sample-weighted mean of the batch losses ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for ctx, boats, ranks in loader_val:
            ctx = ctx.to(device)
            boats = boats.to(device)
            ranks = ranks.to(device)
            batch_nll = pl_nll(model(ctx, boats), ranks).item()
            # Weight by batch size so a short final batch is not over-counted.
            val_loss += batch_nll * len(ctx)
    val_loss /= len(loader_val.dataset)
    print(f"epoch {epoch}  val_nll {val_loss:.4f}")


In [None]:
# Write the trained weights together with what is needed to rebuild the
# inputs: the two value -> id mappings and the path of the dumped scaler
# (the path string only, not the scaler object itself).
torch.save(
    dict(
        state_dict=model.state_dict(),
        scaler=scaler_filename,
        venue2id=venue2id,
        race_type2id=race_type2id,
    ),
    "cplnet_checkpoint.pt",
)