In [1]:
from ibis.interactive import *
from snowflake.snowpark import Session, functions as F, types as T
from snowflake.snowpark.functions import when, lit
from snowflake.ml.modeling.preprocessing import OneHotEncoder

import os

con = ibis.snowflake.connect()

 * To change owner, run `chown $USER "/Users/cromano/.snowflake/config.toml"`.
 * To restrict permissions, run `chmod 0600 "/Users/cromano/.snowflake/config.toml"`.

  warn(f"Bad owner or permissions on {str(filep)}{chmod_message}")


In [2]:
def combine_stats(con, table_name: str, schema: str):
    t = con.table(table_name, schema=schema)
    t = t.mutate(W1COLUMN=_.WSCORE)
    t = t.mutate(W2COLUMN=_.LSCORE)
    winning = t.select("SEASON", "DAYNUM", s.startswith("W")).mutate(WON=1)
    winning = winning.rename(
        {
            c[1:]: c
            for c in winning.columns
            if c.startswith("W") and c not in ["WON", "WLOC", "W1COLUMN", "W2COLUMN"]
        }
    )

    losing = t.select(
        "SEASON", "DAYNUM", "WLOC", "W1COLUMN", "W2COLUMN", s.startswith("L")
    ).mutate(WON=0)
    losing = losing.rename(
        {
            c[1:]: c
            for c in losing.columns
            if c.startswith("L") and c not in ["WON", "WLOC"]
        }
    )
    return winning.union(losing)


def flatten_regions(con, table_name: str, schema: str):
    flattened_regions = (
        con.table(table_name, schema=schema)
        .pivot_longer(s.startswith("Region"))
        .rename({"Region": "name", "RegionName": "value"})
        .mutate(Region=_.Region.replace("Region", ""))
        .drop("DayZero")
    )
    return flattened_regions

In [3]:
m_reg = combine_stats(con, "W_REGULAR_SEASON_DETAILED_RESULTS", schema="WOMEN")

In [4]:
m_reg

In [5]:
w_margin = (
    m_reg.filter(_.WON == 1)
    .mutate(SCOREDIFF=_.W1COLUMN - _.W2COLUMN)
    .group_by(["SEASON", "TEAMID"])
    .agg(WINMARGINMEDIAN=_.SCOREDIFF.median(), WINMARGINMEAN=_.SCOREDIFF.mean())
)

l_margin = (
    m_reg.filter(_.WON == 0)
    .mutate(SCOREDIFF=_.W1COLUMN - _.W2COLUMN)
    .group_by(["SEASON", "TEAMID"])
    .agg(LOSEMARGINMEDIAN=_.SCOREDIFF.median(), LOSEMARGINMEAN=_.SCOREDIFF.mean())
)

m_season_margin = w_margin.join(l_margin, (["SEASON", "TEAMID"]))
m_reg = m_reg.drop(['W1COLUMN','W2COLUMN'])

In [6]:
season_stats = (
    m_reg.drop("DAYNUM")
    .group_by(["SEASON", "TEAMID"])
    .agg(s.across(s.numeric(), dict(MEAN=_.mean(), MEDIAN=_.median(), STDDEV=_.std())))
    .drop(s.startswith("WON_"), s.startswith("SEASON_"), s.startswith("TEAMID_"))
)

In [7]:
hna = (
    m_reg.group_by(["SEASON", "TEAMID", "WLOC"])
    .agg(WINCOUNT=_.WON.sum())
    .mutate(WLOC="WLOC" + _.WLOC)
    .pivot_wider(names_from="WLOC", values_from="WINCOUNT")
    .mutate(s.across(s.startswith("WLOC"), ibis.coalesce(_, 0)))
)

In [8]:
season_joined = (
    season_stats.join(hna, ["SEASON", "TEAMID"])
    .join(m_season_margin, ["SEASON", "TEAMID"])
).drop(s.endswith("_right")).distinct()

season_joined =season_joined.fillna(0)

In [9]:
conf_wins = (
    con.table("W_CONFERENCE_TOURNEY_GAMES")
    .mutate(
        ROWNUM=ibis.row_number().over(
            group_by=["SEASON", "CONFABBREV"], order_by=_.DAYNUM.desc()
        )
    )
    .filter(_.ROWNUM == 0)
    .drop(["DAYNUM", "ROWNUM", "LTEAMID", "CONFABBREV"])
    .mutate(WON_CONFERENCE=1)
    .rename({"TEAMID": "WTEAMID"})
)

final = (
    season_joined.join(conf_wins, ["SEASON", "TEAMID"], how="left")
    .mutate(WON_CONFERENCE=_.WON_CONFERENCE.fillna(0))
    .drop(s.endswith("_right"))
    .mutate(TOTAL_WINS=_.WLOCN + _.WLOCH + _.WLOCA)
)

ProgrammingError: 002003 (42S02): SQL compilation error:
Table 'W_CONFERENCE_TOURNEY_GAMES' does not exist or not authorized.

In [10]:
# This is super hacky, but I need to be able to use the same session to share cached tables.
# I also want to avoid this message: SnowparkSessionException: (1409): More than one active session is detected. When you call...

@classmethod
def from_ibis(self, con) -> Session:
    return Session.builder.config("connection", con.con).getOrCreate()

Session.from_ibis = from_ibis

session = Session.from_ibis(con)

In [12]:
season = session.sql(season_joined.to_sql(final))

NameError: name 'final' is not defined

In [None]:
seeds = session.table('MEN.M_NCAATOURNEY_SEEDS')
seeds.show()

### Kaggle had the play in games wrongso lets replace them

In [None]:
seeds = seeds.withColumn(
    "TEAMID",
    when((seeds["TEAMID"] == 1129) & (seeds["SEASON"] == 2024), 1160).otherwise(seeds["TEAMID"]),
)

In [None]:
seed_value = (
    seeds
    .with_column("REGION", F.substring(F.col("SEED"), 1, 1))
    .with_column(
        "SEED", F.substring(F.col("SEED"), 2, F.length(F.col("SEED")) - 1)
    )
    .select("SEASON", "TEAMID", "REGION", "SEED")
    .with_column(
        "SEED",
        F.cast(F.regexp_replace(F.col("SEED"), "[a-z]", ""), T.IntegerType()),
    )
)

seed_value.show()

In [None]:
tourney = session.table('MEN.M_NCAATOURNEY_COMPACT_RESULTS')
tourney = tourney.select('SEASON','WTEAMID','LTEAMID','WSCORE','LSCORE','DAYNUM')
tourney.show()

In [None]:
tourney_round = tourney.with_column(
    "ROUND",
    when((tourney.daynum >= 134) & (tourney.daynum <= 135), lit(0))
    .when((tourney.daynum >= 136) & (tourney.daynum <= 137), lit(1))
    .when((tourney.daynum >= 138) & (tourney.daynum <= 139), lit(2))
    .when((tourney.daynum >= 143) & (tourney.daynum <= 144), lit(3))
    .when(tourney.daynum == 145, lit(4))
    .when(tourney.daynum == 152, lit(5))
    .otherwise(lit(6)),
).drop("DAYNUM")

tourney_round.show()

In [None]:
## Add in conference names, uppercase column headers and values and one hot encode
conf = session.table('MEN.M_TEAM_CONFERENCES')

def fix_values(column):
    return F.upper(F.regexp_replace(F.col(column), "[^a-zA-Z0-9]+", "_"))

conf = conf.with_column("CONFABBREV", fix_values("CONFABBREV"))
conf = conf.with_column_renamed("SEASON", "C_SEASON")
conf = conf.with_column_renamed("TEAMID", "C_TEAMID")

conf.show()

In [None]:
tourney_conf_w = (
    tourney_round.join(
        conf,
        (tourney_round.col("WTEAMID") == conf.col("C_TEAMID"))
        & (tourney_round.col("SEASON") == conf.col("C_SEASON")),
    )
    .drop("C_SEASON", "C_TEAMID")
    .with_column_renamed("CONFABBREV", "W_CONF")
)
tourney_conf_w.show()

In [None]:
tourney_conf_round = tourney_conf_w.join(
    conf,
    (tourney_round.col("LTEAMID") == conf.col("C_TEAMID"))
    & (tourney_round.col("SEASON") == conf.col("C_SEASON"))).drop("C_SEASON","C_TEAMID").with_column_renamed("CONFABBREV", "L_CONF")
tourney_conf_round.show()

In [None]:
tourney_conf_round.show()

In [None]:
w_t = (
    tourney_conf_round.join(
        seed_value,
        (
            (tourney_conf_round.SEASON == seed_value.SEASON)
            & (tourney_conf_round.WTEAMID == seed_value.TEAMID)
        ),
        rsuffix="_W",
    )
    .drop(["SEASON_W", "TEAMID"])
    .with_column_renamed("REGION", "W_REGION")
    .with_column_renamed("SEED", "W_SEED")
    .cache_result()
)

tourney_conf_round = (
    w_t.join(
        seed_value,
        ((w_t.SEASON == seed_value.SEASON) & (w_t.LTEAMID == seed_value.TEAMID)),
        rsuffix="_L",
    )
    .drop(["SEASON_L", "TEAMID"])
    .with_column_renamed("REGION", "L_REGION")
    .with_column_renamed("SEED", "L_SEED")
)

In [None]:
tourney_conf_round.show()

In [None]:
season_w = season.select(
    *[F.col(col).alias(f"W_{col}") for col in season.columns]
)


season_l = season.select(
    *[F.col(col).alias(f"L_{col}") for col in season.columns]
)

In [None]:
final = (
    tourney_conf_round.join(
        season_w,
        on=(
            (tourney_conf_round.WTEAMID == season_w.W_TEAMID)
            & (tourney_conf_round.SEASON == season_w.W_SEASON)
        ),
    )
    .drop("W_TEAMID", "W_SEASON")
    .join(
        season_l,
        on=(
            (tourney_conf_round.LTEAMID == season_l.L_TEAMID)
            & (tourney_conf_round.SEASON == season_l.L_SEASON)
        ),
    )
    .drop("L_TEAMID", "L_SEASON")
)

final.show()

In [None]:
OHE = OneHotEncoder(
    input_cols=["W_CONF","L_CONF"],
    output_cols=["W_CONF","L_CONF"],
    drop_input_cols=True,
    drop="first",
    handle_unknown="ignore",
)

final = OHE.fit(final).transform(final)

In [None]:
final = final.with_columns(
    ["W_WLOCN", "W_WLOCH", "W_WLOCA", "L_WLOCN", "L_WLOCH", "L_WLOCA"],
    [
        F.col("W_WLOCN").cast(T.LongType()),
        F.col("W_WLOCH").cast(T.LongType()),
        F.col("W_WLOCA").cast(T.LongType()),
        F.col("L_WLOCN").cast(T.LongType()),
        F.col("L_WLOCH").cast(T.LongType()),
        F.col("L_WLOCA").cast(T.LongType()),
    ],
)

### This table is all season data joined with historic tournament data

In [None]:
final.write.save_as_table(
    "MEN.FINAL_FEATURES", mode="overwrite"
)

### Create season table for predicting 2024

In [None]:
season = (
    season.join(
        conf,
        (season.col("teamid") == conf.col("C_teamid"))
        & (season.col("season") == conf.col("C_season")),
    )
    .drop("C_SEASON", "C_TEAMID")
    .with_column_renamed("CONFABBREV", "CONF")
)

OHE = OneHotEncoder(
    input_cols=["CONF"],
    output_cols=["CONF"],
    drop_input_cols=True,
    drop="first",
    handle_unknown="ignore",
)

season = OHE.fit(season).transform(season)

In [None]:
region = seed_value.select(
    F.col("SEASON").alias("SEASON_1"), F.col("TEAMID").alias("TEAMID_1"), "REGION"
)

season = season.join(
    region, on=((season.season == region.season_1) & (season.teamid == region.teamid_1))
).drop("TEAMID_1", "SEASON_1")

season.write.save_as_table(
    "MEN.FINAL_SEASON_STATS", mode="overwrite"
)