# In [1]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from catboost import Pool, CatBoostRegressor


class Dataset:
    """Transforms raw NCAA regular and tournament matchup data into a friendly format for training ML models.

    On construction the three CSV files ``{tournament}RegularSeasonDetailedResults.csv``,
    ``{tournament}NCAATourneyDetailedResults.csv`` and ``{tournament}NCAATourneySeeds.csv``
    are read from ``input_dir`` and every per-team feature table is precomputed.
    Each table exists twice, keyed by ``T1_TeamID`` and by ``T2_TeamID``, so that
    ``get_features`` can left-join them onto any frame of (Season, T1_TeamID, T2_TeamID) pairs.
    """

    # Box-score statistic suffixes that appear with a "W" (winner) and an "L" (loser) prefix in the raw files.
    BOXSCORE_STATS = ("FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF")
    # Regular-season games with DayNum above this value are treated as the "last 14 days".
    LAST14_DAYNUM_CUTOFF = 118

    def __init__(self, tournament, input_dir=""):
        """Load the raw files for one tournament and build all feature tables.

        Args:
            tournament: File-name prefix of the competition files; ``get_features`` encodes
                ``"M"`` as 1 and anything else as 0 in the ``tournament`` feature.
            input_dir: Directory that contains the CSV files.
        """
        self.tournament = tournament
        self.input_dir = input_dir
        self.regular_results = pd.read_csv(os.path.join(input_dir, f"{tournament}RegularSeasonDetailedResults.csv"))
        self.tourney_results = pd.read_csv(os.path.join(input_dir, f"{tournament}NCAATourneyDetailedResults.csv"))
        self.seeds = pd.read_csv(os.path.join(input_dir, f"{tournament}NCAATourneySeeds.csv"))
        # Characters 1-2 of the seed label hold the numeric seed (e.g. "W01" -> 1, "X16a" -> 16).
        self.seeds["seed"] = self.seeds["Seed"].str[1:3].astype(int)

        self.regular_data = self.prepare_data(self.regular_results)
        self.tourney_data = self.prepare_data(self.tourney_results)
        self.tourney_data = self.tourney_data[["Season", "NumOT", "T1_TeamID", "T1_Score", "T2_TeamID", "T2_Score"]]

        self.season_stats_T1, self.season_stats_T2 = self.season_stats()
        self.last14days_stats_T1, self.last14days_stats_T2 = self.last14days_stats()
        self.glm_quality_T1, self.glm_quality_T2 = self.team_quality()
        self.seeds_T1, self.seeds_T2 = self.team_seed()

    @staticmethod
    def _relabel(column, winner_prefix, loser_prefix):
        """Map a raw ``W*`` / ``L*`` column name to its team-slot name; other names pass through unchanged."""
        if column.startswith("W"):
            return winner_prefix + column[1:]
        if column.startswith("L"):
            return loser_prefix + column[1:]
        return column

    def prepare_data(self, df):
        """Double the data by switching position of a winning and a losing team.

        Every game appears twice in the result: once with the winner as team 1
        (``T1_*``) and once with the loser as team 1. The input frame is left
        untouched; a new frame is returned.

        Args:
            df: Raw detailed-results frame with ``W*`` / ``L*`` columns and ``WLoc``.

        Returns:
            DataFrame with ``T1_*`` / ``T2_*`` columns plus ``PointDiff``
            (``T1_Score - T2_Score`` clipped to [-30, 30]).
        """
        # drop() returns a new frame, so the caller's raw data keeps its columns.
        games = df.drop(columns="WLoc")
        swap_cols = (
            ["Season", "DayNum", "LTeamID", "LScore", "WTeamID", "WScore", "NumOT"]
            + [f"L{stat}" for stat in self.BOXSCORE_STATS]
            + [f"W{stat}" for stat in self.BOXSCORE_STATS]
        )
        mirrored = games[swap_cols].copy()
        games.columns = [self._relabel(col, "T1_", "T2_") for col in games.columns]
        mirrored.columns = [self._relabel(col, "T2_", "T1_") for col in mirrored.columns]
        # concat aligns on column names, so the column order of `mirrored` does not matter.
        output = pd.concat([games, mirrored]).reset_index(drop=True)
        output["PointDiff"] = (output["T1_Score"] - output["T2_Score"]).clip(-30, 30)
        return output

    def season_stats(self):
        """Aggregate boxscore statistics for the team and its opponent.

        Returns:
            Tuple ``(stats_T1, stats_T2)``: the same per-(Season, team) means of the
            regular-season box-score columns and ``PointDiff``, with every column except
            ``Season`` prefixed ``T1_`` / ``T2_`` and the opponent's numbers labelled
            ``opponent_*``.
        """
        boxscore_cols = (
            [f"T1_{stat}" for stat in self.BOXSCORE_STATS]
            + [f"T2_{stat}" for stat in self.BOXSCORE_STATS]
            + ["PointDiff"]
        )
        per_team = self.regular_data.groupby(["Season", "T1_TeamID"])[boxscore_cols].mean().reset_index()

        def labelled(prefix):
            # "Season" is the shared join key and keeps its name; everything else gets the slot prefix.
            table = per_team.copy()
            table.columns = [
                col if col == "Season" else prefix + col.replace("T1_", "").replace("T2_", "opponent_")
                for col in per_team.columns
            ]
            return table

        return labelled("T1_"), labelled("T2_")

    def last14days_stats(self):
        """Win rate in the last 14 days of a regular season.

        Returns:
            Tuple of frames ``(Season, T1_TeamID, T1_win_ratio_14d)`` and
            ``(Season, T2_TeamID, T2_win_ratio_14d)``.
        """
        recent = self.regular_data.loc[self.regular_data["DayNum"] > self.LAST14_DAYNUM_CUTOFF]

        def win_ratio(team_col, won, name):
            return recent.assign(win=won.astype(int)).groupby(["Season", team_col])["win"].mean().reset_index(name=name)

        # Team 1 wins on a positive point difference, team 2 on a negative one.
        ratio_T1 = win_ratio("T1_TeamID", recent["PointDiff"] > 0, "T1_win_ratio_14d")
        ratio_T2 = win_ratio("T2_TeamID", recent["PointDiff"] < 0, "T2_win_ratio_14d")
        return ratio_T1, ratio_T2

    def team_quality(self):
        """Estimate each team's strength relative to all teams in a season.

        For every season a binomial GLM ``win ~ -1 + T1_TeamID + T2_TeamID`` is fitted
        on the regular-season games; the coefficient of a team's ``T1_TeamID`` level is
        used as its quality.

        Returns:
            Tuple of frames ``(T1_TeamID, T1_quality, Season)`` and ``(T2_TeamID, T2_quality, Season)``.
        """
        effects = self.regular_data[["Season", "T1_TeamID", "T2_TeamID", "PointDiff"]].copy()
        # Team ids are cast to str so the formula treats them as categorical levels.
        effects["T1_TeamID"] = effects["T1_TeamID"].astype(str)
        effects["T2_TeamID"] = effects["T2_TeamID"].astype(str)
        effects["win"] = (effects["PointDiff"] > 0).astype(int)

        def fit_season(season):
            fitted = sm.GLM.from_formula(
                formula="win~-1+T1_TeamID+T2_TeamID",
                data=effects.loc[effects["Season"] == season, :],
                family=sm.families.Binomial(),
            ).fit()

            quality = pd.DataFrame(fitted.params).reset_index()
            quality.columns = ["TeamID", "quality"]
            quality["Season"] = season
            # Keep only the team-1 coefficients.
            quality = quality.loc[quality["TeamID"].str.contains("T1_")].reset_index(drop=True)
            # Parameter names look like "T1_TeamID[1101]"; pull the id out of the brackets
            # instead of relying on a fixed character slice (works for any id length).
            quality["TeamID"] = quality["TeamID"].str.extract(r"\[(?:T\.)?(\d+)\]", expand=False).astype(int)
            return quality

        seasons = np.unique(self.regular_data["Season"])
        glm_quality = pd.concat([fit_season(season) for season in seasons]).reset_index(drop=True)
        glm_quality_T1 = glm_quality.copy()
        glm_quality_T2 = glm_quality.copy()
        glm_quality_T1.columns = ["T1_TeamID", "T1_quality", "Season"]
        glm_quality_T2.columns = ["T2_TeamID", "T2_quality", "Season"]
        return glm_quality_T1, glm_quality_T2

    def team_seed(self):
        """Team seed entering the tournament.

        Returns:
            Tuple of frames ``(Season, T1_TeamID, T1_seed)`` and ``(Season, T2_TeamID, T2_seed)``.
        """
        base = self.seeds[["Season", "TeamID", "seed"]]
        seeds_T1 = base.copy()
        seeds_T2 = base.copy()
        seeds_T1.columns = ["Season", "T1_TeamID", "T1_seed"]
        seeds_T2.columns = ["Season", "T2_TeamID", "T2_seed"]
        return seeds_T1, seeds_T2

    def add_overtimes(self, df):
        """Add games with 0 point difference if overtime.

        Every row with ``NumOT > 0`` is appended a second time with ``T1_Score``
        overwritten by ``T2_Score``. The input frame is not modified.
        """
        df_ot = df.query("NumOT>0").copy()
        df_ot["T1_Score"] = df_ot["T2_Score"]
        return pd.concat([df, df_ot], axis=0).reset_index(drop=True)

    def get_features(self, df=None):
        """Merge all the features into a single dataframe.

        Args:
            df: Frame of team pairs with ``Season``, ``T1_TeamID`` and ``T2_TeamID``
                (testing mode). When ``None`` (train mode) the historical tournament
                games, augmented by ``add_overtimes``, are used. The passed frame is
                copied, never modified.

        Returns:
            ``df`` with a ``tournament`` flag (1 for ``"M"``, else 0) followed by the
            left-joined season stats, 14-day win ratios, GLM qualities and seeds.
        """
        if df is None:  # train mode. testing mode should provide df of team pairs
            df = self.add_overtimes(self.tourney_data.copy())
        else:
            # Work on a copy so the caller's frame does not gain a "tournament" column.
            df = df.copy()
        df["tournament"] = 1 if self.tournament == "M" else 0

        feature_tables = (
            (self.season_stats_T1, "T1_TeamID"),
            (self.season_stats_T2, "T2_TeamID"),
            (self.last14days_stats_T1, "T1_TeamID"),
            (self.last14days_stats_T2, "T2_TeamID"),
            (self.glm_quality_T1, "T1_TeamID"),
            (self.glm_quality_T2, "T2_TeamID"),
            (self.seeds_T1, "T1_TeamID"),
            (self.seeds_T2, "T2_TeamID"),
        )
        for table, team_col in feature_tables:
            df = pd.merge(df, table, on=["Season", team_col], how="left")
        return df


class Model:
    """The task of the models is to predict point difference of teams played in a matchup.
       CatBoost objective `RMSEWithUncertainty` returns both mean and variance of predictions,
       which both will be later used to simulate brackets of the tournament.

       Training first runs ``nmodels`` random 80%/20% validation fits to pick the number of
       boosting iterations, then fits ``nmodels`` full-data models that differ only in
       ``random_state``.
    """

    def __init__(self, data, features, nmodels):
        """Store the training frame and the CatBoost configuration.

        Args:
            data: Training DataFrame; must contain ``T1_Score``, ``T2_Score`` and every feature column.
            features: Column labels used as model inputs.
            nmodels: Number of validation fits and of final models.
        """
        self.data = data
        self.features = features
        self.nmodels = nmodels
        self.catboost_params = {
            "objective": "RMSEWithUncertainty",
            "iterations": 2000,
            "learning_rate": 0.02,
            "random_state": 42,
            "grow_policy": "Lossguide",
            "max_leaves": 3,
            "subsample": 0.2,
            "colsample_bylevel": 0.2,
            "max_bin": 32,
            "reg_lambda": 1,
            "verbose": 0,
        }

    @staticmethod
    def _target(frame):
        """Return the regression target: ``T1_Score - T2_Score`` clipped to [-30, 30]."""
        return (frame["T1_Score"] - frame["T2_Score"]).clip(-30, 30)

    def train_cv_catboost(self, seed):
        """Train a validation model on 80%/20% split.

        Args:
            seed: Seed for NumPy's global random state, which draws the split.

        Returns:
            The fitted model's ``best_iteration_`` on the 20% hold-out.
        """
        # The global NumPy RNG is (re)seeded on purpose so every split is reproducible.
        np.random.seed(seed)
        # Boolean hold-out mask kept local, so the caller's DataFrame gets no helper column.
        holdout = np.random.random(self.data.shape[0]) > 0.8
        fit_rows, eval_rows = self.data.loc[~holdout], self.data.loc[holdout]
        cat_train = Pool(fit_rows[self.features], self._target(fit_rows))
        cat_test = Pool(eval_rows[self.features], self._target(eval_rows))
        model = CatBoostRegressor(**self.catboost_params)
        model.fit(cat_train, eval_set=cat_test)
        return model.best_iteration_

    def train_catboost(self):
        """Train a full data model with the current ``self.catboost_params``."""
        model = CatBoostRegressor(**self.catboost_params)
        model.fit(Pool(self.data[self.features], self._target(self.data)))
        return model

    def get_optimal_rounds(self):
        """Train n validation models and determine optimal number of iterations.

        Returns:
            Mean of the per-split best iterations, truncated to ``int``.
        """
        # train_cv_catboost returns a plain iteration count, so the values are averaged directly.
        cv_results = [self.train_cv_catboost(seed) for seed in range(self.nmodels)]
        return int(np.mean(cv_results))

    def train(self):
        """Train multiple inference models on full data based on optimal CV iterations.

        Updates ``self.catboost_params`` (``iterations`` and ``random_state``) and
        stores the fitted models in ``self.models``.
        """
        # The iteration budget is the same for every final model; compute and set it once.
        self.catboost_params["iterations"] = self.get_optimal_rounds()

        self.models = []
        for seed in range(self.nmodels):
            self.catboost_params["random_state"] = seed
            self.models.append(self.train_catboost())

    def predict(self, data, model_id=None):
        """Make model predictions for a given dataframe.

        Args:
            data: Frame containing every column in ``self.features``.
            model_id: Index into ``self.models``; a random model is drawn from
                NumPy's global RNG when ``None``.

        Returns:
            Whatever the chosen model's ``predict`` returns for the feature columns.
        """
        if model_id is None:
            model_id = np.random.choice(range(self.nmodels))
        return self.models[model_id].predict(data[self.features])


class BracketSimulator:
    """Main class used to read data, train models and simulate matchup results.

    ``simulate`` loads the seeds and slots of one season into ``self.tourney_seeds`` /
    ``self.tourney_slots``; ``generate_bracket`` and ``simulate_round`` read those
    attributes, so they are only usable once they have been set.
    """

    def __init__(self, input_dir="", nmodels=100):
        """Build the men's and women's datasets and train the shared model.

        Args:
            input_dir: Directory containing the competition CSV files.
            nmodels: Number of models passed to ``Model`` (default 100).
        """
        self.input_dir = input_dir
        self.ncaa_men, self.ncaa_women = Dataset("M", input_dir), Dataset("W", input_dir)
        train_data = pd.concat([self.ncaa_men.get_features(), self.ncaa_women.get_features()], axis=0)
        # The first six columns are Season, NumOT, team ids and scores; everything after is a feature.
        self.model = Model(data=train_data, features=train_data.columns[6:], nmodels=nmodels)
        self.model.train()

    def simulate_round(self, slots, round, tournament, var_coef):
        """Simulates a round utilizing model mean/stdev point difference predictions for each matchup.

        Args:
            slots: Frame with ``Season``, ``Seed`` and ``TeamID`` that maps a seed or
                earlier-round slot label to the team occupying it.
            round: Round number; 0 selects the slots whose name does not start with ``"R"``,
                otherwise slots starting with ``f"R{round}"``.
            tournament: ``"W"`` uses the women's feature tables, anything else the men's.
            var_coef: Multiplier applied to the predicted variance before sampling.

        Returns:
            Tuple ``(winners, picks)``: ``winners`` has ``Season``, ``Seed`` (the slot name)
            and ``TeamID``; ``picks`` has ``Season``, ``Slot`` and ``Team`` (the winning
            side's seed/slot label).
        """
        seeds_strong = slots.rename({"Seed": "StrongSeed", "TeamID": "T1_TeamID"}, axis=1)
        seeds_weak = slots.rename({"Seed": "WeakSeed", "TeamID": "T2_TeamID"}, axis=1)

        slot_names = self.tourney_slots["Slot"]
        if round == 0:
            in_round = ~slot_names.str.startswith("R")
        else:
            in_round = slot_names.str.startswith(f"R{round}")
        df = self.tourney_slots.loc[in_round].copy()
        df = pd.merge(df, seeds_strong, on=["Season", "StrongSeed"])
        df = pd.merge(df, seeds_weak, on=["Season", "WeakSeed"])
        dataset = self.ncaa_women if tournament == "W" else self.ncaa_men
        df = dataset.get_features(df)

        # pick a random trained model and predict, then sample matchup outcome from ~N(mu, stdev) distribution
        preds = self.model.predict(df)
        # Column 0 is the predicted mean, column 1 the predicted variance; one draw per matchup.
        df["outcome"] = np.random.normal(preds[:, 0], np.sqrt(var_coef * preds[:, 1]))
        # Vectorised winner selection; unlike row-wise apply it also behaves on an empty round.
        strong_wins = df["outcome"] > 0
        df["Team"] = np.where(strong_wins, df["StrongSeed"], df["WeakSeed"])
        df["TeamID"] = np.where(strong_wins, df["T1_TeamID"], df["T2_TeamID"])

        # First-round overrides: `M` top1 and `W` top4 strong seeds always advance.
        if round == 1:
            locked_pattern = {"M": "01", "W": "0[1234]"}.get(tournament)
            if locked_pattern is not None:
                locked = df["StrongSeed"].str.contains(locked_pattern)
                df.loc[locked, "Team"] = df.loc[locked, "StrongSeed"]
                df.loc[locked, "TeamID"] = df.loc[locked, "T1_TeamID"]

        return df[["Season", "Slot", "TeamID"]].rename({"Slot": "Seed"}, axis=1), df[["Season", "Slot", "Team"]]

    def adjust_team_slot(self, round_curr, round_prev):
        """For later rounds we need to backtrack slots from the first round.

        ``round_curr["Team"]`` holds the name of the previous-round slot the winner came
        from; it is replaced by the ``Team`` recorded for that slot in ``round_prev``.

        Returns:
            Frame with ``Season``, ``Slot`` (current round) and ``Team`` (from ``round_prev``).
        """
        x = pd.merge(round_curr, round_prev, left_on=["Season", "Team"], right_on=["Season", "Slot"])
        return x[["Season", "Slot_x", "Team_y"]].rename({"Slot_x": "Slot", "Team_y": "Team"}, axis=1)

    def generate_bracket(self, bracket_id, tournament, var_coef):
        """Iteratively fills up an empty bracket.

        Simulates round 0 first, then rounds 1-6; each round is fed the winners of the
        two preceding stages, and from round 2 on the picks are backtracked to the
        labels recorded in the previous round.

        Returns:
            Frame with ``Slot``, ``Team``, ``Tournament`` and ``Bracket`` for rounds 1-6.
        """
        older = self.tourney_seeds
        newer, _ = self.simulate_round(older, 0, tournament, var_coef)

        round_picks = []
        for round_no in range(1, 7):
            winners, picks = self.simulate_round(pd.concat([older, newer]), round_no, tournament, var_coef)
            if round_picks:
                # Round 1 already carries seed labels; later rounds reference the previous round's slots.
                picks = self.adjust_team_slot(picks, round_picks[-1])
            round_picks.append(picks)
            older, newer = newer, winners

        bracket = pd.concat(round_picks, axis=0).reset_index(drop=True)
        bracket.drop("Season", axis=1, inplace=True)
        bracket["Tournament"] = tournament
        bracket["Bracket"] = bracket_id
        return bracket

    def simulate(self, nbrackets, var_coef=1.0, season=2024):
        """Main method to simulate n different brackets.

        Args:
            nbrackets: Number of brackets to generate per tournament.
            var_coef: Multiplier applied to the predicted variance when sampling outcomes.
            season: Season whose seeds and slots are simulated (default 2024).

        Returns:
            All brackets for ``"M"`` then ``"W"`` in one frame, with ``RowId`` set to the row index.
        """
        brackets = []
        for tournament in ("M", "W"):
            seeds = pd.read_csv(os.path.join(self.input_dir, f"{tournament}NCAATourneySeeds.csv"))
            slots = pd.read_csv(os.path.join(self.input_dir, f"{tournament}NCAATourneySlots.csv"))
            # Stored on the instance because simulate_round / generate_bracket read them from there.
            self.tourney_seeds = seeds.loc[seeds["Season"] == season].reset_index(drop=True)
            self.tourney_slots = slots.loc[slots["Season"] == season].reset_index(drop=True)
            for i in range(nbrackets):
                brackets.append(self.generate_bracket(i + 1, tournament, var_coef))

        brackets = pd.concat(brackets, axis=0).reset_index(drop=True)
        brackets['RowId'] = brackets.index
        return brackets


if __name__ == "__main__":
    # Script entry point: train on the competition data, simulate brackets and
    # write them out with the column layout of the sample submission file.
    input_dir = "/kaggle/input/march-machine-learning-mania-2024"
    sample_submission_path = os.path.join(input_dir, "sample_submission.csv")
    submission_cols = pd.read_csv(sample_submission_path).columns

    # Constructing the simulator reads the data and trains the models.
    bracket_simulator = BracketSimulator(input_dir=input_dir)
    brackets = bracket_simulator.simulate(nbrackets=10000, var_coef=1.0)

    # Keep only the columns of the sample submission, in its order.
    submission = brackets[submission_cols]
    submission.to_csv("submission.csv", index=None)