In [2]:
## This notebook uses all of the generated data, so it must be run after the files that use the Kaggle data

In [47]:
import numpy as np
import pandas as pd
import os
from itertools import combinations


In [49]:
# Read the gold-data CSV; later cells key this table on TeamID and Season.
data = pd.read_csv(filepath_or_buffer='../data/2023/gold_data/gold_data_all.csv')

In [50]:
# Identifier columns: kept alongside the features when `data` is trimmed,
# but never differenced (the diff loops iterate over `input_features` only).
id_cols = ['TeamID','team_name','Seed','Season']
# Feature columns. For every matchup each of these is turned into a
# `diff_<name>` column (one team's value minus the other team's value).
input_features = [
 'is_ap_pre_top_5',
 'is_ap_pre_top_15',
 'is_ap_pre_top_25',
 'wins_cur',
 'losses_cur',
 'games_cur',
 'coach_wl_cur',
 'wins_car',
 'losses_car',
 'games_car',
 'coach_wl_car',
 'tourneys_car',
 'sw16_car',
 'ff_car',
 'champ_car',
 'preseason_pts',
 'returning_min_pct',
 'returning_score_pct',
 'g',
 'wins',
 'losses',
 'win_loss_pct',
 'srs',
 'sos',
 'wins_conf',
 'losses_conf',
 'wins_home',
 'losses_home',
 'wins_visitor',
 'losses_visitor',
 'pts',
 'opp_pts',
 'mp',
 'fg',
 'fga',
 'fg_pct',
 'fg3',
 'fg3a',
 'fg3_pct',
 'ft',
 'fta',
 'ft_pct',
 'orb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'avg_yr',
 'avg_height',
 'avg_weight',
 'pi_i',
 'total_games',
 'num_wins',
 'num_losses',
 'win_pct',
 'avg_points_for',
 'avg_points_against',
 'avg_game_margin',
 'std_game_margin',
 'non_home_wins',
 'home_losses',
 'close_wins',
 'close_losses',
 '3FGA_pg',
 '3FGM_pg',
 '3_pct',
 'FTM_pg',
 'FTA_pg',
 'FT_pct',
 'TOV_pg',
 'avg_pace',
 'off_eff',
 'def_eff',
 'pythag_win_pct',
 'luck',
 'wins_vs_tourney_teams',
 'away_wins_vs_tourney_teams',
 'losses_vs_tourney_teams',
 'losses_vs_non_tourney_teams',
 'games_vs_tourney_teams',
 'final_net',
 'prev_net',
 'net_improvement',
 'final_pom',
 'prev_pom',
 'pom_improvement'
]

In [51]:
# Trim the table to the identifier columns followed by the model features.
data = data.loc[:, id_cols + input_features]

In [52]:
# Load the compact tournament results and drop every season that starts
# before the earliest season present in `data`.
tourney_games = (
    pd.read_csv('../data/2024/kaggle_data/MNCAATourneyCompactResults.csv')
    .loc[lambda games: games['Season'] >= data['Season'].min()]
)

In [53]:
# Attach team features to each game twice: first for the winning team, then
# for the losing team. The second merge is where the column names collide, so
# that is where the winner's columns get `_w` and the loser's columns get `_l`.
tourney_games = (
    tourney_games
    .merge(data, how='inner', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .merge(
        data,
        how='inner',
        left_on=['Season', 'LTeamID'],
        right_on=['Season', 'TeamID'],
        suffixes=['_w', '_l'],
    )
)

In [54]:
# Two mirrored views of every game: one from the winner's perspective and one
# from the loser's. `diff_score` is the score margin from that perspective
# (positive in winner_first, negative in loser_first).
winner_first = tourney_games.copy()
winner_first['diff_score'] = winner_first['WScore'] - winner_first['LScore']
loser_first = tourney_games.copy()
# Read both scores from loser_first itself; the previous version pulled WScore
# from winner_first, which only worked because the two frames are aligned copies.
loser_first['diff_score'] = loser_first['LScore'] - loser_first['WScore']


In [56]:
# Add one `diff_<feature>` column per input feature to both views:
#   winner_first: winner's value minus loser's value
#   loser_first:  loser's value minus winner's value
for col in input_features:
    w_col, l_col = col + '_w', col + '_l'
    try:
        winner_first['diff_' + col] = winner_first[w_col] - winner_first[l_col]
        loser_first['diff_' + col] = loser_first[l_col] - loser_first[w_col]
    except TypeError:
        # Subtraction is not defined for boolean columns (NumPy raises
        # TypeError), so cast to float first. Only TypeError is caught so that
        # any other failure (e.g. a missing column) surfaces immediately
        # instead of being retried silently.
        winner_first['diff_' + col] = (
            winner_first[w_col].astype(float) - winner_first[l_col].astype(float)
        )
        loser_first['diff_' + col] = (
            loser_first[l_col].astype(float) - loser_first[w_col].astype(float)
        )

In [57]:
# Stack the winner-first rows on top of the loser-first rows and renumber the index.
model_data = pd.concat(objs=[winner_first, loser_first], axis=0, ignore_index=True)

In [58]:
# Preview the first five rows of the stacked training table.
model_data.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,TeamID_w,team_name_w,...,diff_games_vs_tourney_teams,diff_final_net,diff_prev_net,diff_net_improvement,diff_final_pom,diff_prev_pom,diff_pom_improvement,diff_is_ap_pre_top_5,diff_is_ap_pre_top_15,diff_is_ap_pre_top_25
0,2003,134,1421,92,1411,84,N,1,1421,UNC Asheville,...,5.0,-31.0,-20.0,11.0,5.0,13.0,8.0,0.0,0.0,0.0
1,2003,136,1112,80,1436,51,N,0,1112,Arizona,...,11.0,-142.0,-177.0,-35.0,-141.0,-179.0,-38.0,1.0,1.0,1.0
2,2003,138,1112,96,1211,95,N,2,1112,Arizona,...,2.0,-41.0,-38.0,3.0,-38.0,-26.0,12.0,1.0,1.0,0.0
3,2003,143,1112,88,1323,71,N,0,1112,Arizona,...,1.0,-12.0,4.0,16.0,-19.0,-9.0,10.0,1.0,1.0,1.0
4,2003,136,1113,84,1272,71,N,0,1113,Arizona State,...,7.0,6.0,-10.0,-16.0,12.0,-25.0,-37.0,0.0,0.0,0.0


In [65]:
# Create the output directory if needed and write the training table.
# exist_ok=True keeps this cell re-runnable: without it, os.makedirs raises
# FileExistsError on every run after the first.
os.makedirs('../data/2024/model_data/', exist_ok=True)
model_data.to_csv('../data/2024/model_data/matchup_features.csv', index=False)

## 2024 Matchups

In [69]:
# Load the tournament seed table (columns shown below: Season, Seed, TeamID).
seeds = pd.read_csv(filepath_or_buffer="../data/2024/kaggle_data/MNCAATourneySeeds.csv")

In [70]:
# Keep only the rows for the 2024 season.
curr = seeds.loc[seeds['Season'] == 2024]


In [71]:
# Preview the first five 2024 seed rows.
curr.head(n=5)

Unnamed: 0,Season,Seed,TeamID
2490,2024,W01,1163
2491,2024,W02,1235
2492,2024,W03,1228
2493,2024,W04,1120
2494,2024,W05,1361


In [72]:
# Load the 2024 gold-data CSV.
gold_data = pd.read_csv(filepath_or_buffer='../data/2024/gold_data/gold_data_all.csv')

In [74]:
# Restrict the gold data to rows whose `year` is 2024.
gold_data = gold_data.loc[gold_data['year'] == 2024]

In [82]:
# Let pandas render up to 100 columns so wide previews are not truncated as early.
pd.options.display.max_columns = 100

In [83]:
# Preview the first five 2024 gold-data rows.
gold_data.head(n=5)

Unnamed: 0,TeamID,year,team_left_1,coach_name,team_name,coach_since,is_ap_pre_top_5,is_ap_pre_top_15,is_ap_pre_top_25,wins_cur,losses_cur,games_cur,coach_wl_cur,wins_car,losses_car,games_car,coach_wl_car,tourneys_car,sw16_car,ff_car,champ_car,team_right_1,preseason_pts,team_left_3,returning_min_pct,returning_score_pct,team_right_3,school_name,g,wins,losses,win_loss_pct,srs,sos,wins_conf,losses_conf,wins_home,losses_home,wins_visitor,losses_visitor,pts,opp_pts,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,...,ft_pct,orb,trb,ast,stl,blk,tov,pf,team,avg_yr,avg_height,avg_weight,pi_i,Season,total_games,num_wins,num_losses,win_pct,avg_points_for,avg_points_against,avg_game_margin,std_game_margin,non_home_wins,home_losses,close_wins,close_losses,3FGA_pg,3FGM_pg,3_pct,FTM_pg,FTA_pg,FT_pct,TOV_pg,avg_pace,off_eff,def_eff,pythag_win_pct,luck,Seed,wins_vs_tourney_teams,away_wins_vs_tourney_teams,losses_vs_tourney_teams,losses_vs_non_tourney_teams,games_vs_tourney_teams,final_net,prev_net,net_improvement,final_pom,prev_pom,pom_improvement
1351,1163,2024,connecticut,Dan Hurley,UConn,2018-19,False,True,True,131,58,189,0.693,282,163,445,0.634,5,1,1,1,uconn,1243.0,connecticut,42.6,36.6,connecticut,Connecticut,30,27,3,0.9,24.2,6.94,17.0,2.0,16.0,0.0,7.0,3.0,2441.0,1923.0,1200.0,868.0,1755.0,0.495,263.0,721.0,0.365,442.0,...,0.738,360.0,1158.0,538.0,193.0,164.0,300.0,497.0,connecticut,2.619861,78.485912,209.799538,0.008361,2024,34,31,3,0.911765,81.470588,64.411765,17.058824,15.107344,14.0,0.0,1.0,0.0,23.970588,8.794118,0.366871,14.5,19.529412,0.74247,9.117647,64.102588,127.778254,100.196867,0.963318,-0.051553,W01,7.0,5.0,2.0,1.0,9.0,3.0,3.0,0.0,2.0,3.0,1.0
1352,1235,2024,iowa-state,T.J. Otzelberger,Iowa State,2021-22,False,False,False,65,33,98,0.663,164,96,260,0.631,4,1,0,0,,,iowa-state,37.5,30.5,iowa-state,Iowa State,30,24,6,0.8,22.17,7.8,13.0,4.0,18.0,0.0,5.0,4.0,2290.0,1859.0,1200.0,824.0,1777.0,0.464,197.0,564.0,0.349,445.0,...,0.692,342.0,1036.0,482.0,314.0,88.0,328.0,495.0,iowa-state,2.764216,77.98012,201.320388,0.007118,2024,34,27,7,0.794118,75.558824,61.294118,14.264706,18.201986,9.0,0.0,1.0,1.0,18.735294,6.529412,0.348509,14.735294,21.147059,0.696801,10.205882,65.547671,114.011203,94.409341,0.948356,-0.154238,W02,10.0,4.0,4.0,3.0,14.0,9.0,8.0,-1.0,12.0,9.0,-3.0
1353,1228,2024,illinois,Brad Underwood *,Illinois,2017-18,False,False,True,136,87,223,0.61,245,114,359,0.682,7,0,0,0,illinois,104.0,illinois,59.0,58.9,illinois,Illinois,30,22,8,0.733,19.92,8.96,13.0,6.0,15.0,3.0,5.0,5.0,2529.0,2200.0,1210.0,880.0,1863.0,0.472,254.0,723.0,0.351,515.0,...,0.745,384.0,1232.0,389.0,135.0,112.0,325.0,459.0,illinois,3.343062,78.455321,207.325999,0.006234,2024,34,26,8,0.764706,84.411765,73.794118,10.617647,12.581533,11.0,3.0,3.0,2.0,23.823529,8.323529,0.349383,17.5,23.5,0.744681,10.294118,68.170667,123.372793,107.788271,0.866446,-0.10174,W03,9.0,4.0,6.0,2.0,15.0,15.0,17.0,2.0,10.0,11.0,1.0
1354,1120,2024,auburn,Bruce Pearl,Auburn,2014-15,False,False,False,210,118,328,0.64,441,217,658,0.67,12,5,1,0,auburn,27.0,auburn,58.1,60.2,auburn,Auburn,30,23,7,0.767,22.78,7.81,12.0,5.0,14.0,1.0,5.0,5.0,2496.0,2047.0,1200.0,860.0,1824.0,0.471,241.0,695.0,0.347,535.0,...,0.76,348.0,1143.0,531.0,224.0,180.0,317.0,571.0,auburn,3.065228,77.667146,208.910791,0.007428,2024,34,27,7,0.794118,83.323529,68.029412,15.294118,13.832037,12.0,1.0,0.0,0.0,22.794118,8.029412,0.352258,17.470588,23.235294,0.751899,10.147059,68.152094,121.567992,100.387315,0.943791,-0.149673,W04,7.0,3.0,6.0,1.0,13.0,6.0,7.0,1.0,4.0,5.0,1.0
1355,1361,2024,san-diego-state,Brian Dutcher,San Diego State,2017-18,False,False,True,173,55,228,0.759,173,55,228,0.759,4,1,1,0,san diego state,529.0,san-diego-state,48.0,49.1,san-diego-state,San Diego State,31,22,9,0.71,15.48,8.35,11.0,7.0,14.0,1.0,5.0,8.0,2315.0,2054.0,1260.0,805.0,1810.0,0.445,218.0,689.0,0.316,487.0,...,0.736,350.0,1164.0,407.0,230.0,119.0,348.0,536.0,san-diego-state,3.271754,77.219545,200.877733,0.005245,2024,32,22,10,0.6875,74.1875,67.34375,6.84375,11.597708,10.0,1.0,6.0,3.0,22.09375,6.90625,0.312588,15.96875,21.90625,0.728959,10.28125,64.453533,112.801118,102.695663,0.793522,-0.106022,W05,8.0,3.0,9.0,1.0,17.0,20.0,18.0,-2.0,20.0,20.0,0.0


In [84]:
# Number of rows in the 2024 seed table.
curr.shape[0]

68

In [85]:
# Add the `team_left_1` and `team_name` columns from the gold data to each
# seeded team, matching on TeamID.
curr = curr.merge(
    gold_data.loc[:, ['TeamID', 'team_left_1', 'team_name']],
    how='inner',
    on='TeamID',
)

In [88]:
# Expose `team_left_1` under the name `sr_name`.
curr = curr.assign(sr_name=curr['team_left_1'])

In [89]:
# Keep just the four columns that are exported, in export order.
curr = curr.loc[:, ['Seed', 'TeamID', 'sr_name', 'team_name']]

In [91]:
# Index the table by seed so the JSON export below is keyed by seed.
curr = curr.set_index('Seed')

In [92]:
# Write one JSON object per index entry (orient='index'), pretty-printed.
curr.to_json(
    path_or_buf='../data/2024/model_data/seeds.json',
    orient='index',
    indent=4,
)

In [53]:

# Enumerate every unordered pair of distinct team IDs that appear in `seeds`.
#
# `seeds` holds one row per team per season, so the same TeamID occurs many
# times. De-duplicating and sorting the IDs *before* calling `combinations`
# means each pair is produced exactly once, always with team1 < team2:
#   * no millions of redundant tuples to build and then collapse,
#   * no self-pairs (a team paired with itself) to filter out afterwards,
#   * a deterministic team1/team2 orientation instead of set iteration order.
team_ids = sorted(seeds['TeamID'].unique())
combos = list(combinations(team_ids, 2))

# Same pairs as order-insensitive sets; kept so anything that still refers to
# `unique_pairs` keeps working.
unique_pairs = set(frozenset(pair) for pair in combos)

# create a new dataframe from the combinations list
combo_df = pd.DataFrame(combos, columns=['team1', 'team2'])
# NOTE(review): the season is hard-coded to 2023 although this section is
# headed "2024 Matchups" -- confirm which season is intended.
combo_df['Season']=2023

In [61]:
# 2023 matchups
# Attach team features to each candidate pair: first for team1, then for
# team2. The second merge is where the column names collide, so that is where
# team1's columns get `_1` and team2's columns get `_2`.
bracket_data = (
    combo_df
    .merge(data, how='inner', left_on=['Season', 'team1'], right_on=['Season', 'TeamID'])
    .merge(
        data,
        how='inner',
        left_on=['Season', 'team2'],
        right_on=['Season', 'TeamID'],
        suffixes=['_1', '_2'],
    )
)

In [63]:
# Build one `diff_<feature>` column (team 1's value minus team 2's value) per
# input feature, then keep only the matchup keys plus those diff columns.
diff_frames = {}
for col in input_features:
    first, second = bracket_data[col + '_1'], bracket_data[col + '_2']
    try:
        diff_frames['diff_' + col] = first - second
    except TypeError:
        # Subtraction is not defined for boolean columns (the gold data shows
        # True/False in the is_ap_pre_top_* features), so cast to float first,
        # mirroring how the tournament-game diffs are computed.
        diff_frames['diff_' + col] = first.astype(float) - second.astype(float)

# Derive the diff column names from the feature list rather than from a
# substring match on 'diff', which would also pick up any raw column whose
# name happens to contain "diff".
diff_cols = list(diff_frames)
matchup_cols = ['Season', 'TeamID_1', 'TeamID_2', 'Seed_1', 'Seed_2'] + diff_cols

# Assemble the result in one concat instead of inserting ~90 columns one at a
# time, which fragments the frame.
bracket_data = pd.concat(
    [
        bracket_data[['Season', 'TeamID_1', 'TeamID_2', 'Seed_1', 'Seed_2']],
        pd.DataFrame(diff_frames, index=bracket_data.index),
    ],
    axis=1,
)

In [65]:
# Write the matchup table. Create the target directory first (as the
# model_data export does) so the write does not fail on a fresh checkout;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('../data/2023/bracket_data/', exist_ok=True)
bracket_data.to_csv('../data/2023/bracket_data/bracket_data.csv', index=False)