In [2]:
## This file uses all generated data, so it needs to be run after the files that use kaggle data

In [47]:
import numpy as np
import pandas as pd
import os
from itertools import combinations


In [49]:
# Load the gold data table; per the note at the top of this notebook it is
# generated by other notebooks, which must have been run first.
gold_data_path = '../data/2023/gold_data/gold_data_all.csv'
data = pd.read_csv(gold_data_path)

In [50]:
# Columns that identify a team-season row; they are carried along but never
# differenced (only `input_features` are turned into diff_* columns below).
id_cols = ['TeamID', 'team_name', 'Seed', 'Season']

# Per-team features used to build matchup differences.  The ORDER matters: it
# fixes the column order of the selected frame and of the generated diff_*
# columns.  Rows below are just a visual grouping of adjacent names.
# NOTE(review): the meaning of each column is not visible in this notebook;
# confirm against the gold_data schema before relying on any grouping.
input_features = [
    'is_ap_pre_top_5', 'is_ap_pre_top_15', 'is_ap_pre_top_25',
    'wins_cur', 'losses_cur', 'games_cur', 'coach_wl_cur',
    'wins_car', 'losses_car', 'games_car', 'coach_wl_car',
    'tourneys_car', 'sw16_car', 'ff_car', 'champ_car',
    'preseason_pts', 'returning_min_pct', 'returning_score_pct',
    'g', 'wins', 'losses', 'win_loss_pct', 'srs', 'sos',
    'wins_conf', 'losses_conf', 'wins_home', 'losses_home',
    'wins_visitor', 'losses_visitor',
    'pts', 'opp_pts', 'mp',
    'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct',
    'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
    'avg_yr', 'avg_height', 'avg_weight', 'pi_i',
    'total_games', 'num_wins', 'num_losses', 'win_pct',
    'avg_points_for', 'avg_points_against', 'avg_game_margin', 'std_game_margin',
    'non_home_wins', 'home_losses', 'close_wins', 'close_losses',
    '3FGA_pg', '3FGM_pg', '3_pct', 'FTM_pg', 'FTA_pg', 'FT_pct', 'TOV_pg',
    'avg_pace', 'off_eff', 'def_eff', 'pythag_win_pct', 'luck',
    'wins_vs_tourney_teams', 'away_wins_vs_tourney_teams',
    'losses_vs_tourney_teams', 'losses_vs_non_tourney_teams',
    'games_vs_tourney_teams',
    'final_net', 'prev_net', 'net_improvement',
    'final_pom', 'prev_pom', 'pom_improvement',
]

In [51]:
# Narrow the table to the identifier columns followed by the model features,
# in that order.  A missing column raises KeyError here.
selected_columns = id_cols + input_features
data = data.loc[:, selected_columns]

In [52]:
# Tournament results, restricted to seasons at or after the earliest season
# present in the feature table.
earliest_feature_season = data['Season'].min()
all_tourney_games = pd.read_csv('../data/2024/kaggle_data/MNCAATourneyCompactResults.csv')
tourney_games = all_tourney_games.loc[all_tourney_games['Season'] >= earliest_feature_season]

In [53]:
# Join the feature table twice: once keyed on the winning team, once on the
# losing team.  The '_w' / '_l' suffixes are applied by the second merge, where
# the winner's feature columns collide with the loser's.  Both merges use the
# default inner join, so games without features for either team are dropped.
# Note: this cell overwrites `tourney_games`, so it must not be re-run on its own.
games_with_winner = tourney_games.merge(
    data,
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID'],
)
tourney_games = games_with_winner.merge(
    data,
    left_on=['Season', 'LTeamID'],
    right_on=['Season', 'TeamID'],
    suffixes=('_w', '_l'),
)

In [54]:
# Each game is represented from both perspectives so the target is symmetric:
# `winner_first` has a positive score margin, `loser_first` the negated one.
winner_first = tourney_games.copy()
winner_first['diff_score'] = winner_first['WScore'] - winner_first['LScore']

loser_first = tourney_games.copy()
# Use this frame's own columns; the previous version subtracted
# winner_first['WScore'], which only worked because both copies share an index.
loser_first['diff_score'] = loser_first['LScore'] - loser_first['WScore']


In [56]:
# Build one diff_<feature> column per input feature, oriented to match each
# frame: winner minus loser in `winner_first`, loser minus winner in `loser_first`.
for col in input_features:
    try:
        winner_first['diff_'+col] = winner_first[col+'_w'] - winner_first[col+'_l']
        loser_first['diff_'+col] = loser_first[col+'_l'] - loser_first[col+'_w']
    except TypeError:
        # Subtraction is not defined for this column's dtype (e.g. booleans or
        # objects), so convert element-wise to float first.  Only TypeError is
        # caught: a missing column (KeyError) or an interrupt must still surface
        # instead of being silently retried.
        winner_first['diff_'+col] = winner_first[col+'_w'].map(float) - winner_first[col+'_l'].map(float)
        loser_first['diff_'+col] = loser_first[col+'_l'].map(float) - loser_first[col+'_w'].map(float)

In [57]:
# Stack both orientations of every game into one training table with a fresh
# 0..n-1 index (winner-first rows come first).
both_orientations = [winner_first, loser_first]
model_data = pd.concat(both_orientations, ignore_index=True)

In [58]:
# Preview the first five rows of the stacked table.
model_data.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,TeamID_w,team_name_w,...,diff_games_vs_tourney_teams,diff_final_net,diff_prev_net,diff_net_improvement,diff_final_pom,diff_prev_pom,diff_pom_improvement,diff_is_ap_pre_top_5,diff_is_ap_pre_top_15,diff_is_ap_pre_top_25
0,2003,134,1421,92,1411,84,N,1,1421,UNC Asheville,...,5.0,-31.0,-20.0,11.0,5.0,13.0,8.0,0.0,0.0,0.0
1,2003,136,1112,80,1436,51,N,0,1112,Arizona,...,11.0,-142.0,-177.0,-35.0,-141.0,-179.0,-38.0,1.0,1.0,1.0
2,2003,138,1112,96,1211,95,N,2,1112,Arizona,...,2.0,-41.0,-38.0,3.0,-38.0,-26.0,12.0,1.0,1.0,0.0
3,2003,143,1112,88,1323,71,N,0,1112,Arizona,...,1.0,-12.0,4.0,16.0,-19.0,-9.0,10.0,1.0,1.0,1.0
4,2003,136,1113,84,1272,71,N,0,1113,Arizona State,...,7.0,6.0,-10.0,-16.0,12.0,-25.0,-37.0,0.0,0.0,0.0


In [62]:
# Write the training table to disk.
# NOTE(review): this path is relative to the notebook directory, while every
# read in this notebook goes through '../data/...'. Confirm whether the output
# was meant to live under '../data/2024/model_data/' instead.
model_data_dir = 'data/2024/model_data/'
# exist_ok=True keeps the cell re-runnable; without it a second run raises
# FileExistsError before anything is written.
os.makedirs(model_data_dir, exist_ok=True)
model_data.to_csv(os.path.join(model_data_dir, 'matchup_features.csv'), index=False)

## 2024 Matchups

In [64]:
# Tournament seeds file; the 'Season' and 'TeamID' columns are used below.
seeds_path = "../data/2024/kaggle_data/MNCAATourneySeeds.csv"
seeds = pd.read_csv(seeds_path)

In [53]:
# Season whose seeded teams are paired up.  This single value drives BOTH the
# seed filter and the 'Season' key used to join features afterwards.  Before,
# the two disagreed (filter on 2024, key 2023) and the filtered frame was never
# used, so pairs were generated from every season's seeds at once.
# NOTE(review): 2023 is kept because the Season key, the feature table and the
# output path in this notebook are all 2023; change this one value when the
# notebook is moved to 2024 feature data — confirm.
matchup_season = 2023
curr = seeds[seeds['Season'] == matchup_season]

# Sorted unique IDs make combinations() yield every unordered pair exactly
# once, with team1 < team2 and no self-pairs, so no set/frozenset
# de-duplication (which also left the team order arbitrary) is required.
team_ids = sorted(curr['TeamID'].unique())
combos = list(combinations(team_ids, 2))

# One row per possible matchup, keyed by season for the feature joins.
combo_df = pd.DataFrame(combos, columns=['team1', 'team2'])
combo_df['Season'] = matchup_season

In [61]:
# 2023 matchups
bracket_data = combo_df.merge(data, left_on=['Season', 'team1'], right_on=['Season', 'TeamID'])\
.merge(data, left_on=['Season', 'team2'], right_on=['Season', 'TeamID'], suffixes=['_1', '_2'])

In [63]:
# Build diff_<feature> = team 1 minus team 2 for every input feature, using the
# same fallback as the training-data loop so both tables are derived identically.
for col in input_features:
    try:
        bracket_data['diff_'+col] = bracket_data[col+'_1'] - bracket_data[col+'_2']
    except TypeError:
        # Subtraction is not defined for this column's dtype (e.g. booleans or
        # objects); convert element-wise to float first.
        bracket_data['diff_'+col] = bracket_data[col+'_1'].map(float) - bracket_data[col+'_2'].map(float)

# Name the diff columns explicitly rather than substring-matching 'diff', so an
# unrelated column that happens to contain that text can never slip in.
diff_cols = ['diff_'+col for col in input_features]
matchup_cols = ['Season', 'TeamID_1', 'TeamID_2', 'Seed_1', 'Seed_2'] + diff_cols
# Keep only identifiers and diffs.  This overwrites `bracket_data`, so the cell
# cannot be re-run without re-running the merge cell first.
bracket_data = bracket_data[matchup_cols]

In [65]:
# Persist the matchup feature table without the index column.
bracket_output_path = '../data/2023/bracket_data/bracket_data.csv'
bracket_data.to_csv(bracket_output_path, index=False)