### Adding Advanced Tags

In [46]:
# module imports
import numpy as np
import pandas as pd
from scipy.stats import rankdata

In [47]:
# never truncate rows or columns when a frame is displayed in the notebook
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

#### Pre-Cleaning Steps

In [48]:
# read the full merged dataset and remember the column names it arrived with
df = pd.read_csv("../data/full_merged_dataset.csv")
original_cols = list(df.columns)

In [49]:
# make season an ordered categorical (FR < SO < JR < SR) so that sorting by
# season, then date, then match number follows that season order
season_order = pd.CategoricalDtype(['FR', 'SO', 'JR', 'SR'], ordered=True)
df = (
    df.assign(season=df['season'].astype(season_order))
      .sort_values(by=['season', 'date', 'match_no'], kind='mergesort')
      .reset_index(drop=True)
)

#### Cleaning Steps

In [50]:
# winning/losing streaks
def streak(series, want='W'):
    """Return the running count of consecutive `want` values at each position.

    The counter increases by one for every value equal to `want` and resets
    to zero on any other value. The result is a Series carrying the same
    index as `series`.
    """
    run_lengths = []
    current = 0
    for value in series:
        if value == want:
            current += 1
        else:
            current = 0
        run_lengths.append(current)
    return pd.Series(run_lengths, index=series.index)

# win/loss streaks, counted separately within each season
results_by_season = df.groupby('season', group_keys=False, observed=False)['result']

df['win_streak'] = results_by_season.apply(streak, want='W')
df['loss_streak'] = results_by_season.apply(streak, want='L')

In [51]:
# details
# every season except JR is flagged as having stats
df['stats_available'] = df['season'].ne('JR')

# played the match and appeared in as many sets as the match had
appeared_in_every_set = df['sets_played'].eq(df['set_count'])
df['played_all_sets'] = df['did_play'] & appeared_in_every_set

# set results in which the two sides finished one set apart
one_set_apart_results = ['2-1', '1-2', '3-2', '2-3']
df['deciding_set_played'] = df['set_result'].isin(one_set_apart_results)

# missing set scores become empty strings so later string parsing is safe
df['set_scores'] = df['set_scores'].fillna('').astype(str)

In [52]:
# statistic highs
# stat columns for which season/career highs are tracked
stats_high_fields = ['kills', 'aces', 'digs', 'receiving',
                     'assists', 'total_blocks']


def get_season_high_flags(row):
    """Return a ';'-joined list of stats in which this match equals its season max.

    Reads the module-level `season_max` frame (row-aligned with `df`, looked
    up by `row.name`) and `stats_high_fields`. Rows where `did_play` or
    `stats_available` is falsy return an empty string. Ties are all flagged,
    because the check is plain equality with the maximum.
    """
    if not row['did_play'] or not row['stats_available']:
        return ''
    highs = [
        field for field in stats_high_fields
        if pd.notna(row[field]) and row[field] == season_max.loc[row.name, field]
    ]
    return ';'.join(highs)


def get_career_high_flags(row, career_max=None):
    """Return a ';'-joined list of stats in which this match equals the overall max.

    Parameters
    ----------
    row : pandas.Series
        One row of `df`.
    career_max : mapping of stat name -> maximum, optional
        Precomputed per-stat maxima over all of `df`. When omitted the maxima
        are computed from the module-level `df` on this call, which is correct
        but repeats a full-column scan for every row; pass it in when applying
        the function over the whole frame.

    Rows where `did_play` or `stats_available` is falsy return an empty
    string. Ties are all flagged, because the check is plain equality.
    """
    if not row['did_play'] or not row['stats_available']:
        return ''
    if career_max is None:
        career_max = {field: df[field].max() for field in stats_high_fields}
    return ';'.join([
        field for field in stats_high_fields
        if pd.notna(row[field]) and row[field] == career_max[field]
    ])


# per-row copy of each stat's maximum within that row's season
season_max = df.groupby('season', observed=False)[stats_high_fields].transform('max')
# per-stat maximum over every match, computed once instead of once per row
career_max = {field: df[field].max() for field in stats_high_fields}

df['season_highs_flags'] = df.apply(get_season_high_flags, axis=1)
df['career_highs_flags'] = df.apply(get_career_high_flags, axis=1, career_max=career_max)
df['record_breaker_flag'] = df['career_highs_flags'].ne('')

In [53]:
# wins/losses in close situations
def _final_set_margin(set_scores):
    """Return the absolute point difference of the last 'a-b' score in each string.

    `set_scores` is a Series of strings; the regex is anchored at the end of
    the string, so only the trailing score pair is read. Rows with no trailing
    'a-b' pair yield NaN.
    """
    final_set = (
        set_scores
        .str.extract(r'\s*(\d+)\s*-\s*(\d+)\s*$')
        .apply(pd.to_numeric, errors='coerce')
    )
    return (final_set[0] - final_set[1]).abs()


# shared pieces, computed once so the win and loss flags cannot drift apart
played = df['did_play'].fillna(False).astype(bool)
final_set_margin_is_two = _final_set_margin(df['set_scores']) == 2

df['deciding_set_win'] = (  # played, won 2-1 or 3-2, last listed set decided by exactly 2 points
    played
    & (df['result'] == 'W')
    & df['set_result'].isin(['2-1', '3-2'])
    & final_set_margin_is_two
)

df['deciding_set_loss'] = (  # played, lost 1-2 or 2-3, last listed set decided by exactly 2 points
    played
    & (df['result'] == 'L')
    & df['set_result'].isin(['1-2', '2-3'])
    & final_set_margin_is_two
)

In [54]:
# low error game: combined errors across all six error columns total 2 or fewer.
# A NaN in any error column makes the sum NaN, which compares as False.
df['low_error_game'] = (
    df['serve_errors']
    .add(df['kill_errors'])
    .add(df['receiving_errors'])
    .add(df['ball_handling_errors'])
    .add(df['block_errors'])
    .add(df['dig_errors'])
    .le(2)
)

In [55]:
# preview the last five rows of the frame
df.tail(5)

Unnamed: 0,match_key,career_match_index,career_stage,season,season_match_number,date,day_of_week,week_of_season,days_since_last_match,match_no,total_matches_that_day,total_sets_that_day,multi_game_day,first_match_of_day,last_match_of_day,same_day_opponent_num,opponent,opponent_slug,season_opponent_num,is_repeat_opponent,deaf_school,match_type,event_name,milestone_flag,result,set_scores,set_result,set_count,set_diff,was_set_swept,swept_opponent,win_streak,loss_streak,comeback_win,total_points_for,total_points_against,margin_pct,high_margin_win,low_margin_loss,location,did_play,sets_played,kills,kills_per_set,kill_pct,kill_attempts,kill_errors,hit_pct,assists,assists_per_set,ball_handling_attempts,ball_handling_errors,solo_blocks,assisted_blocks,total_blocks,blocks_per_set,block_errors,digs,dig_errors,digs_per_set,receiving,receiving_errors,receiving_per_set,aces,aces_per_set,ace_pct,serve_attempts,serve_errors,serve_pct,points,maxpreps,stats_available,played_all_sets,deciding_set_played,season_highs_flags,career_highs_flags,record_breaker_flag,deciding_set_win,deciding_set_loss,low_error_game
152,SR_10-23_BHA_1,147.0,late,SR,42,2019-10-23,Wednesday,8,2,1,1,3,False,,,1,Berman Hebrew Academy,BHA,2,True,False,playoff,PVAC Tournament Quarterfinals,,W,"25-9,25-16,25-16",3-0,3,3,False,True,8,0,False,75,41,0.293103,False,False,home,True,3.0,6.0,2.0,46.2,13.0,1.0,0.385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,3.7,10.0,0.0,3.3,7.0,2.3,50.0,14.0,1.0,92.9,10.0,https://www.maxpreps.com/games/10-23-2019/voll...,True,True,False,,,False,False,False,True
153,SR_10-28_SSFS_1,148.0,late,SR,43,2019-10-28,Monday,9,5,1,1,3,False,,,1,Sandy Spring Friends,SSFS,2,True,False,playoff,PVAC Tournament Semifinals,,W,"25-15,25-19,25-8",3-0,3,3,False,True,9,0,False,75,42,0.282051,False,False,home,True,3.0,10.0,3.3,34.5,29.0,6.0,0.138,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,3.0,13.0,0.0,4.3,9.0,3.0,52.9,17.0,1.0,94.1,12.0,https://www.maxpreps.com/games/10-28-2019/voll...,True,True,False,,,False,False,False,False
154,SR_10-30_WIS_1,149.0,late,SR,44,2019-10-30,Wednesday,9,2,1,1,3,False,,,1,Washington International,WIS,2,True,False,championship,PVAC Tournament Championship,,W,"25-18,26-24,25-18",3-0,3,3,False,True,10,0,False,76,60,0.117647,False,False,neutral,True,3.0,14.0,4.7,33.3,42.0,5.0,0.214,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,4.3,14.0,0.0,4.7,5.0,1.7,33.3,15.0,3.0,80.0,9.0,https://www.maxpreps.com/games/10-30-2019/voll...,True,True,False,kills,,False,False,False,False
155,SR_11-05_BELL_1,150.0,late,SR,45,2019-11-05,Tuesday,10,6,1,1,3,False,,,1,Bell,BELL,1,False,False,playoff,DCSAA State Tournament First Round,,W,"25-14,25-14,25-13",3-0,3,3,False,True,11,0,False,75,41,0.293103,False,False,away,True,3.0,8.0,2.7,53.3,15.0,2.0,0.4,1.0,0.3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,3.3,12.0,0.0,4.0,4.0,1.3,44.4,9.0,3.0,66.7,5.0,https://www.maxpreps.com/games/11-05-2019/voll...,True,True,False,,,False,False,False,False
156,SR_11-06_SJ_1,151.0,late,SR,46,2019-11-06,Wednesday,10,1,1,1,3,False,,,1,St. John's,SJ,1,False,False,playoff,DCSAA State Tournament Quarterfinals,last SR match; last MSSD match,L,"20-25,20-25,14-25",0-3,3,-3,True,False,0,1,False,54,75,-0.162791,False,False,away,True,3.0,9.0,3.0,29.0,31.0,4.0,0.161,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2.0,2.3,21.0,1.0,7.0,1.0,0.3,11.1,9.0,1.0,88.9,3.0,https://www.maxpreps.com/games/11-06-2019/voll...,True,True,False,,,False,False,False,False


#### Export

In [56]:
# explicit export column order, grouped by theme
col_order = [
    # --- match info ---
    "match_key", "career_match_index", "career_stage", "season",
    "season_match_number", "date", "day_of_week", "week_of_season",

    # --- match context/scheduling ---
    "days_since_last_match", "match_no", "total_matches_that_day",
    "total_sets_that_day", "multi_game_day", "first_match_of_day",
    "last_match_of_day", "same_day_opponent_num",

    # --- opponent/match details ---
    "opponent", "opponent_slug", "season_opponent_num",
    "is_repeat_opponent", "deaf_school", "match_type",
    "event_name", "milestone_flag", "result", "set_scores",
    "set_count", "set_result", "set_diff", "location",

    # --- narrative/storyline tags ---
    "comeback_win",

    # --- scoring/margins ---
    "total_points_for", "total_points_against", "margin_pct",
    "high_margin_win", "low_margin_loss",

    # --- participation ---
    "did_play", "played_all_sets", "stats_available",

    # --- achievements/records ---
    "season_highs_flags", "career_highs_flags", "record_breaker_flag",

    # --- storyline ---
    "deciding_set_win", "deciding_set_loss",

    # --- skill profiles ---
    "low_error_game",

    # --- match flow ---
    "win_streak", "loss_streak", "was_set_swept",
    "swept_opponent", "deciding_set_played",

    # --- stat lines ---
    "sets_played",

    # attacking
    "kills", "kills_per_set", "kill_pct",
    "kill_attempts", "kill_errors", "hit_pct",

    # ball handling
    "assists", "assists_per_set",
    "ball_handling_attempts", "ball_handling_errors",

    # blocking
    "solo_blocks", "assisted_blocks", "total_blocks", "blocks_per_set", "block_errors",

    # digging
    "digs", "dig_errors", "digs_per_set",

    # serve receiving
    "receiving", "receiving_errors",
    "receiving_per_set",

    # serving
    "aces", "aces_per_set", "ace_pct",
    "serve_attempts", "serve_errors", "serve_pct", "points",

    # maxpreps
    "maxpreps",
]

# Any original column not listed above is still exported: it is spliced in
# just before "maxpreps" (or appended at the end if "maxpreps" is not in the
# list), so no incoming column is lost.
missing_from_order = [c for c in original_cols if c not in col_order]
insert_at = col_order.index("maxpreps") if "maxpreps" in col_order else len(col_order)
col_order[insert_at:insert_at] = missing_from_order

# invariant guard: every column that came in with the dataset is exported
missing_still = [c for c in original_cols if c not in col_order]
assert not missing_still, f"missing OG columns in export: {missing_still}"

# reorder; columns of df that are not in col_order are dropped here
df = df[col_order]

In [57]:
# cast count-like columns to the nullable integer dtype "Int64" (the original
# note says they were erroneously floats), and round margin_pct to 3 decimals
# for readability
nullable_int_columns = {
    "career_match_index": "Int64",
    "sets_played": "Int64",
    "serve_attempts": "Int64",
    "serve_errors": "Int64",
    "points": "Int64",
}
df = (
    df.astype(nullable_int_columns)
      .assign(margin_pct=lambda frame: frame["margin_pct"].round(3))
)

In [58]:

# write the tagged, reordered frame to disk without the row index
df.to_csv(path_or_buf="../data/full_matches.csv", index=False)