# Configs

In [1]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import warnings

from Learning.MLPClassifier import MLPClassifier
from Learning.dataset_helper_functions import *
import Learning.MLPClassifier
from sklearn.metrics import classification_report

# Silence only the pandas deprecation notice raised when a groupby-apply
# touches the grouping columns; every other warning is still reported.
warnings.filterwarnings(
    action="ignore",
    message="DataFrameGroupBy.apply operated on the grouping columns.*",
    category=DeprecationWarning,
)

# Backend priority: Apple MPS first, then CUDA, then plain CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)


Using device: mps


# Classification Settings

In [2]:

# How many distinct highlight timestamps are selected in each quarter.
HIGHLIGHTS_MEAN_1Q = 3
HIGHLIGHTS_MEAN_2Q = 4
HIGHLIGHTS_MEAN_3Q = 4
HIGHLIGHTS_MEAN_4Q = 4

# Both flags must stay True: the highlight-selection loop raises
# NotImplementedError for any other combination.
INC_FTS = True
INC_TIES = True

CONTEXT_WINDOW = 3

# Quarter label -> number of highlights to pick in that quarter.
highlights_per_qtr = dict(zip(
    ('1st', '2nd', '3rd', '4th'),
    (HIGHLIGHTS_MEAN_1Q, HIGHLIGHTS_MEAN_2Q, HIGHLIGHTS_MEAN_3Q, HIGHLIGHTS_MEAN_4Q),
))

seed = 42
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
data_path = "/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/full season data/plays_with_onehot_v2_withoutOT.csv"
model_path = '/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/Learning/saved_model/mlp_tuned.pth'
#interval_weights_path = '/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/Learning/nba_stats/unique_per_interval_minute.csv'

# Architecture hyper-parameters of the saved checkpoint.
trained_model_params = dict(
    hidden_dims=[256, 128, 128],
    dropout=0,
)

# Data Prep

In [3]:
# Read the raw CSV at data_path into a DataFrame; later cells derive all splits from it.
dataset = pd.read_csv(data_path)

In [4]:

freeze_seeds(seed)

base = add_game_ids(dataset)

# Split by game rather than by row, so every row of a game lands in one split only.
games_idx = base['game_id'].unique()

rng = np.random.default_rng(seed=seed)  # For reproducibility
shuffled_game_ids = rng.permutation(games_idx)

# 80% of the games go to training; the remaining 20% is halved into validation and test.
split_idx = int(0.8 * len(shuffled_game_ids))
train_game_ids = shuffled_game_ids[:split_idx]
test_game_ids = shuffled_game_ids[split_idx:]
split_test_ids = int(0.5 * len(test_game_ids))
val_game_ids = test_game_ids[:split_test_ids]
test_game_ids = test_game_ids[split_test_ids:]
print(len(train_game_ids))
print(len(val_game_ids))

unaltered_df_train = base[base['game_id'].isin(train_game_ids)].copy()
unaltered_df_val   = base[base['game_id'].isin(val_game_ids)].copy()
unaltered_df_test  = base[base['game_id'].isin(test_game_ids)].copy()

# Give every split a fresh 0..n-1 index. The previous version reset the
# validation frame twice and never the training frame (copy-paste slip).
unaltered_df_train = unaltered_df_train.reset_index(drop=True)
unaltered_df_val = unaltered_df_val.reset_index(drop=True)
unaltered_df_test = unaltered_df_test.reset_index(drop=True)

# The star-power mapping is built from the training games ONLY and then reused
# for validation and test.
sp_map = build_star_power_mapping(unaltered_df_train)

altered_df_train = get_dataset(unaltered_df_train, play_context_window=3, team_context_window=3, compact_players=True, star_power_mapping=sp_map)
altered_df_val = get_dataset(unaltered_df_val, play_context_window=3, team_context_window=3, compact_players=True, star_power_mapping=sp_map)
altered_df_test = get_dataset(unaltered_df_test, play_context_window=3, team_context_window=3, compact_players=True, star_power_mapping=sp_map)

# Use the training feature columns as the reference layout for all three splits.
# NOTE(review): align_to_train presumably reindexes each frame to these columns — confirm in the helper.
feature_cols = [c for c in altered_df_train.columns if c not in LABEL_COLS]
altered_df_train = align_to_train(altered_df_train, feature_cols)
altered_df_val = align_to_train(altered_df_val, feature_cols)
altered_df_test = align_to_train(altered_df_test, feature_cols)

# Features are every column except the label and the game identifier.
X_train = altered_df_train.drop(columns=['is_highlight', 'game_id']).to_numpy(dtype=np.float32)
y_train = altered_df_train['is_highlight'].to_numpy(dtype=np.int32)
X_val = altered_df_val.drop(columns=['is_highlight', 'game_id']).to_numpy(dtype=np.float32)
y_val = altered_df_val['is_highlight'].to_numpy(dtype=np.int32)
X_test = altered_df_test.drop(columns=['is_highlight', 'game_id']).to_numpy(dtype=np.float32)
y_test = altered_df_test['is_highlight'].to_numpy(dtype=np.int32)

# Standardisation statistics come from the training split only.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

train_ds = TensorDataset(torch.from_numpy(X_train).float(),
                         torch.from_numpy(y_train).long())
val_ds = TensorDataset(torch.from_numpy(X_val).float(),
                       torch.from_numpy(y_val).long())
test_ds = TensorDataset(torch.from_numpy(X_test).float(),
                        torch.from_numpy(y_test).long())

# Class-imbalance weight: number of negative training rows per positive one.
n_pos = (y_train == 1).sum()
n_neg = (y_train == 0).sum()
pos_weight_val = float(n_neg/n_pos)
pos_weight = torch.tensor(pos_weight_val, device=device, dtype=torch.float32)


764
95


In [5]:
# Make sure the class-imbalance weight sits on the active device (non-tensors are left untouched).
pos_weight = pos_weight.to(device) if isinstance(pos_weight, torch.Tensor) else pos_weight

In [6]:
# Display the class-imbalance weight as the cell output.
pos_weight

tensor(17.0506, device='mps:0')

# Load Model

In [7]:
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(model_path, map_location=device)

# Build the network from the shared settings dict instead of repeating the
# architecture literals, so the settings cell stays the single source of truth.
model = MLPClassifier(input_dim=X_train.shape[1], **trained_model_params).to(device)

model.load_state_dict(checkpoint['model_state_dict'])
# eval() switches BatchNorm/Dropout to inference behaviour and returns the
# module, so the architecture is still shown as the cell output.
model.eval()

MLPClassifier(
  (net): Sequential(
    (0): Linear(in_features=734, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0, inplace=False)
    (8): Linear(in_features=128, out_features=128, bias=True)
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0, inplace=False)
    (12): Linear(in_features=128, out_features=1, bias=True)
  )
)

# Predict Play Highlight Probabilities

In [25]:
# Inference only: run the forward pass without autograd so no graph is kept
# for the whole test set.
with torch.no_grad():
    test_logits = model(torch.tensor(X_test).to(device))
# Flatten to 1-D so the column assignment works whether the model emits (N, 1) or (N,).
test_probs = torch.sigmoid(test_logits).detach().cpu().numpy().reshape(-1)
# NOTE(review): positional assignment — assumes X_test rows are in the same order
# (and number) as unaltered_df_test rows; confirm get_dataset never drops or reorders rows.
unaltered_df_test['probs'] = test_probs
unaltered_df_test.head()

Unnamed: 0,time_left_qtr,play,distance,quarter,home_team,away_team,current_team,name,assister,win_difference,...,Oncourt_Player_29,Oncourt_Player_30,Oncourt_Player_31,Oncourt_Player_32,Oncourt_Player_33,date,is_highlight,game_id,time_left_qtr_sec,probs
0,11:50,3,2,1st,WARRIORS,SUNS,WARRIORS,Chris Paul,Blank,1,...,0,0,0,0,0,"October 25, 2023",0,2,710,0.011785
1,11:48,7,0,1st,WARRIORS,SUNS,SUNS,Jusuf Nurkic,Blank,1,...,0,0,0,0,0,"October 25, 2023",0,2,708,0.080256
2,11:28,25,1,1st,WARRIORS,SUNS,SUNS,Jusuf Nurkic,Kevin Durant,1,...,0,0,0,0,0,"October 25, 2023",0,2,688,0.407382
3,11:28,9,0,1st,WARRIORS,SUNS,WARRIORS,Andrew Wiggins,Blank,1,...,0,0,0,0,0,"October 25, 2023",0,2,688,0.675098
4,11:28,10,0,1st,WARRIORS,SUNS,SUNS,Jusuf Nurkic,Blank,1,...,0,0,0,0,0,"October 25, 2023",0,2,688,0.616699


In [21]:
# Inference only: run the forward pass without autograd so no graph is kept
# for the whole validation set.
with torch.no_grad():
    val_logits = model(torch.tensor(X_val).to(device))
# Flatten to 1-D so the column assignment works whether the model emits (N, 1) or (N,).
val_probs = torch.sigmoid(val_logits).detach().cpu().numpy().reshape(-1)
# NOTE(review): positional assignment — assumes X_val rows are in the same order
# (and number) as unaltered_df_val rows; confirm get_dataset never drops or reorders rows.
unaltered_df_val['probs'] = val_probs
unaltered_df_val.head()

Unnamed: 0,time_left_qtr,play,distance,quarter,home_team,away_team,current_team,name,assister,win_difference,...,Oncourt_Player_30,Oncourt_Player_31,Oncourt_Player_32,Oncourt_Player_33,date,is_highlight,game_id,predicted,time_left_qtr_sec,probs
0,11:45,3,3,1st,MAGIC,ROCKETS,MAGIC,Markelle Fultz,Blank,1,...,0,0,0,0,"October 26, 2023",0,6,0,705,0.004149
1,11:40,7,0,1st,MAGIC,ROCKETS,ROCKETS,Alperen Sengun,Blank,1,...,0,0,0,0,"October 26, 2023",0,6,0,700,0.001328
2,11:22,8,0,1st,MAGIC,ROCKETS,ROCKETS,Jalen Green,Blank,1,...,0,0,0,0,"October 26, 2023",0,6,0,682,0.011623
3,11:09,29,0,1st,MAGIC,ROCKETS,MAGIC,Jalen Suggs,Blank,1,...,0,0,0,0,"October 26, 2023",0,6,0,669,0.009441
4,11:05,7,0,1st,MAGIC,ROCKETS,ROCKETS,Alperen Sengun,Blank,1,...,0,0,0,0,"October 26, 2023",0,6,0,665,0.003392


In [None]:
# NOTE(review): disabled code kept as a string literal — nothing here is executed.
# The live selection cell initialises the 'predicted' column itself; consider deleting this cell.
'''unaltered_df_test.loc[:,'predicted'] = 0 #init predictions as 0
unaltered_df_test.head()'''


In [None]:
# NOTE(review): scratch cell — the string literal and the commented lines below are
# never executed. Candidate for deletion.
'''
unaltered_df_test.head()

logits = model(X_test).detach().cpu().numpy().squeeze()
unaltered_df_test.loc[:, 'logits'] = logits
unaltered_df_test'''

#unaltered_df_test.loc[:,'y_truth'] = y_test.detach().cpu().numpy()
#print(unaltered_df_test.head())

#unaltered_df_test.head()
#unaltered_df_test['y_truth'] = y_test.detach().cpu().numpy()

In [None]:
# Preview the first rows of the raw test frame.
unaltered_df_test.head()

# Predict Highlights

In [22]:
# Accumulators for the per-game evaluation.
pred_accuracies, all_preds, all_labels = [], [], []

# Copy the 'time_left_qtr' and 'game_id' columns of the processed frames onto the raw
# frames; the selection code reads them as 'time_left_qtr_sec' and 'game_id'.
# Column assignment aligns on the index, so raw and processed frames must share
# the same row index.
for _raw_df, _processed_df in (
    (unaltered_df_test, altered_df_test),
    (unaltered_df_val, altered_df_val),
):
    _raw_df['time_left_qtr_sec'] = _processed_df['time_left_qtr']
    _raw_df['game_id'] = _processed_df['game_id']

# NOTE(review): disabled interval-weighting experiment kept as a string literal; it is
# never executed (the cell merely echoes the string). It relies on the commented-out
# `interval_weights_path` setting.
'''df_interval_weights = pd.read_csv(interval_weights_path)
df_interval_weights.columns = df_interval_weights.columns.str.strip().str.replace('"', '')

df_interval_weights['start'] = df_interval_weights['time_left_qtr'].str.extract(r'\((\d+),')[0].astype(int)
df_interval_weights['end'] = df_interval_weights['time_left_qtr'].str.extract(r', (\d+)\]')[0].astype(int)
df_interval_weights['num_highlights_per_game'] = df_interval_weights['num_highlights_per_game'].astype(float)

unaltered_df_test.head()'''


'df_interval_weights = pd.read_csv(interval_weights_path)\ndf_interval_weights.columns = df_interval_weights.columns.str.strip().str.replace(\'"\', \'\')\n\ndf_interval_weights[\'start\'] = df_interval_weights[\'time_left_qtr\'].str.extract(r\'\\((\\d+),\')[0].astype(int)\ndf_interval_weights[\'end\'] = df_interval_weights[\'time_left_qtr\'].str.extract(r\', (\\d+)\\]\')[0].astype(int)\ndf_interval_weights[\'num_highlights_per_game\'] = df_interval_weights[\'num_highlights_per_game\'].astype(float)\n\nunaltered_df_test.head()'

In [None]:
# NOTE(review): disabled helper kept as a string literal — `get_weight` is never defined
# at runtime. It belongs to the interval-weighting experiment that is switched off.
'''def get_weight(row):
    matches = df_interval_weights[
        (df_interval_weights['quarter'] == row['quarter']) &
        (row['time_left_qtr_sec'] >= df_interval_weights['start']) &
        (row['time_left_qtr_sec'] <= df_interval_weights['end'])
    ]
    if not matches.empty:
        return matches['num_highlights_per_game'].values[0]
    return'''

In [15]:
def get_game_intervals(df):
    """Return the positional row range occupied by each game.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``game_id`` column. Rows of one game are expected to be
        contiguous; the incoming index is ignored (positions are counted from 0).

    Returns
    -------
    (start_positions, end_positions) : tuple of list of int
        Parallel lists, one entry per game in row order. Game ``k`` occupies the
        half-open range ``[start_positions[k], end_positions[k])``. Both lists
        are empty for an empty frame.
    """
    df = df.reset_index(drop=True)
    # First occurrence of every game_id, in row order. This does not depend on
    # the ids being sorted ascending (the groupby-based version did) and avoids
    # the deprecated groupby-apply on grouping columns.
    is_first_row = ~df["game_id"].duplicated()
    start_positions = [int(pos) for pos in df.index[is_first_row]]
    # Each game ends where the next one starts; the last one ends at the frame end.
    end_positions = start_positions[1:] + [len(df)] if start_positions else []

    return start_positions, end_positions

In [16]:
# Row range [start, end) of every test game, printed one game per line.
test_intervals = get_game_intervals(altered_df_test)
test_start_position, test_end_positions = test_intervals

for i, j in zip(*test_intervals):
    print(f'[{i},{j}]')

#unaltered_df_test.to_csv("/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/Learning/full game highlight classification/check_test_set.csv")


[0,426]
[426,843]
[843,1227]
[1227,1622]
[1622,2009]
[2009,2382]
[2382,2768]
[2768,3185]
[3185,3577]
[3577,3971]
[3971,4347]
[4347,4741]
[4741,5124]
[5124,5507]
[5507,5905]
[5905,6287]
[6287,6664]
[6664,7011]
[7011,7369]
[7369,7779]
[7779,8164]
[8164,8560]
[8560,8899]
[8899,9281]
[9281,9651]
[9651,10029]
[10029,10445]
[10445,10815]
[10815,11164]
[11164,11555]
[11555,11922]
[11922,12319]
[12319,12692]
[12692,13056]
[13056,13448]
[13448,13823]
[13823,14191]
[14191,14556]
[14556,14914]
[14914,15291]
[15291,15677]
[15677,16046]
[16046,16401]
[16401,16755]
[16755,17151]
[17151,17535]
[17535,17923]
[17923,18243]
[18243,18623]
[18623,18986]
[18986,19358]
[19358,19700]
[19700,20079]
[20079,20445]
[20445,20797]
[20797,21162]
[21162,21546]
[21546,21928]
[21928,22272]
[22272,22625]
[22625,22964]
[22964,23324]
[23324,23700]
[23700,24050]
[24050,24394]
[24394,24751]
[24751,25117]
[25117,25487]
[25487,25877]
[25877,26279]
[26279,26656]
[26656,27010]
[27010,27365]
[27365,27720]
[27720,28066]
[28066,2

In [17]:
#WEIGHT_SCALE = 0
#unaltered_df_test.loc[:,'weighted_probs'] = unaltered_df_test['probs'] * (1+unaltered_df_test['weight']*WEIGHT_SCALE)

In [18]:
def select_unique_highlights_qtr(unaltered_quarter_df, num_to_select):
    """Pick the timestamps of the most probable plays in one quarter.

    Plays sharing a ``time_left_qtr_sec`` value are collapsed to a single
    candidate scored by their highest ``probs`` value. The ``num_to_select``
    best-scoring timestamps are returned as an array, highest score first.
    """
    best_prob_per_time = unaltered_quarter_df.groupby('time_left_qtr_sec')['probs'].max()
    strongest = best_prob_per_time.nlargest(num_to_select)
    return strongest.index.to_numpy()

In [19]:
# Row range [start, end) of every validation game, printed one game per line.
val_intervals = get_game_intervals(altered_df_val)
val_start_position, val_end_positions = val_intervals
for i, j in zip(*val_intervals):
    print(f'[{i},{j}]')

[0,374]
[374,784]
[784,1165]
[1165,1541]
[1541,1915]
[1915,2293]
[2293,2685]
[2685,3071]
[3071,3488]
[3488,3852]
[3852,4227]
[4227,4638]
[4638,5000]
[5000,5399]
[5399,5810]
[5810,6201]
[6201,6584]
[6584,6998]
[6998,7371]
[7371,7747]
[7747,8120]
[8120,8546]
[8546,8967]
[8967,9341]
[9341,9718]
[9718,10043]
[10043,10435]
[10435,10797]
[10797,11185]
[11185,11582]
[11582,11930]
[11930,12298]
[12298,12667]
[12667,13036]
[13036,13431]
[13431,13821]
[13821,14235]
[14235,14608]
[14608,14956]
[14956,15335]
[15335,15705]
[15705,16068]
[16068,16447]
[16447,16824]
[16824,17187]
[17187,17577]
[17577,17953]
[17953,18320]
[18320,18687]
[18687,19094]
[19094,19492]
[19492,19870]
[19870,20288]
[20288,20652]
[20652,20998]
[20998,21367]
[21367,21736]
[21736,22137]
[22137,22477]
[22477,22844]
[22844,23209]
[23209,23591]
[23591,23950]
[23950,24288]
[24288,24641]
[24641,24994]
[24994,25318]
[25318,25638]
[25638,26056]
[26056,26463]
[26463,26836]
[26836,27232]
[27232,27609]
[27609,27980]
[27980,28350]
[28350,2

In [24]:
# NOTE(review): validation grid search over the context window, disabled by wrapping it
# in a string literal; the output shown below this cell comes from an earlier run.
# The window is built with range(t - left_window, t + right_window), which is half-open:
# right_window=1 adds no second beyond t itself, which matches the identical metrics
# printed for right_window 0 and 1.
'''from sklearn.model_selection import ParameterGrid
#VALIDATING WINDOW SIZE
param_grid = {
                'left_window': [0,1,2,3,4],
                'right_window': [0,1,2,3,4]
            }
val_start_positions, val_end_positions = get_game_intervals(altered_df_val)

for params in ParameterGrid(param_grid):
    freeze_seeds(seed)
    print(params)
    all_preds=[]
    all_labels=[]
    pred_accuracies = []
    unaltered_df_val['predicted']=0
    #print(test_game_ids)
    #print(df.iloc[test_start_position[test_game_ids[1]-1]]['game_id'])
    for i, (start, end) in enumerate(zip(val_start_positions, val_end_positions)):
        quarters = ['1st','2nd','3rd','4th']
        for quarter in quarters:
            mask_q = (unaltered_df_val['quarter'] == quarter) & (unaltered_df_val.index >= start) & (unaltered_df_val.index < end)
            num_highlights_to_choose = highlights_per_qtr.get(quarter, 0)
            curr_qtr_df = unaltered_df_val[mask_q]
            if INC_FTS and INC_TIES:
                selected_play_times = select_unique_highlights_qtr(curr_qtr_df, highlights_per_qtr.get(quarter, 0))
                expanded_times = set()
                for t in selected_play_times:
                    expanded_times.add(t)
                    expanded_times.update(range(t-params['left_window'], t + params['right_window']))

            else:
                raise NotImplementedError

            mask = unaltered_df_val['time_left_qtr_sec'].isin(expanded_times) & mask_q
            unaltered_df_val.loc[mask, 'predicted'] = 1
            #altered_df_game.loc[selected_plays.index, 'predicted'] = 1

        y_pred = unaltered_df_val.loc[start:end-1, 'predicted']
        y_truth = unaltered_df_val.loc[start:end-1, 'is_highlight']
        accuracy = (y_truth == y_pred).mean()
        pred_accuracies.append(accuracy)
        #print(f"Game prediction accuracy: {accuracy:.3%}")
        all_preds.extend(y_pred)
        all_labels.extend(y_truth)
    print(f"Game prediction Final Mean Accuracy: {(sum(pred_accuracies)/len(pred_accuracies)):.3%}")
    print(classification_report(all_labels, all_preds, digits=3))'''



{'left_window': 0, 'right_window': 0}
Game prediction Final Mean Accuracy: 93.165%
              precision    recall  f1-score   support

           0      0.965     0.963     0.964     33552
           1      0.380     0.393     0.386      1942

    accuracy                          0.932     35494
   macro avg      0.672     0.678     0.675     35494
weighted avg      0.933     0.932     0.932     35494

{'left_window': 0, 'right_window': 1}
Game prediction Final Mean Accuracy: 93.165%
              precision    recall  f1-score   support

           0      0.965     0.963     0.964     33552
           1      0.380     0.393     0.386      1942

    accuracy                          0.932     35494
   macro avg      0.672     0.678     0.675     35494
weighted avg      0.933     0.932     0.932     35494

{'left_window': 0, 'right_window': 2}
Game prediction Final Mean Accuracy: 93.096%
              precision    recall  f1-score   support

           0      0.965     0.962     0.96

In [26]:
LEFT_CONTEXT_WINDOW = 0
RIGHT_CONTEXT_WINDOW = 3
QUARTERS = ('1st', '2nd', '3rd', '4th')

# Only the mode with both INC_FTS and INC_TIES enabled is implemented; fail before
# any state is modified instead of in the middle of the loop.
if not (INC_FTS and INC_TIES):
    raise NotImplementedError

all_preds = []
all_labels = []
pred_accuracies = []
test_start_position, test_end_positions = get_game_intervals(altered_df_test)
unaltered_df_test['predicted'] = 0

for start, end in zip(test_start_position, test_end_positions):
    # Rows of the current game; computed once per game rather than once per quarter.
    in_game = (unaltered_df_test.index >= start) & (unaltered_df_test.index < end)
    for quarter in QUARTERS:
        mask_q = (unaltered_df_test['quarter'] == quarter) & in_game
        num_highlights_to_choose = highlights_per_qtr.get(quarter, 0)
        curr_qtr_df = unaltered_df_test[mask_q]
        selected_play_times = select_unique_highlights_qtr(curr_qtr_df, num_highlights_to_choose)

        # Widen every selected timestamp into a window of neighbouring seconds.
        # NOTE(review): range() is half-open, so the window is
        # [t - LEFT_CONTEXT_WINDOW, t + RIGHT_CONTEXT_WINDOW - 1]. The validation
        # output (identical metrics for right_window 0 and 1) reflects this.
        # Kept as-is because the reported metrics were produced with it.
        expanded_times = set()
        for t in selected_play_times:
            expanded_times.add(t)
            expanded_times.update(range(t - LEFT_CONTEXT_WINDOW, t + RIGHT_CONTEXT_WINDOW))

        # Flag every play of this game and quarter whose clock value falls in a window.
        mask = unaltered_df_test['time_left_qtr_sec'].isin(expanded_times) & mask_q
        unaltered_df_test.loc[mask, 'predicted'] = 1

    # .loc slicing is label-based and inclusive, hence end-1; this relies on the
    # frame having a 0..n-1 index.
    y_pred = unaltered_df_test.loc[start:end-1, 'predicted']
    y_truth = unaltered_df_test.loc[start:end-1, 'is_highlight']
    accuracy = (y_truth == y_pred).mean()
    pred_accuracies.append(accuracy)
    print(f"Game prediction accuracy: {accuracy:.3%}")
    all_preds.extend(y_pred)
    all_labels.extend(y_truth)



Game prediction accuracy: 90.141%
Game prediction accuracy: 91.607%
Game prediction accuracy: 93.229%
Game prediction accuracy: 91.139%
Game prediction accuracy: 92.765%
Game prediction accuracy: 93.029%
Game prediction accuracy: 90.674%
Game prediction accuracy: 93.046%
Game prediction accuracy: 93.878%
Game prediction accuracy: 94.670%
Game prediction accuracy: 94.149%
Game prediction accuracy: 91.117%
Game prediction accuracy: 92.950%
Game prediction accuracy: 91.906%
Game prediction accuracy: 94.221%
Game prediction accuracy: 90.576%
Game prediction accuracy: 94.164%
Game prediction accuracy: 93.660%
Game prediction accuracy: 94.693%
Game prediction accuracy: 94.390%
Game prediction accuracy: 93.766%
Game prediction accuracy: 92.677%
Game prediction accuracy: 92.920%
Game prediction accuracy: 92.932%
Game prediction accuracy: 93.243%
Game prediction accuracy: 94.180%
Game prediction accuracy: 92.308%
Game prediction accuracy: 93.784%
Game prediction accuracy: 93.123%
Game predictio

In [27]:


# Unweighted mean of the per-game accuracies, followed by the pooled per-class report.
mean_game_accuracy = sum(pred_accuracies) / len(pred_accuracies)
print(f"Game prediction Final Mean Accuracy: {mean_game_accuracy:.3%}")
print(classification_report(all_labels, all_preds, digits=3))
#unaltered_df_test['predicted'] = all_preds

Game prediction Final Mean Accuracy: 92.771%
              precision    recall  f1-score   support

           0      0.964     0.960     0.962     33559
           1      0.357     0.382     0.369      1966

    accuracy                          0.928     35525
   macro avg      0.660     0.671     0.665     35525
weighted avg      0.930     0.928     0.929     35525



In [None]:
#unaltered_df_game

In [None]:
# Drop the helper clock column before exporting. errors='ignore' keeps the cell
# re-runnable: on a second run the column is already gone and a plain drop would
# raise KeyError.
unaltered_df_test = unaltered_df_test.drop(columns=['time_left_qtr_sec'], errors='ignore')

unaltered_df_test.to_csv("/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/Learning/full game highlight classification/predicted_output_test.csv", index=False)

# Data Exploration

In [None]:
# NOTE(review): `altered_df` and `unaltered_df` are not defined by any cell visible in
# this notebook — presumably leftovers from an earlier version; confirm before running.
# Take the rows from the start of the first test interval to the end of the fourth one.
altered_df_games = altered_df.iloc[test_start_position[0]:test_end_positions[3]].copy()
unaltered_df_games = unaltered_df.iloc[test_start_position[0]:test_end_positions[3]].copy()
altered_df_games

In [None]:
# Copy the time and game-id columns from the processed slice onto the raw slice
# (index-aligned assignment).
unaltered_df_games['time_left_qtr'] = altered_df_games['time_left_qtr']
unaltered_df_games['game_id'] = altered_df_games['game_id']
# 'next_play_type' is the 'play' value of the following row within the same game and
# quarter; the last row of each group gets NaN.
unaltered_df_games['next_play_type'] = unaltered_df_games.groupby(['game_id','quarter'])['play'].shift(-1)
unaltered_df_games.head()


In [None]:
# NOTE(review): `unaltered_df_game` (singular) is not defined in any visible cell; the
# neighbouring cells use `unaltered_df_games` — confirm which frame is intended.
# Keep only the rows flagged as highlights.
game_highlights = unaltered_df_game.drop(unaltered_df_game[unaltered_df_game.is_highlight == 0].index)

# Number of distinct highlight timestamps per quarter.
num_unique_highlights = game_highlights.groupby(['quarter'])['time_left_qtr'].nunique()
num_unique_highlights

In [None]:
# NOTE(review): depends on `altered_df` / `unaltered_df`, which no visible cell defines.
altered_df_all = altered_df.copy()
unaltered_df_all = unaltered_df.copy()

# Bring the time and game-id columns of the processed frame onto the raw copy
# (index-aligned assignment).
unaltered_df_all['time_left_qtr'] = altered_df_all['time_left_qtr']
unaltered_df_all['game_id'] = altered_df_all['game_id']

# Keep only the highlight rows.
df_all_game_highlights = unaltered_df_all.drop(unaltered_df_all[unaltered_df_all.is_highlight == 0].index)

# Distinct highlight timestamps per (game, quarter).
num_unique_highlights_per_game = df_all_game_highlights.groupby(['game_id', 'quarter'])['time_left_qtr'].nunique()

In [None]:
# Display the per-(game, quarter) count of distinct highlight timestamps.
num_unique_highlights_per_game

In [None]:
# Highlight rows of all games. Same filter as `df_all_game_highlights` in an earlier
# cell, stored under a second name.
all_games_highlights = unaltered_df_all.drop(unaltered_df_all[(unaltered_df_all.is_highlight == 0)].index)


In [None]:
# Distinct highlight timestamps per (game, quarter). Despite its name, the
# 'avg_num_highlights' column holds a count (nunique), not an average.
unique_highlights_per_game_qtr = all_games_highlights.groupby(['game_id','quarter']).agg(avg_num_highlights = ('time_left_qtr','nunique'))
unique_highlights_per_game_qtr

## Unique highlights per quarter

In [None]:
# Average, over games, of the per-game distinct-highlight count in each quarter.
unique_highlights_each_quarter = unique_highlights_per_game_qtr.groupby(['quarter']).agg(avg_highlights_qtr = ('avg_num_highlights', 'mean'))
unique_highlights_each_quarter

## Unique highlights per interval

In [None]:
# Bin edges for pd.cut. With a step of 720 this yields just [0, 720], i.e. one bin per
# quarter. With pd.cut's defaults the bin is (0, 720], so a value of exactly 0 would
# fall outside it.
interval_groups = np.arange(0,721,720)
interval_groups


In [None]:
# Bucket every highlight row by its clock value once, then reuse the same grouping
# for both aggregates.
interval_bins = pd.cut(all_games_highlights.time_left_qtr, interval_groups)
highlights_by_interval = all_games_highlights.groupby(['game_id', 'quarter', interval_bins])

# Distinct highlight timestamps per (game, quarter, interval).
per_interval = highlights_by_interval.agg(num_highlights=('time_left_qtr', 'nunique'))

# Distinct player names per (game, quarter, interval).
unique_players_per_interval = highlights_by_interval.agg(num_players=('name', 'nunique'))



In [None]:
# NOTE(review): depends on `unaltered_df`, which no visible cell defines.
# Number of rows per player name.
player_counts = unaltered_df['name'].value_counts()
# NOTE(review): `rare_players` is not used anywhere else in the visible notebook.
rare_players = player_counts.sort_values(ascending=True)
# Players appearing in at least 400 rows.
common = player_counts[player_counts >=400].index
player_counts = player_counts.reset_index()
# Replace every name outside `common` with 'Other'.
# NOTE(review): assumes reset_index() yields a 'name' column holding the player names
# (pandas >= 2.0 naming) — confirm for the pandas version in use.
player_counts['name'] = player_counts['name'].where(player_counts['name'].isin(common), 'Other')
player_counts['name'].value_counts()

In [None]:
# `per_interval` only carries 'num_highlights'; the player counts live in
# `unique_players_per_interval` under 'num_players'. Aggregating a 'unique_players'
# column from `per_interval` raised KeyError, so join the two frames (they share the
# same (game_id, quarter, interval) index) and average the real column.
per_interval_stats = per_interval.join(unique_players_per_interval)
per_game = per_interval_stats.groupby(['quarter', 'time_left_qtr']).agg(
    num_highlights=('num_highlights', 'mean'),
    unique_players=('num_players', 'mean'),
)
per_game

In [None]:
# Build "following play" context features on a copy of the raw frame.
# NOTE(review): depends on `unaltered_df` / `altered_df`, which no visible cell defines.
context_window=2
unaltered_df_copy = unaltered_df.copy()
unaltered_df_copy['game_id'] = altered_df['game_id']
for i in range(context_window):
    # 'play' value of the row (i+1) positions later within the same game and quarter.
    unaltered_df_copy[f'play_{str(i + 1)}_after'] = unaltered_df_copy.groupby(['game_id', 'quarter'])['play'].shift(-(i+1))
    #dataset[f'play_{str(i + 1)}_before'] = dataset.groupby(['game_id', 'quarter'])['play'].shift((i+1))

    #begin team context

    shifted = (
    unaltered_df_copy
      .groupby(['game_id','quarter'])['current_team']
      .shift(-(i+1))
    )
    original = unaltered_df_copy['current_team']    # ← plain Series
    # True where the later play belongs to the same team as the current one.
    unaltered_df_copy[f'team_{str(i + 1)}_after'] = original == shifted

    #end team context


    # NOTE(review): assigning the dropna()'d Series back re-aligns on the index, so rows
    # that were NaN become NaN again; the play column therefore presumably does not stay
    # integer-typed — confirm.
    unaltered_df_copy[f'play_{str(i + 1)}_after'] = unaltered_df_copy[f'play_{str(i + 1)}_after'].dropna().astype(int)
    unaltered_df_copy[f'team_{str(i + 1)}_after'] = unaltered_df_copy[f'team_{str(i + 1)}_after'].dropna().astype(int)

In [None]:
# Scratch inspection: first rows of 'current_team' within every (game, quarter) group.
test2 = unaltered_df_copy.groupby(['game_id', 'quarter'])['current_team']
test2.head()

In [None]:
# Highlight rows only.
df_h = unaltered_df_copy[unaltered_df_copy['is_highlight'] == 1]

# Stack the three player-role columns into one Series of player names.
all_players = pd.concat([
    df_h['name'],
    df_h['assister'],
    df_h['stolen_by']
])

# 'Blank' is the placeholder for "no player in this role".
all_players = all_players[all_players != 'Blank']

# Number of highlight appearances per player, in any of the three roles.
star_power = all_players.value_counts()
star_power.index.name = 'name'
star_power = star_power.reset_index()
star_power


In [None]:
# Turn appearance counts into a share of all highlight rows.
# NOTE(review): overwrites 'count' in place, so running this cell twice divides twice.
total_highlights = (unaltered_df_copy['is_highlight'] == 1).sum()
star_power['count'] = star_power['count']/total_highlights
star_power

In [None]:


# Standardise the share to zero mean and unit standard deviation (z-score).
# NOTE(review): overwrites 'count' in place; the column name no longer matches its content.
mean, std = star_power['count'].mean(), star_power['count'].std()
star_power['count'] = (star_power['count'] - mean) / std

star_power

In [None]:
# Player name -> standardised star-power score.
mapping = star_power.set_index('name')['count']

# Attach a score for each player role; players missing from the mapping get 0.
# NOTE(review): writes to `unaltered_df`, which no visible cell defines.
unaltered_df['name_star_power'] = unaltered_df['name'].map(mapping).fillna(0).astype(float)

unaltered_df['assister_star_power'] = unaltered_df['assister'].map(mapping).fillna(0).astype(float)

unaltered_df['stolen_by_star_power'] = unaltered_df['stolen_by'].map(mapping).fillna(0).astype(float)

unaltered_df.head(50)


In [None]:
# NOTE(review): this call passes a file path and keywords (`compact_mode`, `add_game_idx`,
# `rm_ft_ds`, `verbose`) that the data-prep cell does not use; that cell passes a
# DataFrame. Presumably written against an older get_dataset signature — confirm.
df_test = get_dataset(data_path, verbose=False, rm_ft_ds=False, add_game_idx=True, compact_mode=True, play_context_window=2, team_context_window=2)
unalt_df1 = pd.read_csv(data_path)
df_test.head()

In [None]:
import json

# Team name -> roster list, read from the JSON roster file.
rosters_path = '/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/full season data/temp_rosters.json'
with open(rosters_path) as json_data:
    team_rosters_full = json.load(json_data)

In [None]:
# Dump the loaded roster mapping for inspection.
print((team_rosters_full))

In [None]:
# On-court indicator columns, in frame order.
oncourt_cols = [c for c in unalt_df1 if c.startswith('Oncourt_Player')]
mapping_dict = mapping.to_dict()

# Per-team list of star-power scores by roster slot; slots whose player has no score stay 0.
# NOTE(review): 33 is a hard-coded slot count — a roster longer than that raises IndexError.
team_star_power = {team: [0] * 33 for team in team_rosters_full}
for team, roster in team_rosters_full.items():
    print(team)
    for i, player in enumerate(roster):
        if player in mapping_dict:
            team_star_power[team][i] = mapping_dict[player]
            print(f'{player}, power: {mapping_dict[player]}')


def calculate_oncourt_star(row, team_rosters):
    """Sum the star-power scores of the players flagged as on court in ``row``.

    The i-th column of the module-level ``oncourt_cols`` is matched with the i-th
    roster entry of the row's ``current_team``. Players without an entry in the
    module-level ``mapping_dict`` contribute nothing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'current_team'`` and every column in ``oncourt_cols``.
    team_rosters : dict
        Team name -> sequence of player names, indexed by roster slot.

    Returns
    -------
    The summed score (0 when no scored player is on court).
    """
    team = row['current_team']
    star_power = 0
    for i, col in enumerate(oncourt_cols):
        if row[col] != 1:  # Player is not on court
            continue
        player = team_rosters[team][i]
        if player in mapping_dict:
            star_power += mapping_dict[player]
    return star_power


unalt_df1['oncourt_star_power'] = unalt_df1.apply(calculate_oncourt_star, args=(team_rosters_full,), axis=1)


#for col in oncourt_cols:
#    player_index = int(col.split('_')[-1])


In [None]:
# Inspect the first 50 rows, including the new 'oncourt_star_power' column.
unalt_df1.head(50)

# Test

In [None]:


# Smoke test of get_dataset with every compaction/grouping flag switched on (passed as 1)
# and a context window of 1.
# NOTE(review): several keywords (`verbose`, `rm_ft_ds`, `add_game_idx`, ...) are not used
# by the data-prep cell — confirm they still exist in get_dataset.
dataset_test = pd.read_csv(data_path)
res = get_dataset(dataset_test,
                  verbose=False,
                  rm_ft_ds=False,
                  add_game_idx=True,
                  play_context_window=1,
                  team_context_window=1,
                  compact_players=1,
                  compact_oncourt=1,
                  compact_current_team=1,
                  drop_home_away_teams=1,
                  group_all_plays=1,
                  enum_quarters=1)


In [None]:
# Preview the processed frame.
res.head()

In [None]:
# Preview the frame that was passed to get_dataset.
dataset_test.head()

In [None]:
# Reload the CSV so the next get_dataset call starts from a freshly read frame.
dataset_test = pd.read_csv(data_path)

In [None]:
# Smoke test of get_dataset with every compaction/grouping flag switched off and no
# context window.
# NOTE(review): several keywords (`verbose`, `rm_ft_ds`, `add_game_idx`, ...) are not used
# by the data-prep cell — confirm they still exist in get_dataset.
res = get_dataset(dataset_test,
                  verbose=False,
                  rm_ft_ds=False,
                  add_game_idx=True,
                  play_context_window=0,
                  team_context_window=0,
                  compact_players=False,
                  compact_oncourt=False,
                  compact_current_team=False,
                  drop_home_away_teams=False,
                  group_all_plays=False,
                  enum_quarters=False)

In [None]:
# Preview the processed frame.
res.head()