# Configs

In [15]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import warnings

from Learning.dataset_helper_functions import *
from Learning.MLPClassifier import *
from sklearn.metrics import classification_report

warnings.filterwarnings(
    "ignore",
    message="DataFrameGroupBy.apply operated on the grouping columns.*",
    category=DeprecationWarning,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.backends.mps.is_available():
    device = torch.device("mps")
print('Using device:', device)


Using device: mps


# Classification Settings

In [16]:

HIGHLIGHTS_MEAN_1Q = 3
HIGHLIGHTS_MEAN_2Q = 4
HIGHLIGHTS_MEAN_3Q = 4
HIGHLIGHTS_MEAN_4Q = 4

#KEEP THESE TRUE
INC_FTS = True
INC_TIES = True

HIGHLIGHT_WINDOW = 4

highlights_per_qtr = {'1st': HIGHLIGHTS_MEAN_1Q, '2nd': HIGHLIGHTS_MEAN_2Q, '3rd':HIGHLIGHTS_MEAN_3Q, '4th':HIGHLIGHTS_MEAN_4Q}

seed = 42
TRAINING_END_IDX = 284986
data_path = "../../full season data/plays_with_onehot_v2_withoutOT.csv"
model_path = '/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/Learning/saved_model/mlp_final_checkpoint_withoutOT_test.pth'

trained_model_params={
    'hidden_dim' : 128,
    'dropout' : 0.3,
}

# Data Prep

In [17]:
freeze_seeds(seed)
unaltered_df = pd.read_csv(data_path)
altered_df = get_dataset(path=data_path, verbose=False, rm_ft_ds=False, add_game_idx=True)

altered_df_train = altered_df.iloc[:TRAINING_END_IDX - 1]
unaltered_df_train = unaltered_df.iloc[:TRAINING_END_IDX - 1]
altered_df_train = altered_df_train.reset_index(drop=True)
unaltered_df_train = unaltered_df_train.reset_index(drop=True)

altered_df_test = altered_df.iloc[TRAINING_END_IDX - 1:]
unaltered_df_test = unaltered_df.iloc[TRAINING_END_IDX - 1:]
altered_df_test = altered_df_test.reset_index(drop=True)
unaltered_df_test = unaltered_df_test.reset_index(drop=True)

X_train = altered_df_train.drop(columns=['is_highlight', 'game_id'])
X_test = altered_df_test.drop(columns=['is_highlight', 'game_id'])

X_train = torch.tensor(X_train.values.astype(np.float32)).to(device)
X_test = torch.tensor(X_test.values.astype(np.float32)).to(device)

# Load Model

In [18]:
checkpoint = torch.load(model_path, map_location=device)
model = MLPClassifier(input_dim=X_test.shape[1],   hidden_dim=trained_model_params['hidden_dim'],
dropout=trained_model_params['dropout']).to(device)

model.load_state_dict(checkpoint['model_state_dict'])
model.to(device).eval()

MLPClassifier(
  (net): Sequential(
    (0): Linear(in_features=1849, out_features=128, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=1, bias=True)
  )
)

# Predict Play Highlight Probabilities

In [19]:
unaltered_df_test['predicted'] = 0 #init predictions as 0
logits = model(X_test)
probs = torch.sigmoid(logits).detach().cpu().numpy()
unaltered_df_test['probs'] = probs


# Predict Highlights

In [20]:
pred_accuracies = []
all_preds = []
all_labels = []
all_probs = []
unaltered_df_test['time_left_qtr_sec'] = altered_df_test['time_left_qtr']
unaltered_df_test['game_id'] = altered_df_test['game_id']

In [21]:
def get_game_intervals(df):
    start_positions = list((
        df.groupby("game_id")
        .apply(lambda g: g.index[0])
        .values
    ))
    end_positions = start_positions[1:] + [len(df)]

    return start_positions, end_positions

In [22]:
train_start_position, train_end_positions = get_game_intervals(unaltered_df_test)
test_start_position, test_end_positions = get_game_intervals(unaltered_df_test)

In [23]:
def select_unique_highlights_qtr(unaltered_quarter_df, num_to_select):

    current_selected_num = 0

    temp_df = unaltered_quarter_df.groupby(['time_left_qtr']).agg(total_prob = ('probs','sum'))
    #print(temp_df.head())

    top_n_plays =  temp_df.nlargest(num_to_select, columns=['total_prob'])
    top_n_plays = top_n_plays.reset_index()
    #print(top_n_plays['time_left_qtr'].values)
    return top_n_plays['time_left_qtr'].values

In [24]:
for i in range(len(test_start_position)):
    altered_df_game = altered_df_test.iloc[test_start_position[i]:test_end_positions[i]].copy()
    unaltered_df_game = unaltered_df_test.iloc[test_start_position[i]:test_end_positions[i]].copy()
    quarters = ['1st','2nd','3rd','4th']
    unaltered_df_game['predicted'] = 0
    #altered_df_game['predicted'] = 0
    for quarter in quarters:
        mask_q = unaltered_df_game['quarter'] == quarter
        curr_qtr_df = unaltered_df_game[mask_q]
        if INC_FTS and INC_TIES:
            selected_play_times = select_unique_highlights_qtr(curr_qtr_df, highlights_per_qtr.get(quarter, 0))
        else:
            raise NotImplementedError

        unaltered_df_game[unaltered_df_game['time_left_qtr'].isin(selected_play_times) & mask_q] = 1
        #altered_df_game.loc[selected_plays.index, 'predicted'] = 1

    y_pred = unaltered_df_game['predicted']
    y_truth = unaltered_df_game['is_highlight']
    accuracy = (y_truth == y_pred).mean()
    pred_accuracies.append(accuracy)
    print(f"Game prediction accuracy: {accuracy:.3%}")
    all_preds.extend(y_pred)
    all_labels.extend(y_truth)



Game prediction accuracy: 96.766%
Game prediction accuracy: 96.296%
Game prediction accuracy: 96.573%
Game prediction accuracy: 98.171%
Game prediction accuracy: 96.232%
Game prediction accuracy: 95.924%
Game prediction accuracy: 96.809%
Game prediction accuracy: 96.243%
Game prediction accuracy: 97.668%
Game prediction accuracy: 97.205%
Game prediction accuracy: 98.128%
Game prediction accuracy: 96.238%
Game prediction accuracy: 95.946%
Game prediction accuracy: 96.570%
Game prediction accuracy: 97.541%
Game prediction accuracy: 96.694%
Game prediction accuracy: 96.793%
Game prediction accuracy: 96.667%
Game prediction accuracy: 96.758%
Game prediction accuracy: 98.860%
Game prediction accuracy: 97.345%
Game prediction accuracy: 96.791%
Game prediction accuracy: 95.592%
Game prediction accuracy: 97.199%
Game prediction accuracy: 98.623%
Game prediction accuracy: 97.878%
Game prediction accuracy: 97.626%
Game prediction accuracy: 97.406%
Game prediction accuracy: 97.403%
Game predictio

In [25]:
print(f"Game prediction Final Mean Accuracy: {(sum(pred_accuracies)/len(pred_accuracies)):.3%}")
print(classification_report(all_labels, all_preds, digits=3))
unaltered_df_test['predicted'] = all_preds

Game prediction Final Mean Accuracy: 96.684%
              precision    recall  f1-score   support

           0      0.965     1.000     0.982     64245
           1      1.000     0.669     0.802      7133

    accuracy                          0.967     71378
   macro avg      0.982     0.835     0.892     71378
weighted avg      0.968     0.967     0.964     71378



In [26]:
unaltered_df_test.drop(columns=['time_left_qtr_sec'])

unaltered_df_test.to_csv("/Users/galishai/PycharmProjects/AI_PROJECT_SPORTS_HIGHLIGHTS/Learning/full game highlight classification/predicted_output_test.csv", index=False)

# Data Exploration

In [44]:
altered_df_game = altered_df.iloc[test_start_position[0]:test_end_positions[0]].copy()
unaltered_df_game = unaltered_df.iloc[test_start_position[0]:test_end_positions[0]].copy()
altered_df_game

Unnamed: 0,time_left_qtr,distance,win_difference,games_played,win_percentage,home_score,away_score,team_fouls_qtr,team_turnovers,player_rebounds,...,stolen_by_Wesley Matthews,stolen_by_Xavier Moon,stolen_by_Xavier Tillman,stolen_by_Yuta Watanabe,stolen_by_Zach Collins,stolen_by_Zach LaVine,stolen_by_Zavier Simpson,stolen_by_Zeke Nnaji,stolen_by_Ziaire Williams,stolen_by_Zion Williamson
0,702,0,1,1,0.0,0,2,0,0,0,...,False,False,False,False,False,False,False,False,False,False
1,675,2,1,1,100.0,2,2,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,657,3,1,1,0.0,2,5,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,640,2,1,1,100.0,4,5,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,633,3,1,1,0.0,4,8,0,0,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,425,0,1,1,100.0,12,17,2,2,2,...,False,False,False,False,False,False,False,False,False,False
398,425,0,1,1,100.0,12,17,2,3,2,...,False,False,False,False,False,False,False,False,False,False
399,424,0,1,1,0.0,12,17,3,1,0,...,False,False,False,False,False,False,False,False,False,False
400,415,2,1,1,100.0,12,17,2,3,2,...,False,False,False,False,False,False,False,False,False,False


In [46]:
unaltered_df_game['time_left_qtr'] = altered_df_game['time_left_qtr']
unaltered_df_game


Unnamed: 0,time_left_qtr,play,distance,quarter,home_team,away_team,current_team,name,assister,win_difference,...,Oncourt_Player_26,Oncourt_Player_27,Oncourt_Player_28,Oncourt_Player_29,Oncourt_Player_30,Oncourt_Player_31,Oncourt_Player_32,Oncourt_Player_33,date,is_highlight
0,702,47,0,1st,NUGGETS,LAKERS,LAKERS,Anthony Davis,D'Angelo Russell,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
1,675,28,2,1st,NUGGETS,LAKERS,NUGGETS,Nikola Jokic,Jamal Murray,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
2,657,0,3,1st,NUGGETS,LAKERS,LAKERS,Taurean Prince,LeBron James,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
3,640,2,2,1st,NUGGETS,LAKERS,NUGGETS,Jamal Murray,Blank,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
4,633,0,3,1st,NUGGETS,LAKERS,LAKERS,Taurean Prince,LeBron James,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,425,54,0,1st,WARRIORS,SUNS,SUNS,Kevin Durant,Blank,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
398,425,8,0,1st,WARRIORS,SUNS,SUNS,Kevin Durant,Blank,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
399,424,8,0,1st,WARRIORS,SUNS,WARRIORS,Kevon Looney,Blank,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0
400,415,3,2,1st,WARRIORS,SUNS,SUNS,Kevin Durant,Blank,1,...,0,0,0,0,0,0,0,0,"October 25, 2023",0


In [74]:
game_highlights = unaltered_df_game.drop(unaltered_df_game[unaltered_df_game.is_highlight == 0].index)

num_unique_highlights = game_highlights.groupby(['quarter'])['time_left_qtr'].nunique()
num_unique_highlights

quarter
1st    8
2nd    4
3rd    6
4th    6
Name: time_left_qtr, dtype: int64

In [77]:
altered_df_all = altered_df.copy()
unaltered_df_all = unaltered_df.copy()

unaltered_df_all['time_left_qtr'] = altered_df_all['time_left_qtr']
unaltered_df_all['game_id'] = altered_df_all['game_id']

df_all_game_highlights = unaltered_df_all.drop(unaltered_df_all[unaltered_df_all.is_highlight == 0].index)

num_unique_highlights_per_game = df_all_game_highlights.groupby(['game_id', 'quarter'])['time_left_qtr'].nunique()

In [79]:
num_unique_highlights_per_game

game_id  quarter
1        1st        5
         2nd        4
         3rd        6
         4th        6
2        1st        6
                   ..
954      4th        4
955      1st        3
         2nd        6
         3rd        4
         4th        5
Name: time_left_qtr, Length: 3745, dtype: int64

In [165]:
all_games_highlights = unaltered_df_all.drop(unaltered_df_all[(unaltered_df_all.is_highlight == 0)].index)


In [167]:
unique_highlights_per_game_qtr = all_games_highlights.groupby(['game_id','quarter']).agg(avg_num_highlights = ('time_left_qtr','nunique'))
unique_highlights_per_game_qtr

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_num_highlights
game_id,quarter,Unnamed: 2_level_1
1,1st,5
1,2nd,4
1,3rd,6
1,4th,6
2,1st,6
...,...,...
954,4th,4
955,1st,3
955,2nd,6
955,3rd,4


## Unique highlights per quarter

In [160]:
unique_highlights_each_quarter = unique_highlights_per_game_qtr.groupby(['quarter']).agg(avg_highlights_qtr = ('avg_num_highlights', 'mean'))
unique_highlights_each_quarter

Unnamed: 0_level_0,avg_highlights_qtr
quarter,Unnamed: 1_level_1
1st,3.674493
2nd,4.437037
3rd,4.497366
4th,4.977024


## Unique highlights per interval

In [145]:
interval_groups = np.arange(0,721,720)
interval_groups


array([  0, 720])

In [171]:
per_interval = all_games_highlights.groupby(['game_id','quarter', pd.cut(all_games_highlights.time_left_qtr, interval_groups)]).agg(num_highlights = ('time_left_qtr','nunique'))

unique_players_per_interval = all_games_highlights.groupby(['game_id','quarter', pd.cut(all_games_highlights.time_left_qtr, interval_groups)]).agg(num_players = ('name','nunique'))



  per_interval = all_games_highlights.groupby(['game_id','quarter', pd.cut(all_games_highlights.time_left_qtr, interval_groups)]).agg(num_highlights = ('time_left_qtr','nunique'))
  unique_players_per_interval = all_games_highlights.groupby(['game_id','quarter', pd.cut(all_games_highlights.time_left_qtr, interval_groups)]).agg(num_players = ('name','nunique'))


In [172]:
unique_players_per_interval

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_players
game_id,quarter,time_left_qtr,Unnamed: 3_level_1
1,1st,"(0, 720]",2
1,2nd,"(0, 720]",6
1,3rd,"(0, 720]",6
1,4th,"(0, 720]",4
2,1st,"(0, 720]",6
...,...,...,...
954,4th,"(0, 720]",5
955,1st,"(0, 720]",2
955,2nd,"(0, 720]",5
955,3rd,"(0, 720]",5


In [150]:
per_game = per_interval.groupby(['quarter', 'time_left_qtr']).agg(num_highlights=('num_highlights','mean'), unique_players=('unique_players','mean'))
per_game

  per_game = per_interval.groupby(['quarter', 'time_left_qtr']).agg(num_highlights=('num_highlights','mean'), unique_players=('unique_players','mean'))


Unnamed: 0_level_0,Unnamed: 1_level_0,num_highlights,unique_players
quarter,time_left_qtr,Unnamed: 2_level_1,Unnamed: 3_level_1
1st,"(0, 720]",3.561257,3.148691
2nd,"(0, 720]",4.322513,3.843979
3rd,"(0, 720]",4.408377,3.832461
4th,"(0, 720]",4.662827,3.925654
