In [2]:
import team_individual_stat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import leaguegamelog

In [3]:
#!pip install nba_api
#!pip install seaborn

## RETRIEVE DATA FOR ML PART

In [4]:
from nba_api.stats.endpoints import leaguegamelog

# Fetch all PLAYER logs (not team logs) for the 2023-24 Playoffs
logs = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Playoffs',
    player_or_team_abbreviation='P'   # 'P' for players, 'T' for teams
)

# First frame returned by the endpoint; the printed column list below shows it
# holds one row per player per game.
players = logs.get_data_frames()[0]

# Print the available columns; they include PLAYER_ID and PLAYER_NAME
print(players.columns.tolist())

# Extract the unique player IDs (kept as a plain Python list for later cells)
player_ids = players['PLAYER_ID'].unique().tolist()
print(f"{len(player_ids)} players appeared in the 2023-24 Playoffs")
print("Sample player IDs:", player_ids[:10])


['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE']
214 players appeared in the 2023-24 Playoffs
Sample player IDs: [1629636, 1628978, 2544, 202704, 203076, 203200, 203484, 203915, 203932, 203999]


In [5]:
# Display every playoff player ID extracted in the previous cell (long output).
player_ids

[1629636,
 1628978,
 2544,
 202704,
 203076,
 203200,
 203484,
 203915,
 203932,
 203999,
 1626156,
 1629008,
 1629060,
 1629637,
 1630559,
 201587,
 202699,
 202711,
 1627741,
 1628404,
 1628973,
 1629011,
 1630178,
 1630194,
 1630540,
 204060,
 1627747,
 1627777,
 1628365,
 1628976,
 1629021,
 1628392,
 1628384,
 1630241,
 1629622,
 1628386,
 1628378,
 1628371,
 203914,
 1630591,
 1631094,
 201142,
 201152,
 201569,
 202738,
 203078,
 203933,
 203937,
 1626157,
 1626164,
 1628420,
 1628960,
 1629006,
 1629162,
 1629234,
 1629626,
 1629642,
 1630183,
 1630568,
 1631111,
 200768,
 203954,
 1626162,
 1626166,
 1630532,
 1630171,
 1630175,
 1630596,
 201599,
 1627750,
 1627752,
 1629216,
 1631128,
 1631212,
 201144,
 203497,
 203994,
 1626220,
 1629638,
 1629675,
 1630162,
 1629611,
 1628467,
 1627884,
 1627826,
 203957,
 203939,
 202681,
 202331,
 1627759,
 1631170,
 1631107,
 1629639,
 1628997,
 204001,
 201950,
 1629655,
 201567,
 1631119,
 1630700,
 1630529,
 1630198,
 1629652,
 1629

In [6]:
import time
import pandas as pd
from nba_api.stats.endpoints import playergamelog

def fetch_last_n_regular_season_games(player_ids, season, n=20, pause=1.0):
    """
    Fetch the LAST n Regular Season games (PTS, AST) for each player_id
    in player_ids, and return one combined DataFrame with columns:
        ['PLAYER_ID','GAME_DATE','PTS','AST']

    Rows are grouped per player in the order of ``player_ids`` and, within a
    player, sorted chronologically (oldest first).

    Parameters:
        player_ids (list of int): List of NBA player IDs.
        season (str): NBA season in 'YYYY-YY' format, e.g. '2023-24'.
        n (int): Number of most recent games to retrieve per player.
        pause (float): Seconds to sleep before every API request
            (rate limiting). Defaults to 1 second.

    Returns:
        pandas.DataFrame: the stacked per-player rows. A player whose request
        fails (a warning is printed) or who has no usable game contributes no
        rows; if nobody contributes, an empty frame with the four columns is
        returned.
    """
    columns = ['PLAYER_ID', 'GAME_DATE', 'PTS', 'AST']
    records = []

    for pid in player_ids:
        try:
            # 1) Rate-limit pause
            time.sleep(pause)

            # 2) Fetch all Regular-Season logs for this player
            logs = playergamelog.PlayerGameLog(
                player_id=pid,
                season=season,
                season_type_all_star='Regular Season'
            )
            raw = logs.get_data_frames()[0]

            # 3) Normalize column names to uppercase FIRST, so every later
            #    column lookup ('MIN', 'GAME_DATE', ...) works regardless of
            #    the casing used by the endpoint. rename() returns a new
            #    frame, so the API result is not mutated.
            games = raw.rename(columns=str.upper)

            # 4) Keep only games where the player actually logged minutes
            games = games[games['MIN'].notna()]

            # 5) Parse dates, keep the n most recent games in chronological
            #    order, and tag the rows with the player id. assign() builds
            #    new frames, which avoids SettingWithCopyWarning on the
            #    filtered slice.
            games = (
                games.assign(GAME_DATE=pd.to_datetime(games['GAME_DATE']))
                .sort_values('GAME_DATE', ascending=True)
                .tail(n)
                .assign(PLAYER_ID=pid)
            )

            # 6) Nothing usable for this player: contribute no rows
            if games.empty:
                continue

            # 7) Keep only the columns we care about
            records.append(games[columns])

        except Exception as e:
            # Best effort: one failing player must not abort the whole batch.
            print(f"Warning: could not fetch last {n} regular games for player {pid}: {e}")
            continue

    # 8) Concatenate all players' DataFrames, or return empty template if none
    if records:
        return pd.concat(records, ignore_index=True)
    return pd.DataFrame(columns=columns)


In [7]:
# 1) Re-fetch the 2023-24 Playoffs player logs and rebuild `player_ids`.
#    NOTE(review): this repeats the request made in the earlier cell and
#    overwrites the `player_ids` defined there.
plogs = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Playoffs',
    player_or_team_abbreviation='P'
).get_data_frames()[0]
plogs.columns = [c.upper() for c in plogs.columns]
player_ids = plogs['PLAYER_ID'].unique().tolist()

# 2) Fetch last 20 regular games for each playoff player
#    (one API request per player, so this cell is slow to run)
reg20_df = fetch_last_n_regular_season_games(player_ids, '2023-24', n=20)

# 3) Quick sanity check of the result
print(reg20_df.shape)  
print(reg20_df.head(10))

(4185, 4)
   PLAYER_ID  GAME_DATE  PTS  AST
0    1629636 2024-03-06   15    7
1    1629636 2024-03-08   34    8
2    1629636 2024-03-10   14    5
3    1629636 2024-03-11   30    5
4    1629636 2024-03-13   27   11
5    1629636 2024-03-16   12    4
6    1629636 2024-03-18   13    7
7    1629636 2024-03-20   20    9
8    1629636 2024-03-22   19    4
9    1629636 2024-03-24    9    3


In [8]:
# Preview the first 50 rows (covers more than one player's 20-game window).
print(reg20_df.head(50))

    PLAYER_ID  GAME_DATE  PTS  AST
0     1629636 2024-03-06   15    7
1     1629636 2024-03-08   34    8
2     1629636 2024-03-10   14    5
3     1629636 2024-03-11   30    5
4     1629636 2024-03-13   27   11
5     1629636 2024-03-16   12    4
6     1629636 2024-03-18   13    7
7     1629636 2024-03-20   20    9
8     1629636 2024-03-22   19    4
9     1629636 2024-03-24    9    3
10    1629636 2024-03-25   15   10
11    1629636 2024-03-27   14   12
12    1629636 2024-03-29   14   12
13    1629636 2024-03-31    5    7
14    1629636 2024-04-02    8    8
15    1629636 2024-04-03   15    8
16    1629636 2024-04-06   26    2
17    1629636 2024-04-07   28    8
18    1629636 2024-04-10   16    9
19    1629636 2024-04-12   16    4
20    1628978 2024-03-08   11    6
21    1628978 2024-03-10   15    3
22    1628978 2024-03-12   16    4
23    1628978 2024-03-14   12    3
24    1628978 2024-03-16   15    2
25    1628978 2024-03-18   18    1
26    1628978 2024-03-21   11    4
27    1628978 2024-0

In [9]:
# (rows, columns) of the stacked regular-season frame.
reg20_df.shape

(4185, 4)

In [10]:
def fetch_first_playoff_games(player_ids, season, pause=1.0):
    """
    For each player_id, fetch their first Playoff game of the given season
    in which they actually logged minutes.

    Parameters:
        player_ids (list of int): List of NBA player IDs.
        season (str): NBA season in 'YYYY-YY' format, e.g. '2023-24'.
        pause (float): Seconds to sleep before every API request
            (rate limiting). Defaults to 1 second.

    Returns:
        pandas.DataFrame with columns ['PLAYER_ID','GAME_DATE','PTS','AST'],
        one row per player that has at least one playoff game with a
        parseable date. Players whose request fails (a warning is printed) or
        who have no such game are omitted.
    """
    columns = ['PLAYER_ID', 'GAME_DATE', 'PTS', 'AST']
    records = []

    for pid in player_ids:
        try:
            # 1) Rate-limit pause
            time.sleep(pause)

            # 2) Fetch *all* playoff logs for this player
            logs = playergamelog.PlayerGameLog(
                player_id=pid,
                season=season,
                season_type_all_star='Playoffs'
            )
            raw = logs.get_data_frames()[0]

            # 3) Normalize column names to uppercase FIRST, so the lookups
            #    below do not depend on the casing used by the endpoint.
            games = raw.rename(columns=str.upper)

            # 4) Keep only games where they logged minutes
            games = games[games['MIN'].notna()]

            # 5) Parse dates; unparseable values become NaT and are dropped.
            #    (`infer_datetime_format` is deprecated in pandas 2.x, where
            #    format inference is already the default, so it is not passed.)
            games = games.assign(
                GAME_DATE=pd.to_datetime(games['GAME_DATE'], errors='coerce')
            )
            games = games[games['GAME_DATE'].notna()]
            if games.empty:
                continue

            # 6) Earliest remaining game is the player's first playoff game
            first = games.sort_values('GAME_DATE').iloc[0]
            records.append({
                'PLAYER_ID': pid,
                'GAME_DATE': first['GAME_DATE'],
                'PTS': first['PTS'],
                'AST': first['AST']
            })

        except Exception as e:
            # Best effort: one failing player must not abort the whole batch.
            print(f"Warning: could not fetch first playoff game for player {pid}: {e}")
            continue

    return pd.DataFrame(records, columns=columns)

In [11]:
# First playoff game per player for 2023-24 (one API request per player, so this is slow).
first_playoffs = fetch_first_playoff_games(player_ids, '2023-24')


In [12]:
# Preview the playoff targets (one row per player).
first_playoffs.head(10)

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
0,1629636,2024-05-03,21,5
1,1628978,2024-05-02,23,7
2,2544,2024-04-20,27,8
3,202704,2024-05-04,5,1
4,203076,2024-04-20,32,5
5,203200,2024-05-04,3,0
6,203484,2024-05-04,6,3
7,203915,2024-04-20,0,1
8,203932,2024-05-04,9,6
9,203999,2024-05-04,32,9


In [13]:
def build_Xy_from_groups(reg20_df, first_playoffs, window=20):
    """
    Build aligned model inputs/targets from per-player game logs.

    reg20_df: DataFrame of regular-season rows,
              columns ['PLAYER_ID','GAME_DATE','PTS','AST']
    first_playoffs: DataFrame with one row per player
              columns ['PLAYER_ID','GAME_DATE','PTS','AST']
              (if a player appears more than once, the first row is used)
    window: number of regular-season rows a player must have (exactly) to be
            included; defaults to 20.

    A player is included only if they have exactly `window` regular-season
    rows AND a row in `first_playoffs`, so X[i] and y[i] always describe the
    same player.

    Returns:
      X: np.array of shape (n_players, window, 2) -- (PTS, AST) per game,
         in chronological order
      y: np.array of shape (n_players, 2) -- (PTS, AST) of the playoff row
      Both are empty (n_players == 0) when no player qualifies.
    """
    features = ['PTS', 'AST']

    # One target row per player, indexed by PLAYER_ID for direct lookup.
    targets = (
        first_playoffs
        .drop_duplicates('PLAYER_ID', keep='first')
        .set_index('PLAYER_ID')[features]
    )

    # Ensure data is sorted by date per player (new frame; input is untouched)
    ordered = reg20_df.sort_values(['PLAYER_ID', 'GAME_DATE'])

    X_list, y_list = [], []
    for pid, group in ordered.groupby('PLAYER_ID'):
        # Check BOTH conditions before appending anything: appending the input
        # sequence first and then skipping the target would leave X and y with
        # different lengths and shift every later pair.
        if len(group) != window or pid not in targets.index:
            continue

        X_list.append(group[features].values)   # (window, 2)
        y_list.append(targets.loc[pid].values)  # (2,)

    if not X_list:
        # np.stack raises on an empty list; return well-shaped empty arrays.
        return (np.empty((0, window, len(features))),
                np.empty((0, len(features))))

    X = np.stack(X_list, axis=0)   # (n_players, window, 2)
    y = np.vstack(y_list)          # (n_players, 2)
    return X, y

# ----------------------
# Example usage:
#
# The two inputs are produced by the (slow, network-bound) fetch calls below;
# they are left commented out because the frames already exist from earlier
# cells.

# first_playoffs = fetch_first_playoff_games(player_ids, '2023-24')
# reg20_df   = fetch_last_n_regular_season_games(player_ids, '2023-24', n=20)

X, y = build_Xy_from_groups(reg20_df, first_playoffs)
print("X shape:", X.shape)  # (n_players,20,2)
print("y shape:", y.shape)  # (n_players,2)

X shape: (202, 20, 2)
y shape: (202, 2)


In [14]:
#!pip install --upgrade pip


In [19]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# ——— 1. Prepare the data ———
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs of this cell.
set_random_seed(42)

W = 20  # window: number of regular-season games fed to the RNN per player

# Sort so that every player's games are in chronological order
reg20_df = reg20_df.sort_values(['PLAYER_ID','GAME_DATE'])
first_playoffs = first_playoffs.sort_values(['PLAYER_ID','GAME_DATE'])

# Keep only players that appear in both the regular-season and playoff frames
players = np.intersect1d(reg20_df['PLAYER_ID'].unique(),
                         first_playoffs['PLAYER_ID'].unique())

# kept_ids[i], X_list[i] and y_list[i] always refer to the same player
kept_ids, X_list, y_list = [], [], []
for pid in players:
    reg = reg20_df[reg20_df['PLAYER_ID']==pid].sort_values('GAME_DATE')
    if len(reg) < W:
        continue
    kept_ids.append(pid)
    # X: last W regular-season games, shape (W, 2)
    X_list.append(reg[['PTS','AST']].values[-W:])
    # y: first playoff game, shape (2,)
    row = first_playoffs[first_playoffs['PLAYER_ID']==pid].iloc[0]
    # Cast explicitly: a row taken from a mixed-dtype frame is object-typed.
    y_list.append(row[['PTS','AST']].to_numpy(dtype=float))

X = np.stack(X_list).astype(float)   # (n_players, W, 2)
y = np.stack(y_list)                 # (n_players, 2)

print("X shape:", X.shape)
print("y shape:", y.shape)

# ——— 2. Train / Test split ———
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# ——— 3. Scale ———
# Scalers are fitted on the training split only; games are flattened to
# (n_samples * W, F) so each feature gets a single min/max.
F = X_train.shape[2]
scaler_X = MinMaxScaler().fit(X_train.reshape(-1, F))
X_train_s = scaler_X.transform(X_train.reshape(-1, F)).reshape(X_train.shape)
X_test_s  = scaler_X.transform(X_test.reshape(-1, F)).reshape(X_test.shape)

scaler_y = MinMaxScaler().fit(y_train)
y_train_s = scaler_y.transform(y_train)
y_test_s  = scaler_y.transform(y_test)

# ——— 4. Build & train global RNN ———
model = Sequential([
    SimpleRNN(64, input_shape=(W, F), activation='tanh'),
    Dense(2)
])
model.compile(optimizer=Adam(1e-3), loss='mse', metrics=['mae'])
model.summary()

# NOTE(review): the test split doubles as validation data. That is fine for
# monitoring only; do not tune epochs/architecture on val_* without carving
# out a separate validation split first.
history = model.fit(
    X_train_s, y_train_s,
    validation_data=(X_test_s, y_test_s),
    epochs=50, batch_size=16, verbose=1
)

# ——— 5. Evaluate on test set ———
y_pred_s = model.predict(X_test_s, verbose=0)
y_pred = scaler_y.inverse_transform(y_pred_s)
y_true = y_test  # already unscaled

print("Test MAE PTS:", mean_absolute_error(y_true[:,0], y_pred[:,0]))
print("Test MAE AST:", mean_absolute_error(y_true[:,1], y_pred[:,1]))

# ——— 6. Inference for every player ———
# One batched predict over the windows already stored in X replaces one
# predict call per player.
X_all_s = scaler_X.transform(X.reshape(-1, F)).reshape(X.shape)
y_all = scaler_y.inverse_transform(model.predict(X_all_s, verbose=0))
y_all = np.round(np.maximum(y_all, 0))  # clip negatives & round

results = [
    {
        'PLAYER_ID': pid,
        'PTS_true': truth[0],
        'AST_true': truth[1],
        'PTS_pred': pred[0],
        'AST_pred': pred[1],
    }
    for pid, truth, pred in zip(kept_ids, y, y_all)
]

results_df = pd.DataFrame(results)
print(results_df.head())
# NOTE: these "overall" figures include the training players, so they are
# optimistic; the Test MAE printed above is the honest estimate.
print("Overall MAE PTS:", mean_absolute_error(results_df['PTS_true'], results_df['PTS_pred']))
print("Overall MAE AST:", mean_absolute_error(results_df['AST_true'], results_df['AST_pred']))


X shape: (202, 20, 2)
y shape: (202, 2)


Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0449 - mae: 0.1603 - val_loss: 0.0362 - val_mae: 0.1374
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0218 - mae: 0.1137 - val_loss: 0.0330 - val_mae: 0.1203
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0207 - mae: 0.1041 - val_loss: 0.0331 - val_mae: 0.1252
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0193 - mae: 0.1025 - val_loss: 0.0321 - val_mae: 0.1183
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0201 - mae: 0.1049 - val_loss: 0.0365 - val_mae: 0.1282
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0193 - mae: 0.1065 - val_loss: 0.0320 - val_mae: 0.1222
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0207 

In [20]:
# reg20_df was re-sorted by PLAYER_ID in the modelling cell, so the first rows now belong to the lowest ID.
reg20_df.head(20)

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
40,2544,2024-02-28,34,8
41,2544,2024-02-29,31,9
42,2544,2024-03-02,26,9
43,2544,2024-03-04,19,8
44,2544,2024-03-06,31,13
45,2544,2024-03-10,29,9
46,2544,2024-03-13,18,9
47,2544,2024-03-16,40,9
48,2544,2024-03-18,25,10
49,2544,2024-03-22,20,6


In [21]:
# Playoff target row for player 2544 (the player shown in the previous cell's output).
first_playoffs[first_playoffs['PLAYER_ID'] == 2544]

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
2,2544,2024-04-20,27,8


In [25]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

# Assumes reg20_df and first_playoffs are already loaded
from tensorflow.keras.backend import clear_session

W = 20  # window size
EPOCHS = 100

results = []

# NOTE(review): each model below is fitted on a single sample whose target is
# the last game of that same window (see the fit call), and the playoff row is
# used only for scoring. The "prediction" is therefore essentially the player's
# last regular-season game, not a learned playoff forecast.
# NOTE(review): no random seed is set in this cell, so results vary between runs.

# Loop over every player
for player_id in reg20_df['PLAYER_ID'].unique():
    reg_player = reg20_df[reg20_df['PLAYER_ID'] == player_id].sort_values('GAME_DATE')
    playoff_row = first_playoffs[first_playoffs['PLAYER_ID'] == player_id]

    # Skip players without enough data
    if len(reg_player) < W or playoff_row.empty:
        continue

    # Build X from the regular season
    X = reg_player[['PTS','AST']].values  # (n_games, 2)
    X = X[-W:]                            # keep exactly the last W games
    X_len = len(X)

    # Ground-truth playoff values
    y_true = playoff_row[['PTS','AST']].values[0]

    # Scale (per player; the same scaler maps the prediction back)
    scaler_X = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    X_scaled = X_scaled.reshape(1, W, 2)  # (1,20,2)

    # Release the Keras global state left by the previous player's model;
    # without this, building a model per loop iteration keeps accumulating
    # state and memory.
    clear_session()

    # Build small RNN for this player
    model = Sequential([
        SimpleRNN(32, input_shape=(W, 2), activation='tanh'),
        Dense(2)
    ])
    model.compile(optimizer=Adam(1e-2), loss='mse', metrics=['mae'])

    # Train (self reconstruct last frame)
    model.fit(X_scaled, X_scaled[:, -1, :], epochs=EPOCHS, batch_size=1, verbose=0)

    # Predict playoff
    y_pred_scaled = model.predict(X_scaled, verbose=0)
    y_pred = scaler_X.inverse_transform(y_pred_scaled)[0]

    # Clip negatives and round to whole numbers
    y_pred = np.round(np.maximum(y_pred, 0))

    # Store the result
    results.append({
        'PLAYER_ID': player_id,
        'X_len': X_len,
        'PTS_true': float(y_true[0]),
        'AST_true': float(y_true[1]),
        'PTS_pred': float(y_pred[0]),
        'AST_pred': float(y_pred[1]),
    })

# --- Process the results ---
results_df = pd.DataFrame(results)
print(results_df.head())

# Overall MAE
print("Final MAE PTS:", mean_absolute_error(results_df['PTS_true'], results_df['PTS_pred']))
print("Final MAE AST:", mean_absolute_error(results_df['AST_true'], results_df['AST_pred']))


   PLAYER_ID  X_len  PTS_true  AST_true  PTS_pred  AST_pred
0       2544     20      27.0       8.0      28.0      17.0
1     200768     20       0.0       3.0       0.0       8.0
2     200782     20       3.0       0.0       0.0       0.0
3     201142     20      31.0       1.0      15.0       1.0
4     201143     20      10.0       0.0       6.0       1.0
Final MAE PTS: 6.128712871287129
Final MAE AST: 1.801980198019802


In [26]:
# Per-player true vs. predicted values from the per-player RNN loop above.
print(results_df.head(20))

    PLAYER_ID  X_len  PTS_true  AST_true  PTS_pred  AST_pred
0        2544     20      27.0       8.0      28.0      17.0
1      200768     20       0.0       3.0       0.0       8.0
2      200782     20       3.0       0.0       0.0       0.0
3      201142     20      31.0       1.0      15.0       1.0
4      201143     20      10.0       0.0       6.0       1.0
5      201144     20      14.0      10.0      17.0       2.0
6      201152     20       0.0       0.0       5.0       0.0
7      201566     20       6.0       0.0       4.0       0.0
8      201567     20       2.0       0.0       0.0       0.0
9      201568     20       1.0       0.0      11.0       0.0
10     201569     20       0.0       1.0       2.0       3.0
11     201572     20      20.0       1.0      11.0       0.0
12     201587     20      16.0       2.0       0.0       5.0
13     201599     20       0.0       0.0       0.0       0.0
14     201935     20       7.0       7.0       4.0       5.0
15     201950     20    