In [1]:
import team_individual_stat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import leaguegamelog



In [2]:
#!pip install nba_api
#!pip install seaborn

## RETRIEVE DATA FOR ML PART

In [29]:
from nba_api.stats.endpoints import leaguegamelog

# Fetch all PLAYER logs (not team logs) for the 2023-24 Playoffs
logs = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Playoffs',
    player_or_team_abbreviation='P'   # ← 'P' for players, 'T' for teams :contentReference[oaicite:0]{index=0}
)

players = logs.get_data_frames()[0]

# Now df.columns will include PLAYER_ID and PLAYER_NAME
print(players.columns.tolist())

# Extract unique player IDs
player_ids = players['PLAYER_ID'].unique().tolist()
print(f"{len(player_ids)} players appeared in the 2023-24 Playoffs")
print("Sample player IDs:", player_ids[:10])


['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE']
192 players appeared in the 2023-24 Playoffs
Sample player IDs: [1641709, 1631128, 202699, 1628969, 1628973, 1629011, 1629013, 1630191, 1641842, 1628384]


In [30]:
player_ids

[1641709,
 1631128,
 202699,
 1628969,
 1628973,
 1629011,
 1629013,
 1630191,
 1641842,
 1628384,
 203471,
 1626166,
 1630595,
 1628404,
 1631105,
 1626157,
 1627736,
 1630540,
 203501,
 2544,
 203458,
 203497,
 1629003,
 1629020,
 1629060,
 1629216,
 1629637,
 1629638,
 1630162,
 1630545,
 1630559,
 1630568,
 1630692,
 1631159,
 1631169,
 1641740,
 1642261,
 1642355,
 201144,
 203944,
 1627827,
 1628978,
 1629029,
 1629675,
 1630183,
 203999,
 1627826,
 1627750,
 201572,
 203507,
 1626167,
 1626192,
 1627752,
 1628398,
 1629614,
 1630167,
 1630169,
 1630174,
 1631157,
 1642277,
 1641748,
 1641753,
 1630579,
 1631097,
 1631260,
 1628418,
 1626171,
 1627783,
 204456,
 1629018,
 1641716,
 1629645,
 201587,
 203992,
 1627732,
 1627884,
 1629008,
 1629618,
 1631212,
 201566,
 201935,
 202695,
 203932,
 1626181,
 1627739,
 201143,
 1631170,
 1631107,
 1629750,
 1629731,
 1629643,
 1629639,
 1629636,
 1629631,
 1629622,
 1628386,
 203937,
 202692,
 1630532,
 1631216,
 1628976,
 1630573,
 20

In [31]:
import time
import pandas as pd
from nba_api.stats.endpoints import playergamelog

def fetch_last_n_regular_season_games(player_ids, season, n=20):
    """
    Fetch the LAST n completed Regular Season games (PTS, AST) for each player_id
    in player_ids, and return one combined DataFrame with columns:
        ['PLAYER_ID','GAME_DATE','PTS','AST']
    
    Parameters:
        player_ids (list of int): List of NBA player IDs.
        season (str): NBA season in 'YYYY-YY' format, e.g. '2023-24'.
        n (int): Number of most recent games to retrieve per player.
    """
    records = []
    
    for pid in player_ids:
        try:
            # 1) Rate-limit pause
            time.sleep(1)
            
            # 2) Fetch all Regular-Season logs for this player
            logs = playergamelog.PlayerGameLog(
                player_id=pid,
                season=season,
                season_type_all_star='Regular Season'
            )
            df = logs.get_data_frames()[0]
            
            # 3) Keep only games where the player actually logged minutes
            df = df[df['MIN'].notna()]
            
            # 4) Normalize column names to uppercase (safer to select)
            df.columns = [c.upper() for c in df.columns]
            
            # 5) Sort by date descending to get most recent first
            df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])
            df = df.sort_values('GAME_DATE', ascending=False)
            
            # 6) Take only the top n games
            df = df.head(n)
            
            # 7) Re-sort ascending so your windows come in chronological order
            df = df.sort_values('GAME_DATE', ascending=True)
            
            # 8) Inject the player_id column
            df['PLAYER_ID'] = pid
            
            # 9) Keep only the columns we care about
            df = df[['PLAYER_ID','GAME_DATE','PTS','AST']]
            
            records.append(df)
        
        except Exception as e:
            print(f"Warning: could not fetch last {n} regular games for player {pid}: {e}")
            continue
    
    # 10) Concatenate all players' DataFrames, or return empty template if none
    if records:
        return pd.concat(records, ignore_index=True)
    else:
        return pd.DataFrame(columns=['PLAYER_ID','GAME_DATE','PTS','AST'])


In [32]:
plogs = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Playoffs',
    player_or_team_abbreviation='P'
).get_data_frames()[0]
plogs.columns = [c.upper() for c in plogs.columns]
player_ids = plogs['PLAYER_ID'].unique().tolist()

# 2) Fetch last 20 regular games for each playoff player
reg20_df = fetch_last_n_regular_season_games(player_ids, '2023-24', n=20)

print(reg20_df.shape)  
print(reg20_df.head(10))



KeyboardInterrupt: 

In [None]:
print(reg20_df.head(50))

    PLAYER_ID  GAME_DATE  PTS  AST
0      203497 2024-02-28    8    0
1      203497 2024-03-01   16    1
2      203497 2024-03-03   12    2
3      203497 2024-03-04   25    0
4      203497 2024-03-07   18    0
5      203497 2024-03-08    7    2
6      203497 2024-03-12    8    3
7      203497 2024-03-22    9    1
8      203497 2024-03-24   17    2
9      203497 2024-03-27   11    4
10     203497 2024-03-29   21    2
11     203497 2024-03-31   19    1
12     203497 2024-04-02   12    0
13     203497 2024-04-03   11    3
14     203497 2024-04-05    4    1
15     203497 2024-04-07   18    3
16     203497 2024-04-09   19    3
17     203497 2024-04-10   13    3
18     203497 2024-04-12   25    0
19     203497 2024-04-14   21    1
20     203484 2024-03-07   11    1
21     203484 2024-03-09   13    2
22     203484 2024-03-11    3    6
23     203484 2024-03-13    9    2
24     203484 2024-03-15    4    3
25     203484 2024-03-17   10    2
26     203484 2024-03-19    9    1
27     203484 2024-0

In [None]:
reg20_df.shape

(4185, 4)

In [None]:
def fetch_first_playoff_games(player_ids, season):
    """
    For each player_id, fetch their first Playoff game of the given season
    in which they actually logged minutes.  Returns a DataFrame with columns
    ['PLAYER_ID','GAME_DATE','PTS','AST'].
    """
    records = []
    
    for pid in player_ids:
        try:
            # 1) Rate-limit pause
            time.sleep(1)
            
            # 2) Fetch *all* playoff logs for this player
            logs = playergamelog.PlayerGameLog(
                player_id=pid,
                season=season,
                season_type_all_star='Playoffs'
            )
            df = logs.get_data_frames()[0]
            
            # 3) Keep only games where they logged minutes
            df = df[df['MIN'].notna()]
            
            # 4) Normalize column names to uppercase
            df.columns = [c.upper() for c in df.columns]
            
            # 5) Inject PLAYER_ID
            df['PLAYER_ID'] = pid
            
            # 6) Parse dates with inference (handles "Apr 30, 2024", "April 30, 2024", ISO, etc.)
            df['GAME_DATE'] = pd.to_datetime(
                df['GAME_DATE'], 
                infer_datetime_format=True, 
                errors='coerce'
            )
            # Drop any rows that failed to parse
            df = df[df['GAME_DATE'].notna()]
            
            # 7) Sort ascending and pick the first row
            df = df.sort_values('GAME_DATE')
            if df.empty:
                continue
            
            first = df.iloc[0]
            records.append({
                'PLAYER_ID': pid,
                'GAME_DATE': first['GAME_DATE'],
                'PTS': first['PTS'],
                'AST': first['AST']
            })
        
        except Exception as e:
            print(f"Warning: could not fetch first playoff game for player {pid}: {e}")
            continue

    return pd.DataFrame(records, columns=['PLAYER_ID','GAME_DATE','PTS','AST'])

In [None]:
first_playoffs = fetch_first_playoff_games(player_ids, '2023-24')


In [None]:
first_playoffs.head(10)

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
0,203497,2024-05-04,6,3
1,203484,2024-05-04,6,3
2,203915,2024-04-20,0,1
3,203932,2024-05-04,9,6
4,203999,2024-05-04,32,9
5,1626156,2024-04-20,13,3
6,1629008,2024-05-04,20,1
7,1629060,2024-04-20,7,1
8,1629637,2024-04-20,0,0
9,1630559,2024-04-20,13,3


In [None]:
def build_Xy_from_groups(reg20_df, first_playoffs):
    """
    reg20_df: DataFrame with exactly 20 reg-season rows per player 
              columns ['PLAYER_ID','GAME_DATE','PTS','AST']
    first_playoffs: DataFrame with one row per player 
              columns ['PLAYER_ID','GAME_DATE','PTS','AST']
    Returns:
      X: np.array of shape (n_players, 20, 2)
      y: np.array of shape (n_players, 2)
    """
    X_list, y_list = [], []
    
    # Ensure data is sorted by date per player
    reg20_df = reg20_df.sort_values(['PLAYER_ID','GAME_DATE'])
    
    for pid, group in reg20_df.groupby('PLAYER_ID'):
        if len(group) != 20:
            # skip any player if they don't have exactly 20 games
            continue
        
        # 1) Build the input sequence: shape (20,2)
        seq = group[['PTS','AST']].values
        X_list.append(seq)
        
        # 2) Find the first-playoff target for this player
        fp = first_playoffs[first_playoffs['PLAYER_ID']==pid]
        if fp.empty:
            continue
        target = fp[['PTS','AST']].iloc[0].values
        y_list.append(target)
    
    # Stack into numpy arrays
    X = np.stack(X_list, axis=0)   # (n_players, 20, 2)
    y = np.vstack(y_list)          # (n_players, 2)
    return X, y

# ----------------------
# Example usage:

# first_playoffs = fetch_first_playoff_games(player_ids, '2023-24')
# reg20_df   = fetch_last_n_regular_season_games(player_ids, '2023-24', n=20)

X, y = build_Xy_from_groups(reg20_df, first_playoffs)
print("X shape:", X.shape)  # (n_players,20,2)
print("y shape:", y.shape)  # (n_players,2)

X shape: (202, 20, 2)
y shape: (202, 2)


In [None]:
#!pip install --upgrade pip


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# 1) Split into train / val
#    (e.g. 80% train, 20% val)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# 2) Scale inputs and outputs
#    We flatten time & features into 2D for the scaler, then reshape back.
n_train, w, f = X_train.shape
scaler_X = MinMaxScaler()
X_train_flat = X_train.reshape(n_train * w, f)
X_train_scaled = scaler_X.fit_transform(X_train_flat).reshape(n_train, w, f)
X_val_flat = X_val.reshape(X_val.shape[0] * w, f)
X_val_scaled = scaler_X.transform(X_val_flat).reshape(X_val.shape[0], w, f)

scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled   = scaler_y.transform(y_val)

# 3) Build a “vanilla” RNN
model = Sequential([
    SimpleRNN(64, input_shape=(w, f), activation='tanh'),
    Dense(2)   # output: [PTS, AST]
])

model.compile(
    optimizer=Adam(1e-3),
    loss='mse',
    metrics=['mae']
)
model.summary()

# 4) Train
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_val_scaled, y_val_scaled),
    epochs=50,
    batch_size=16
)

# 5) Evaluate & predict
val_loss, val_mae = model.evaluate(X_val_scaled, y_val_scaled)
print(f"Validation MAE (scaled): {val_mae:.4f}")

# To get actual PTS/AST back:
y_pred_scaled = model.predict(X_val_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled)

# Compare a few
for i in range(5):
    print("True:", y_val[i], "Pred:", y_pred[i])


Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0444 - mae: 0.1588 - val_loss: 0.0533 - val_mae: 0.1542
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0370 - mae: 0.1432 - val_loss: 0.0337 - val_mae: 0.1326
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0201 - mae: 0.1071 - val_loss: 0.0305 - val_mae: 0.1256
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0190 - mae: 0.1024 - val_loss: 0.0292 - val_mae: 0.1256
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0210 - mae: 0.1094 - val_loss: 0.0369 - val_mae: 0.1357
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0227 - mae: 0.1110 - val_loss: 0.0332 - val_mae: 0.1341
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0206 

In [None]:
reg20_df.head(20)

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
1408,2544,2024-02-28,34,8
1409,2544,2024-02-29,31,9
1410,2544,2024-03-02,26,9
1411,2544,2024-03-04,19,8
1412,2544,2024-03-06,31,13
1413,2544,2024-03-10,29,9
1414,2544,2024-03-13,18,9
1415,2544,2024-03-16,40,9
1416,2544,2024-03-18,25,10
1417,2544,2024-03-22,20,6


In [None]:
first_playoffs[first_playoffs['PLAYER_ID'] == 2544]

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
72,2544,2024-04-20,27,8


In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Sắp xếp theo thời gian
reg20_df = reg20_df.sort_values(['PLAYER_ID', 'GAME_DATE'])

results = []

# Train mô hình riêng cho từng cầu thủ
for pid, group in reg20_df.groupby('PLAYER_ID'):
    if len(group) != 20:
        continue

    # Input: 20 trận gần nhất
    X = group[['PTS', 'AST']].values.reshape(1, 20, 2)

    # Output: trận playoff đầu tiên
    y_row = first_playoffs[first_playoffs['PLAYER_ID'] == pid]
    if y_row.empty:
        continue
    import random
    rand_pid = random.choice(first_playoffs['PLAYER_ID'].tolist())
    y_rand = first_playoffs[first_playoffs['PLAYER_ID'] == rand_pid][['PTS', 'AST']].values.reshape(1, 2)
    y = y_rand
    
    # Scale
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X.reshape(-1, 2)).reshape(1, 20, 2)
    y_scaled = scaler_y.fit_transform(y)

    # RNN cho 1 người
    model = Sequential([
        SimpleRNN(16, input_shape=(20, 2), activation='tanh'),
        Dense(2)
    ])
    model.compile(optimizer=Adam(1e-2), loss='mse', metrics=['mae'])
    model.fit(X_scaled, y_scaled, epochs=20, batch_size=1, verbose=0)

    # Dự đoán
    y_pred_scaled = model.predict(X_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # Lưu kết quả
    results.append({
        'PLAYER_ID': pid,
        'PTS_true': float(y[0][0]),
        'AST_true': float(y[0][1]),
        'PTS_pred': float(y_pred[0][0]),
        'AST_pred': float(y_pred[0][1]),
    })

# Kết quả tổng hợp
results_df = pd.DataFrame(results)
print(results_df.head())


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44

In [35]:
print(results_df.head(20))

    PLAYER_ID  PTS_true  AST_true   PTS_pred  AST_pred
0        2544       0.0       0.0   0.085079  0.100802
1      200768       3.0       1.0   2.912780  0.902498
2      200782       6.0       0.0   5.974120  0.007996
3      201142      20.0       2.0  19.990265  1.962977
4      201143       7.0       3.0   7.025530  2.925709
5      201144       0.0       1.0  -0.027878  1.064706
6      201152       6.0       5.0   6.040592  4.935461
7      201566      13.0       2.0  13.076576  1.948578
8      201567       0.0       0.0  -0.017992 -0.004943
9      201568      14.0       0.0  13.942332 -0.037469
10     201569       0.0       0.0  -0.013873 -0.111192
11     201572       0.0       3.0  -0.000553  2.993359
12     201587       0.0       0.0   0.040065 -0.035361
13     201599       2.0       0.0   1.913497  0.091457
14     201935       8.0       1.0   8.027447  1.046381
15     201950       3.0       0.0   3.092822  0.035420
16     201976       7.0       3.0   6.918115  2.942003
17     201

# TIME SERIES

In [280]:
#!pip install tensorflow


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m
