In [22]:
import team_individual_stat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import leaguegamelog

In [2]:
#!pip install nba_api
#!pip install seaborn

## RETRIEVE DATA FOR ML PART

In [55]:
from nba_api.stats.endpoints import leaguegamelog

# Fetch all PLAYER logs (not team logs) for the 2023-24 Playoffs.
# NOTE(review): `logs`, `players` and `player_ids` are notebook-level names;
# later cells read `player_ids`.
logs = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Playoffs',
    player_or_team_abbreviation='P'   # 'P' for players, 'T' for teams
)

# The endpoint returns a list of DataFrames; the first one is used here.
players = logs.get_data_frames()[0]

# `players.columns` should include PLAYER_ID and PLAYER_NAME in player mode
print(players.columns.tolist())

# Extract unique player IDs (order of first appearance in the log)
player_ids = players['PLAYER_ID'].unique().tolist()
print(f"{len(player_ids)} players appeared in the 2023-24 Playoffs")
print("Sample player IDs:", player_ids[:10])


['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE']
214 players appeared in the 2023-24 Playoffs
Sample player IDs: [204060, 1626157, 203937, 203933, 203078, 202738, 201569, 201152, 201142, 1631094]


In [56]:
# Display the complete list of playoff player IDs (long output; the cell above
# already prints a 10-element sample).
player_ids

[204060,
 1626157,
 203937,
 203933,
 203078,
 202738,
 201569,
 201152,
 201142,
 1631094,
 1630591,
 1630532,
 1630241,
 1629622,
 1628386,
 1628378,
 1628371,
 203914,
 1627747,
 1627777,
 1628365,
 1628976,
 1629021,
 1629636,
 1630171,
 1630175,
 1630596,
 201599,
 1627750,
 1627752,
 1629216,
 1631128,
 1631212,
 201144,
 203497,
 203994,
 1626220,
 1629638,
 1629675,
 1630162,
 1626164,
 1628420,
 1630540,
 1630194,
 1630178,
 1629011,
 1628973,
 1628404,
 1627741,
 202711,
 202699,
 201587,
 1630559,
 1629637,
 1629060,
 1629008,
 1626156,
 203999,
 203932,
 203915,
 203484,
 203200,
 1628960,
 1629006,
 1629162,
 1629234,
 1629626,
 1629642,
 1630183,
 1630568,
 1631111,
 200768,
 203954,
 1626162,
 1626166,
 1628384,
 1628392,
 1628978,
 2544,
 202704,
 203076,
 1629652,
 1630198,
 1630529,
 1630700,
 1631119,
 1629026,
 1628983,
 1627742,
 202330,
 1629614,
 1626167,
 1641767,
 201143,
 201567,
 1626153,
 1628369,
 1628389,
 1628401,
 1629130,
 1629312,
 1630202,
 1630573,
 

In [63]:
import time
import pandas as pd
from nba_api.stats.endpoints import playergamelog

def fetch_last_n_regular_season_games(player_ids, season, n=20, pause=1.0):
    """
    Fetch the LAST n completed Regular Season games (PTS, AST) for each player_id
    in player_ids, and return one combined DataFrame with columns:
        ['PLAYER_ID','GAME_DATE','PTS','AST']

    Rows are grouped per player in the order of `player_ids`, and within each
    player they are sorted chronologically (oldest first). A player with fewer
    than n qualifying games contributes however many rows are available.

    Parameters:
        player_ids (list of int): List of NBA player IDs.
        season (str): NBA season in 'YYYY-YY' format, e.g. '2023-24'.
        n (int): Number of most recent games to retrieve per player.
        pause (float): Seconds to sleep before each API request (rate limiting).
            Defaults to 1.0, the value that used to be hard-coded.

    Returns:
        pandas.DataFrame: The concatenated per-player frames (fresh 0..N-1
        index), or an empty frame with the four columns above when nothing
        could be fetched.

    Notes:
        Any exception raised while fetching or processing one player is caught,
        reported with a printed warning, and that player is skipped; the
        function itself does not raise for per-player failures.
    """
    wanted = ['PLAYER_ID', 'GAME_DATE', 'PTS', 'AST']
    records = []

    for pid in player_ids:
        try:
            # Rate-limit pause before hitting the endpoint
            time.sleep(pause)

            # Fetch all Regular-Season logs for this player
            logs = playergamelog.PlayerGameLog(
                player_id=pid,
                season=season,
                season_type_all_star='Regular Season'
            )

            # Normalize column names to uppercase FIRST, so that every column
            # lookup below (including 'MIN') is independent of the API's casing.
            # rename() returns a new frame, leaving the endpoint's frame alone.
            raw = logs.get_data_frames()[0].rename(columns=str.upper)

            # Keep only rows with a non-null MIN, then build the remaining
            # columns with assign(), which works on a copy and therefore avoids
            # assigning into a filtered slice (SettingWithCopyWarning).
            recent = (
                raw[raw['MIN'].notna()]
                .assign(
                    GAME_DATE=lambda d: pd.to_datetime(d['GAME_DATE']),
                    PLAYER_ID=pid,
                )
                # most recent first, so head(n) keeps the last n games ...
                .sort_values('GAME_DATE', ascending=False)
                .head(n)
                # ... then back to chronological order for windowing
                .sort_values('GAME_DATE', ascending=True)
            )

            # Keep only the columns we care about
            records.append(recent[wanted])

        except Exception as e:
            # Deliberate best-effort: report and move on to the next player.
            print(f"Warning: could not fetch last {n} regular games for player {pid}: {e}")
            continue

    # Concatenate all players' DataFrames, or return empty template if none
    if records:
        return pd.concat(records, ignore_index=True)
    return pd.DataFrame(columns=wanted)


In [64]:
# 1) Re-fetch the player-level 2023-24 Playoffs log and collect the unique
#    player IDs (this rebinds the notebook-level `player_ids`).
plogs = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Playoffs',
    player_or_team_abbreviation='P'
).get_data_frames()[0]
plogs.columns = [c.upper() for c in plogs.columns]
player_ids = plogs['PLAYER_ID'].unique().tolist()

# 2) Fetch last 20 regular games for each playoff player.
#    One request per player with a pause before each, so this cell is slow.
reg20_df = fetch_last_n_regular_season_games(player_ids, '2023-24', n=20)

# Quick sanity check: overall size and the first few rows
print(reg20_df.shape)  
print(reg20_df.head(10))

(4185, 4)
   PLAYER_ID  GAME_DATE  PTS  AST
0     204060 2024-03-06    0    0
1     204060 2024-03-08    1    1
2     204060 2024-03-10   11    2
3     204060 2024-03-13    6    3
4     204060 2024-03-15    5    5
5     204060 2024-03-17    5    2
6     204060 2024-03-19    3    4
7     204060 2024-03-21    0    2
8     204060 2024-03-23    0    3
9     204060 2024-03-27    2    4


In [67]:
# Preview the first 50 rows to check the per-player ordering of the game logs
print(reg20_df.head(50))

    PLAYER_ID  GAME_DATE  PTS  AST
0      204060 2024-03-06    0    0
1      204060 2024-03-08    1    1
2      204060 2024-03-10   11    2
3      204060 2024-03-13    6    3
4      204060 2024-03-15    5    5
5      204060 2024-03-17    5    2
6      204060 2024-03-19    3    4
7      204060 2024-03-21    0    2
8      204060 2024-03-23    0    3
9      204060 2024-03-27    2    4
10     204060 2024-03-29    5    4
11     204060 2024-03-30   11    2
12     204060 2024-04-01    0    2
13     204060 2024-04-03    4    2
14     204060 2024-04-05    3    2
15     204060 2024-04-07    9    5
16     204060 2024-04-09    7    2
17     204060 2024-04-10    3    3
18     204060 2024-04-12    2    6
19     204060 2024-04-14    3    2
20    1626157 2024-01-24   27    3
21    1626157 2024-01-25   27    3
22    1626157 2024-01-27   19    4
23    1626157 2024-01-29   21    6
24    1626157 2024-01-31   29    4
25    1626157 2024-02-02   19    1
26    1626157 2024-02-04   14    0
27    1626157 2024-0

In [71]:
# (rows, columns) of the combined regular-season frame
reg20_df.shape

(4185, 4)

X shape: (202, 20, 2)
y shape: (202, 2)


In [80]:
def fetch_first_playoff_games(player_ids, season, pause=1.0):
    """
    For each player_id, fetch their first Playoff game of the given season
    that has a non-null MIN value (i.e. a game they logged minutes in).

    Parameters:
        player_ids (list of int): NBA player IDs to look up.
        season (str): NBA season in 'YYYY-YY' format, e.g. '2023-24'.
        pause (float): Seconds to sleep before each API request (rate limiting).
            Defaults to 1.0, the value that used to be hard-coded.

    Returns:
        pandas.DataFrame: One row per player that had at least one qualifying
        game, with columns ['PLAYER_ID','GAME_DATE','PTS','AST']. Players with
        no qualifying playoff game are omitted. The frame is empty (with the
        same columns) when no player qualifies.

    Notes:
        Any exception raised while fetching or processing one player is caught,
        reported with a printed warning, and that player is skipped.
    """
    columns = ['PLAYER_ID', 'GAME_DATE', 'PTS', 'AST']
    records = []

    for pid in player_ids:
        try:
            # Rate-limit pause before hitting the endpoint
            time.sleep(pause)

            # Fetch *all* playoff logs for this player
            logs = playergamelog.PlayerGameLog(
                player_id=pid,
                season=season,
                season_type_all_star='Playoffs'
            )

            # Normalize column names to uppercase FIRST so the 'MIN' and
            # 'GAME_DATE' lookups below do not depend on the API's casing.
            raw = logs.get_data_frames()[0].rename(columns=str.upper)

            # Keep games with minutes and parse the dates. assign() works on a
            # copy, so no column is written into a filtered slice.
            # `infer_datetime_format` is intentionally not passed: it is
            # deprecated in pandas >= 2.0, where format inference is already
            # the default behaviour. Unparseable dates become NaT.
            played = raw[raw['MIN'].notna()].assign(
                GAME_DATE=lambda d: pd.to_datetime(d['GAME_DATE'], errors='coerce')
            )

            # Drop rows whose date failed to parse, then order oldest first
            played = played[played['GAME_DATE'].notna()].sort_values('GAME_DATE')
            if played.empty:
                continue

            first = played.iloc[0]
            records.append({
                'PLAYER_ID': pid,
                'GAME_DATE': first['GAME_DATE'],
                'PTS': first['PTS'],
                'AST': first['AST']
            })

        except Exception as e:
            # Deliberate best-effort: report and move on to the next player.
            print(f"Warning: could not fetch first playoff game for player {pid}: {e}")
            continue

    return pd.DataFrame(records, columns=columns)

In [75]:
# First 2023-24 playoff game per player (one API request per player, so slow)
first_playoffs = fetch_first_playoff_games(player_ids, '2023-24')


In [77]:
# Preview the first 10 playoff targets
first_playoffs.head(10)

Unnamed: 0,PLAYER_ID,GAME_DATE,PTS,AST
0,204060,2024-05-03,0,2
1,1626157,2024-05-04,20,3
2,203937,2024-05-04,2,1
3,203933,2024-05-10,0,1
4,203078,2024-04-20,15,6
5,202738,2024-04-20,0,0
6,201569,2024-04-20,0,1
7,201152,2024-04-20,0,0
8,201142,2024-04-20,31,1
9,1631094,2024-05-03,27,4


In [81]:
def build_Xy_from_groups(reg20_df, first_playoffs, seq_len=20):
    """
    Pair each player's regular-season sequence with their first-playoff target.

    reg20_df: DataFrame of regular-season rows per player,
              columns ['PLAYER_ID','GAME_DATE','PTS','AST']
    first_playoffs: DataFrame with (at most) one row per player,
              columns ['PLAYER_ID','GAME_DATE','PTS','AST'];
              if a player appears more than once, the first row is used.
    seq_len: required number of regular-season rows per player (default 20).

    A player is included only when BOTH conditions hold:
      * they have exactly `seq_len` rows in reg20_df, and
      * they have a row in first_playoffs.
    The sequence and the target are appended together, so row i of X always
    belongs to the same player as row i of y.

    Returns:
      X: np.array of shape (n_players, seq_len, 2)  -- [PTS, AST] per game,
         games in ascending GAME_DATE order
      y: np.array of shape (n_players, 2)           -- [PTS, AST] of the target
      Players are ordered by ascending PLAYER_ID. When no player qualifies,
      empty float arrays of shape (0, seq_len, 2) and (0, 2) are returned.
    """
    features = ['PTS', 'AST']

    # Build the target lookup once instead of scanning first_playoffs for
    # every player; drop_duplicates keeps the first row per PLAYER_ID.
    targets = (
        first_playoffs
        .drop_duplicates('PLAYER_ID')
        .set_index('PLAYER_ID')[features]
    )

    # Ensure data is sorted by date per player
    ordered = reg20_df.sort_values(['PLAYER_ID', 'GAME_DATE'])

    X_list, y_list = [], []
    for pid, group in ordered.groupby('PLAYER_ID'):
        # Skip players without a full window or without a playoff target.
        # Both checks come BEFORE either append so X and y stay aligned.
        if len(group) != seq_len or pid not in targets.index:
            continue

        X_list.append(group[features].values)     # (seq_len, 2)
        y_list.append(targets.loc[pid].values)    # (2,)

    # np.stack / np.vstack raise on an empty list, so handle that explicitly
    if not X_list:
        return np.empty((0, seq_len, len(features))), np.empty((0, len(features)))

    X = np.stack(X_list, axis=0)   # (n_players, seq_len, 2)
    y = np.vstack(y_list)          # (n_players, 2)
    return X, y

# ----------------------
# Example usage:
#
# The two fetches below are left commented out because they issue one API
# request per player; `reg20_df` and `first_playoffs` are expected to already
# exist in the notebook from the earlier cells.

# first_playoffs = fetch_first_playoff_games(player_ids, '2023-24')
# reg20_df   = fetch_last_n_regular_season_games(player_ids, '2023-24', n=20)

# Binds the notebook-level `X` (input sequences) and `y` (targets)
X, y = build_Xy_from_groups(reg20_df, first_playoffs)
print("X shape:", X.shape)  # (n_players,20,2)
print("y shape:", y.shape)  # (n_players,2)

X shape: (202, 20, 2)
y shape: (202, 2)


first last playoff {'PTS': 27, 'AST': 8}


# TIME SERIES

In [280]:
#!pip install tensorflow


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m
